DiscordChatExporter/scripts/run-documents-scrape.sh
Boden 88e864c72a feat(scrape): add Documents scrape workflow and token discovery
Auto-discover token files, provide run-documents-scrape and
prove-incremental-append helpers, and document the end-to-end operator flow.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-28 02:23:34 -05:00

84 lines
2.1 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Documents scrape workflow driver.
#
# Resolves the repository layout and the helper scripts used by this workflow.
# Environment overrides:
#   DCE_REPO_ROOT       repository root (default: parent of this script's directory)
#   DCE_PRIMARY_CONFIG  scrape target config (default: config/scrape-targets.json
#                       under the repository root)
set -o errtrace -o errexit -o nounset -o pipefail

# Physical (symlink-free) directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"

# An empty DCE_REPO_ROOT is treated the same as an unset one.
if [[ -n "${DCE_REPO_ROOT:-}" ]]; then
  REPO_ROOT="${DCE_REPO_ROOT}"
else
  REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd -P)"
fi

# An empty DCE_PRIMARY_CONFIG is treated the same as an unset one.
if [[ -n "${DCE_PRIMARY_CONFIG:-}" ]]; then
  CONFIG_PATH="${DCE_PRIMARY_CONFIG}"
else
  CONFIG_PATH="${REPO_ROOT}/config/scrape-targets.json"
fi

# Helper scripts driven by this workflow.
HOST_RUNNER="${REPO_ROOT}/scripts/run-discord-scrape-host.sh"
VERIFY_SCRIPT="${REPO_ROOT}/scripts/verify-documents-archives.sh"
SETUP_AUTH="${REPO_ROOT}/scripts/setup-scrape-auth.sh"
#######################################
# Print the command-line help text.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout; the first usage line starts with the
#            basename of $0
#######################################
usage() {
  # One printf argument per output line; '%s\n' is reused for each argument.
  printf '%s\n' \
    'Usage:' \
    "$(basename "$0") [--dry-run] [--target NAME] [--config PATH]" \
    'End-to-end Documents scrape workflow:' \
    '1. Verify enabled targets have seeded archives under ~/Documents/<server>/' \
    '2. Bootstrap scrape.env when DISCORD_TOKEN is exported' \
    '3. Preflight against Discord (skipped with --dry-run)' \
    '4. Incremental scrape (append-only merges into existing JSON files)' \
    'Options:' \
    '--dry-run Verify archives only; do not call Discord' \
    '--target NAME Limit preflight/scrape to one configured target' \
    '--config PATH Scrape target config (default: config/scrape-targets.json)'
}
#######################################
# Report a fatal error and terminate the script.
# Arguments: $@ - message words; joined into one string via "$*"
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  local message="$*"
  printf 'ERROR: %s\n' "${message}" 1>&2
  exit 1
}
#######################################
# Run the Documents scrape workflow: verify archives, optionally bootstrap
# auth, then run the host runner's preflight and scrape steps.
# Globals:
#   CONFIG_PATH                        read; overwritten by --config
#   VERIFY_SCRIPT, SETUP_AUTH,
#   HOST_RUNNER                        read; executed as commands
#   DISCORD_TOKEN, DISCORD_TOKEN_FILE  read; either being non-empty triggers
#                                      the SETUP_AUTH call
# Arguments:
#   [--dry-run] [--target NAME] [--config PATH] [--help|-h]
# Outputs:
#   Help text or the dry-run notice on stdout; option errors on stderr via die.
# Returns:
#   Exits 0 after --help or --dry-run and exits 1 (via die) on an unknown
#   option or a missing option value. Otherwise returns the status of the
#   final scrape command.
#######################################
main() {
  local dry_run=0
  # Extra flags forwarded to the host runner. --config is intentionally NOT
  # collected here: it is always passed explicitly from CONFIG_PATH below, so
  # collecting it as well would hand the runner the same flag twice.
  local -a passthrough=()

  while (($#)); do
    case "$1" in
      --dry-run)
        dry_run=1
        shift
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        passthrough+=(--target "$2")
        shift 2
        ;;
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  # Archive verification always runs, including in dry-run mode.
  "$VERIFY_SCRIPT" --config "$CONFIG_PATH"

  if ((dry_run == 1)); then
    printf 'Dry run complete: archive paths verified. Export DISCORD_TOKEN or create a token file, then rerun without --dry-run.\n'
    exit 0
  fi

  # Best-effort auth bootstrap: its stderr and exit status are deliberately
  # discarded, and preflight below runs whether or not it succeeded.
  if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then
    "$SETUP_AUTH" 2>/dev/null || true
  fi

  # ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty. A plain
  # "${arr[@]}" on an empty array is an "unbound variable" error under
  # nounset on bash releases older than 4.4.
  "$HOST_RUNNER" preflight --config "$CONFIG_PATH" \
    ${passthrough[@]+"${passthrough[@]}"}
  "$HOST_RUNNER" scrape --config "$CONFIG_PATH" \
    ${passthrough[@]+"${passthrough[@]}"}
}
# Script entry point: pass every command-line argument to main unchanged.
main "$@"