DiscordChatExporter/scripts/run-documents-scrape.sh
Copilot b883943e3a feat(scrape): lock gate and salvage-before on documents scrape
Add archive-root lock preflight and --salvage-before-scrape to
run-documents-scrape.sh so direct operator invocations match validation
safety and KotOR catch-up workflow.
2026-06-03 07:04:56 -05:00

184 lines
5.1 KiB
Bash
Executable file

#!/usr/bin/env bash
# Strict mode: -e exits on an unhandled failure, -u rejects unset variables,
# pipefail makes a pipeline fail if any stage fails, and -E lets an ERR trap
# (if one is set) be inherited by functions and subshells.
set -Eeuo pipefail
# Physical (symlink-resolved, via pwd -P) directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
# Repository root: DCE_REPO_ROOT if set and non-empty, else the parent of SCRIPT_DIR.
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
# Scrape target config used by the verify/lock/salvage steps; DCE_PRIMARY_CONFIG
# overrides the default, and main's --config option overrides both.
CONFIG_PATH="${DCE_PRIMARY_CONFIG:-$REPO_ROOT/config/scrape-targets.json}"
# Config path that main passes to the runner's preflight/scrape calls in place
# of the host-side path; overridable via DCE_CONTAINER_CONFIG.
CONTAINER_CONFIG="${DCE_CONTAINER_CONFIG:-/config/scrape-targets.json}"
# Helper scripts this workflow invokes, all resolved under $REPO_ROOT/scripts.
HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh"
DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh"
VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
# NOTE(review): this library presumably defines print_scrape_config_plan, which
# main calls but this file does not define — confirm.
# shellcheck source=lib/scrape-run-plan.sh
source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
#######################################
# Print the command-line help text.
# Globals:   none (reads $0 for the script name shown in the synopsis)
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Unquoted heredoc delimiter so $(basename "$0") is expanded at call time.
  # The heredoc body stays at column 0 because <<EOF emits leading whitespace.
  cat <<EOF
Usage:
$(basename "$0") [--dry-run] [--salvage-only] [--salvage-before-scrape] [--target NAME] [--channel ID]... [--config PATH]
End-to-end Documents scrape workflow:
1. Verify enabled targets have seeded archives under ~/Documents/<server>/
2. Bootstrap scrape.env when DISCORD_TOKEN is exported
3. Preflight against Discord (skipped with --dry-run or --salvage-only)
4. Incremental scrape (append-only merges into existing JSON files)
Options:
--dry-run Verify archives only; do not call Discord
--salvage-only Merge quiescent stale .dce-temp exports only (no Discord export)
--salvage-before-scrape Run salvage-only pass before preflight and incremental scrape
--target NAME Limit preflight/scrape to one configured target
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
--config PATH Scrape target config (default: config/scrape-targets.json)
--help, -h Show this help and exit
EOF
}
# Report a fatal error and terminate the script.
# Arguments: message words; they are joined into a single line.
# Outputs:   "ERROR: <message>" on stderr
# Exits:     always, with status 1
die() {
  local message="$*"
  printf >&2 'ERROR: %s\n' "$message"
  exit 1
}
# Abort the run when the lock-status helper reports a non-zero status.
# Globals:   DCE_SKIP_SCRAPE_LOCK (read; "1" bypasses the check),
#            LOCK_STATUS, CONFIG_PATH (read)
# Returns:   0 when the check is bypassed, the helper is not executable,
#            or the helper succeeds; otherwise exits through die.
ensure_scrape_lock_available() {
  case "${DCE_SKIP_SCRAPE_LOCK:-0}" in
    1) return 0 ;;
  esac

  # A missing or non-executable helper is treated as "nothing to check".
  if [[ ! -x "$LOCK_STATUS" ]]; then
    return 0
  fi

  if "$LOCK_STATUS" --config "$CONFIG_PATH"; then
    return 0
  fi
  die "Scrape lock is held; another scrape may be running. Inspect: $LOCK_STATUS --config $CONFIG_PATH"
}
# Invoke the host runner's "salvage" subcommand with the current config.
# Any --config flag (and the value after it) in the arguments is dropped,
# because "--config $CONFIG_PATH" is always placed first.
# Globals:   CONFIG_PATH, HOST_RUNNER (read)
# Arguments: options to forward to the runner
# Returns:   the runner's exit status
run_local_salvage() {
  local -a forwarded=(--config "$CONFIG_PATH")

  while (( $# > 0 )); do
    case "$1" in
      --config)
        # Discard the flag, then its value if one follows.
        shift
        if (( $# > 0 )); then
          shift
        fi
        ;;
      *)
        forwarded+=("$1")
        shift
        ;;
    esac
  done

  "$HOST_RUNNER" salvage "${forwarded[@]}"
}
#######################################
# Run the Documents scrape workflow: verify archives and print the plan, then
# (unless --dry-run) check disk readiness and the scrape lock, optionally run
# a salvage pass, and finally run the runner's preflight and scrape steps.
# Globals:
#   CONFIG_PATH (read; overwritten by --config)
#   CONTAINER_CONFIG, VERIFY_SCRIPT, VERIFY_READY, HOST_RUNNER, SETUP_AUTH,
#   DISCOVER_TOKEN (read)
#   DISCORD_TOKEN, DISCORD_TOKEN_FILE (optional environment, read)
# Arguments:
#   The script's command-line options; see usage().
# Exits:
#   0 after --help, --dry-run, or --salvage-only; 1 via die on a bad option
#   or when more than one mode flag is given.
#######################################
main() {
  local dry_run=0
  local salvage_only=0
  local salvage_before=0
  local target=""
  # Options forwarded verbatim to the salvage/preflight/scrape steps.
  local -a passthrough=()

  while (($#)); do
    case "$1" in
      --dry-run)
        dry_run=1
        shift
        ;;
      --salvage-only)
        salvage_only=1
        shift
        ;;
      --salvage-before-scrape)
        salvage_before=1
        shift
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        target=$2
        passthrough+=(--target "$2")
        shift 2
        ;;
      --channel)
        [[ $# -ge 2 ]] || die "Missing value for --channel."
        passthrough+=(--channel "$2")
        shift 2
        ;;
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        passthrough+=(--config "$2")
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  # Each mode flag is 0 or 1, so the sum is the number of modes requested.
  if (( dry_run + salvage_only + salvage_before > 1 )); then
    die "Use only one of --dry-run, --salvage-only, or --salvage-before-scrape."
  fi

  "$VERIFY_SCRIPT" --config "$CONFIG_PATH"

  local -a plan_targets=()
  if [[ -n "$target" ]]; then
    plan_targets=("$target")
  fi
  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A plain
  # "${arr[@]}" aborts with "unbound variable" under `set -u` on bash < 4.4.
  print_scrape_config_plan "$CONFIG_PATH" "Documents scrape" \
    ${plan_targets[@]+"${plan_targets[@]}"}

  if (( dry_run == 1 )); then
    printf 'Dry run complete: archive paths verified. Export DISCORD_TOKEN or create a token file, then rerun without --dry-run.\n'
    exit 0
  fi

  "$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
  ensure_scrape_lock_available

  if (( salvage_only == 1 )); then
    run_local_salvage ${passthrough[@]+"${passthrough[@]}"}
    exit 0
  fi
  if (( salvage_before == 1 )); then
    run_local_salvage ${passthrough[@]+"${passthrough[@]}"}
  fi

  # Build the argument list for preflight/scrape: same options, but with the
  # first --config value swapped for CONTAINER_CONFIG when it names the
  # active config or the default relative path.
  local -a container_args=(${passthrough[@]+"${passthrough[@]}"})
  local has_config=0 idx=0
  while (( idx < ${#container_args[@]} )); do
    if [[ "${container_args[idx]}" == "--config" ]]; then
      has_config=1
      case "${container_args[idx + 1]:-}" in
        "$CONFIG_PATH"|config/scrape-targets.json|./config/scrape-targets.json)
          container_args[idx + 1]="$CONTAINER_CONFIG"
          ;;
      esac
      break
    fi
    idx=$((idx + 1))
  done
  if (( has_config == 0 )); then
    container_args=(--config "$CONTAINER_CONFIG" ${container_args[@]+"${container_args[@]}"})
  fi

  # Auth bootstrap is best-effort: its stderr is discarded and its failure is
  # ignored, and the preflight step below runs regardless.
  if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then
    "$SETUP_AUTH" 2>/dev/null || true
  elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then
    "$SETUP_AUTH" 2>/dev/null || true
  fi

  "$HOST_RUNNER" preflight "${container_args[@]}"
  "$HOST_RUNNER" scrape "${container_args[@]}"
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"