From 88e864c72a8a3ec59680fa6912edd311a8156a23 Mon Sep 17 00:00:00 2001 From: Boden Date: Thu, 28 May 2026 02:23:34 -0500 Subject: [PATCH] feat(scrape): add Documents scrape workflow and token discovery Auto-discover token files, provide run-documents-scrape and prove-incremental-append helpers, and document the end-to-end operator flow. Co-authored-by: Cursor --- .docs/Recurring-Scrape-Setup.md | 15 ++ .gitignore | 1 + ...28-008-live-documents-scrape-proof-plan.md | 28 ++++ scrape.env.example | 1 + scripts/prove-incremental-append.sh | 157 ++++++++++++++++++ scripts/run-discord-scrape-host.sh | 26 ++- scripts/run-documents-scrape.sh | 83 +++++++++ scripts/tests/documents-scrape-smoke.sh | 78 +++++++++ 8 files changed, 387 insertions(+), 2 deletions(-) create mode 100644 docs/plans/2026-05-28-008-live-documents-scrape-proof-plan.md create mode 100755 scripts/prove-incremental-append.sh create mode 100755 scripts/run-documents-scrape.sh create mode 100755 scripts/tests/documents-scrape-smoke.sh diff --git a/.docs/Recurring-Scrape-Setup.md b/.docs/Recurring-Scrape-Setup.md index f9bced1a..0b12170c 100644 --- a/.docs/Recurring-Scrape-Setup.md +++ b/.docs/Recurring-Scrape-Setup.md @@ -76,6 +76,21 @@ Before the first incremental run, confirm each enabled target points at the corr Each enabled target should show a non-zero **JSON** count and **SEEDED** channel IDs under `/home/brunner56/Documents//`. +**One-command workflow** (verify → preflight → incremental scrape): + +```bash +export DISCORD_TOKEN="your-token" # or place token in ~/.config/discord-scrape/token +./scripts/run-documents-scrape.sh +./scripts/run-documents-scrape.sh --target KotOR_discord_msgs # single server +./scripts/run-documents-scrape.sh --dry-run # archives only, no Discord +``` + +After a scrape, prove archives only grew in place: + +```bash +./scripts/prove-incremental-append.sh --target KotOR_discord_msgs +``` + ### 3. 
Run Preflight Validation Before installing cron, validate your setup: diff --git a/.gitignore b/.gitignore index 612b5582..66b007fa 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,6 @@ TestResults/ # Local automation secrets and logs scrape.env +.discord-token logs/ .compound-engineering/*.local.yaml diff --git a/docs/plans/2026-05-28-008-live-documents-scrape-proof-plan.md b/docs/plans/2026-05-28-008-live-documents-scrape-proof-plan.md new file mode 100644 index 00000000..702abab3 --- /dev/null +++ b/docs/plans/2026-05-28-008-live-documents-scrape-proof-plan.md @@ -0,0 +1,28 @@ +--- +title: "fix: Live Documents scrape proof and token discovery" +type: fix +status: completed +date: 2026-05-28 +origin: LFG repeat — ensure ~/Documents/** append scrape works with proper auth +depends_on: docs/plans/2026-05-28-007-verify-documents-auth-bootstrap-plan.md +--- + +# fix: Live Documents scrape proof and token discovery + +## Summary + +Append-safe scraping is implemented but live Discord auth has never been exercised in this environment. Add automatic token-file discovery, a unified operator entrypoint, and a grow-only proof harness that records message counts before/after a scrape.
+ +## Requirements + +| ID | Requirement | Files | +|----|-------------|-------| +| L1 | Host runner discovers `DISCORD_TOKEN_FILE` from standard paths when unset | `scripts/run-discord-scrape-host.sh`, smoke test | +| L2 | `run-documents-scrape.sh` runs verify → auth check → preflight → scrape | `scripts/run-documents-scrape.sh` | +| L3 | `prove-incremental-append.sh` asserts same paths and non-shrinking message counts | `scripts/prove-incremental-append.sh`, smoke test | + +## Success Criteria + +- `./scripts/run-documents-scrape.sh --dry-run` passes without token +- With valid token, `./scripts/prove-incremental-append.sh --target KotOR_discord_msgs` shows grow-only counts +- Smoke tests pass diff --git a/scrape.env.example b/scrape.env.example index b07686f9..d1bdb9b1 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -1,6 +1,7 @@ # Copy this file to scrape.env and fill in your real values. DISCORD_TOKEN= # Optional: file whose first line contains DISCORD_TOKEN. Useful for token rotation without editing this env file. +# Standard locations also auto-discovered: .discord-token (repo root) and ~/.config/discord-scrape/token DISCORD_TOKEN_FILE= # Optional (manual runs only): command to refresh Discord auth/session before one retry. # Optional absolute path to an executable reauth script under the repository root. diff --git a/scripts/prove-incremental-append.sh b/scripts/prove-incremental-append.sh new file mode 100755 index 00000000..ddc8992c --- /dev/null +++ b/scripts/prove-incremental-append.sh @@ -0,0 +1,157 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P) +REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." 
&& pwd -P)}" +CONFIG_PATH="${DCE_PRIMARY_CONFIG:-$REPO_ROOT/config/scrape-targets.json}" +HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh" +SNAPSHOT_DIR="" + +usage() { + cat <&2 + exit 1 +} + +cleanup() { + [[ -n "$SNAPSHOT_DIR" && -d "$SNAPSHOT_DIR" ]] && rm -rf "$SNAPSHOT_DIR" +} + +require_command() { + command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing." +} + +target_output_dir() { + local target_name=$1 + jq -r --arg name "$target_name" ' + .targets[] + | select(.name == $name) + | .output_dir + ' "$CONFIG_PATH" +} + +snapshot_archives() { + local output_dir=$1 + local snapshot_file=$2 + local file_path file_name channel_id count + + : >"$snapshot_file" + + [[ -d "$output_dir" ]] || die "Missing output_dir: $output_dir" + + while IFS= read -r -d '' file_path; do + file_name=$(basename "$file_path") + if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then + channel_id=${BASH_REMATCH[1]} + count=$(jq -r '(.messages | length) // 0' "$file_path") + printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file" + fi + done < <(find "$output_dir" -type f -name '*.json' ! 
#######################################
# Print the message count recorded for exactly one archive path.
# grep -F is only a cheap pre-filter; the first TSV field is then compared
# for full equality, so a path that merely ends with the wanted path
# (e.g. "out/out/f.json" vs "out/f.json") is never mistaken for it.
# Arguments:
#   $1 - snapshot TSV, one "path<TAB>channel_id<TAB>count" record per line
#   $2 - archive path to look up
# Outputs:
#   The recorded count on stdout when the path is present.
# Returns:
#   0 if the path is present in the snapshot, 1 otherwise.
#######################################
_snapshot_count_for() {
  local snapshot_file=$1
  local wanted_path=$2
  local candidate_path _channel_id candidate_count

  while IFS=$'\t' read -r candidate_path _channel_id candidate_count; do
    if [[ "$candidate_path" == "$wanted_path" ]]; then
      printf '%s\n' "$candidate_count"
      return 0
    fi
    # "--" keeps a path that starts with "-" from being parsed as an option;
    # "|| true" keeps a no-match grep from tripping errexit/pipefail.
  done < <(grep -F -- "${wanted_path}"$'\t' "$snapshot_file" || true)

  return 1
}

#######################################
# Compare two archive snapshots and fail unless archives only grew in place.
# Checks, per archive path:
#   - every path in the "before" snapshot still exists in "after";
#   - its message count did not decrease;
#   - "after" contains no path that was absent from "before".
# Arguments:
#   $1 - snapshot TSV taken before the scrape
#   $2 - snapshot TSV taken after the scrape
# Outputs:
#   One "OK: ..." line per healthy archive on stdout, "FAIL: ..." on stderr.
# Returns:
#   0 when every check passes; otherwise calls die with the failure count.
#######################################
compare_snapshots() {
  local before_file=$1
  local after_file=$2
  local failures=0
  local path _channel_id before_count after_count

  while IFS=$'\t' read -r path _channel_id before_count; do
    [[ -n "$path" ]] || continue
    # Assignment inside the condition so a miss does not trigger errexit.
    if ! after_count=$(_snapshot_count_for "$after_file" "$path"); then
      printf 'FAIL: archive disappeared: %s\n' "$path" >&2
      failures=$((failures + 1))
      continue
    fi
    if (( after_count < before_count )); then
      printf 'FAIL: message count shrank for %s (%s -> %s)\n' "$path" "$before_count" "$after_count" >&2
      failures=$((failures + 1))
      continue
    fi
    if (( after_count > before_count )); then
      printf 'OK: appended %s messages in %s\n' "$((after_count - before_count))" "$path"
    else
      printf 'OK: unchanged %s (%s messages)\n' "$path" "$before_count"
    fi
  done <"$before_file"

  # Reverse direction: anything present afterwards must have existed before.
  while IFS=$'\t' read -r path _channel_id after_count; do
    [[ -n "$path" ]] || continue
    if ! _snapshot_count_for "$before_file" "$path" >/dev/null; then
      printf 'FAIL: unexpected new archive path (not pre-existing): %s\n' "$path" >&2
      failures=$((failures + 1))
    fi
  done <"$after_file"

  if (( failures > 0 )); then
    die "$failures archive integrity check(s) failed."
  fi
}
+ + require_command jq + [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH" + + local output_dir + output_dir=$(target_output_dir "$target") + [[ -n "$output_dir" && "$output_dir" != "null" ]] || die "Unknown target: $target" + + SNAPSHOT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-prove-append.XXXXXX") + local before_file="$SNAPSHOT_DIR/before.tsv" + local after_file="$SNAPSHOT_DIR/after.tsv" + + snapshot_archives "$output_dir" "$before_file" + [[ -s "$before_file" ]] || die "No seeded archives found under $output_dir" + + printf 'Running incremental scrape for target %s...\n' "$target" + "$HOST_RUNNER" scrape --config "$CONFIG_PATH" --target "$target" + + snapshot_archives "$output_dir" "$after_file" + compare_snapshots "$before_file" "$after_file" + printf 'Append-safe proof passed for target %s.\n' "$target" +} + +main "$@" diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 3f4b8849..59ca9212 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -107,6 +107,7 @@ prepare_compose_env() { fi if [[ -z "${DISCORD_TOKEN:-}" ]]; then + discover_token_file || true load_token_from_file || true fi @@ -115,7 +116,7 @@ prepare_compose_env() { return 0 fi - die "Missing env file: $ENV_FILE (copy scrape.env.example to scrape.env) or export DISCORD_TOKEN / DISCORD_TOKEN_FILE in the shell." + die "Missing env file: $ENV_FILE (copy scrape.env.example to scrape.env), export DISCORD_TOKEN / DISCORD_TOKEN_FILE, or place a token at $REPO_ROOT/.discord-token or ~/.config/discord-scrape/token." 
#######################################
# Point DISCORD_TOKEN_FILE at the first token file found in the standard
# locations, unless it already names an existing file.
# Search order:
#   1. $REPO_ROOT/.discord-token
#   2. $HOME/.config/discord-scrape/token
#   3. $HOME/.config/discord-token
# Globals:
#   DISCORD_TOKEN_FILE - read; exported when a candidate is selected
#   REPO_ROOT          - read
#   HOME               - read; home candidates are skipped when unset/empty
# Outputs:
#   A warning on stderr when DISCORD_TOKEN_FILE is set but the file is missing.
# Returns:
#   0 if DISCORD_TOKEN_FILE names an existing file on return, 1 otherwise.
#######################################
discover_token_file() {
  local candidate
  local -a candidates

  if [[ -n "${DISCORD_TOKEN_FILE:-}" ]]; then
    if [[ -f "$DISCORD_TOKEN_FILE" ]]; then
      return 0
    fi
    # An explicit setting that points nowhere is almost always a typo; keep the
    # fallback search but make the substitution visible to the operator.
    printf 'WARNING: DISCORD_TOKEN_FILE=%s does not exist; searching standard token locations.\n' \
      "$DISCORD_TOKEN_FILE" >&2
  fi

  candidates=("$REPO_ROOT/.discord-token")
  # Guard HOME: expanding it unset would abort a caller running under set -u,
  # and an empty HOME would make the search probe /.config/... at the root.
  if [[ -n "${HOME:-}" ]]; then
    candidates+=(
      "$HOME/.config/discord-scrape/token"
      "$HOME/.config/discord-token"
    )
  fi

  for candidate in "${candidates[@]}"; do
    if [[ -f "$candidate" ]]; then
      export DISCORD_TOKEN_FILE="$candidate"
      return 0
    fi
  done

  return 1
}
#######################################
# Operator entrypoint: verify archives, optionally bootstrap auth, then run
# preflight and the incremental scrape through the host runner.
# Globals:
#   CONFIG_PATH   - read; overwritten by --config
#   VERIFY_SCRIPT - read; run with --config before anything else
#   SETUP_AUTH    - read; best-effort run when a token variable is set
#   HOST_RUNNER   - read; run for "preflight" and then "scrape"
#   DISCORD_TOKEN, DISCORD_TOKEN_FILE - read; either one triggers SETUP_AUTH
# Arguments:
#   --dry-run       stop after archive verification
#   --target NAME   forwarded to the host runner (preflight and scrape)
#   --config PATH   scrape target config used by every step
#   --help | -h     print usage and exit 0
# Returns:
#   Exits 0 after --dry-run or --help; otherwise the status of the last step.
#######################################
main() {
  local dry_run=0
  local -a passthrough=()

  while (($#)); do
    case "$1" in
      --dry-run)
        dry_run=1
        shift
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        passthrough+=(--target "$2")
        shift 2
        ;;
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        # Not added to passthrough: every step below already receives
        # --config "$CONFIG_PATH", so forwarding it again would duplicate it.
        CONFIG_PATH=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  "$VERIFY_SCRIPT" --config "$CONFIG_PATH"

  if (( dry_run == 1 )); then
    printf 'Dry run complete: archive paths verified. Export DISCORD_TOKEN or create a token file, then rerun without --dry-run.\n'
    exit 0
  fi

  if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then
    # Bootstrap stays best-effort (its stderr is still silenced), but a failure
    # is reported so a later preflight error can be traced back to it.
    if ! "$SETUP_AUTH" 2>/dev/null; then
      printf 'WARNING: %s failed; continuing with the existing auth configuration.\n' "$SETUP_AUTH" >&2
    fi
  fi

  # ${arr[@]+...} expands to nothing for an empty array instead of raising
  # "unbound variable" under set -u on bash releases older than 4.4.
  "$HOST_RUNNER" preflight --config "$CONFIG_PATH" ${passthrough[@]+"${passthrough[@]}"}
  "$HOST_RUNNER" scrape --config "$CONFIG_PATH" ${passthrough[@]+"${passthrough[@]}"}
}
&& pwd -P) +TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-documents-scrape-smoke.XXXXXX") + +cleanup() { + rm -rf "$TMP_DIR" +} +trap cleanup EXIT + +FAKE_REPO="$TMP_DIR/fake-repo" +mkdir -p "$FAKE_REPO/scripts" +cp "$REPO_ROOT/scripts/run-discord-scrape-host.sh" "$FAKE_REPO/scripts/" +chmod +x "$FAKE_REPO/scripts/run-discord-scrape-host.sh" + +COMPOSE_FILE="$TMP_DIR/docker-compose.yml" +FAKE_DOCKER="$TMP_DIR/docker" +CALL_COUNT="$TMP_DIR/call-count" + +cat >"$COMPOSE_FILE" <<'EOF' +services: + discord-scraper: + image: fake +EOF + +cat >"$FAKE_DOCKER" <<'EOF' +#!/usr/bin/env bash +printf 'run succeeded\n' +EOF +chmod +x "$FAKE_DOCKER" + +printf 'discovered-token\n' >"$FAKE_REPO/.discord-token" +MISSING_ENV="$TMP_DIR/missing-scrape.env" +[[ ! -e "$MISSING_ENV" ]] + +DCE_REPO_ROOT="$FAKE_REPO" \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_ENV_FILE="$MISSING_ENV" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + FAKE_DOCKER_CALL_COUNT="$CALL_COUNT" \ + "$FAKE_REPO/scripts/run-discord-scrape-host.sh" scrape --target demo >/dev/null + +ARCHIVE="$TMP_DIR/server" +mkdir -p "$ARCHIVE" +printf '{"messages":[{"id":"1","timestamp":"2020-01-01T00:00:00"}]}\n' >"$ARCHIVE/Guild - general [111111111111111111].json" + +cat >"$TMP_DIR/config.json" </dev/null + +"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" >/dev/null + +echo "documents-scrape-smoke: ok"