From 9e55378c845b59641a22c5f46f65f26d75decb39 Mon Sep 17 00:00:00 2001 From: Boden Date: Thu, 28 May 2026 01:18:05 -0500 Subject: [PATCH] feat(scrape): add Documents verify and auth bootstrap helpers Add scripts to confirm enabled targets map to seeded ~/Documents archives and to create scrape.env from exported DISCORD_TOKEN credentials. Co-authored-by: Cursor --- .docs/Recurring-Scrape-Setup.md | 17 +++ ...07-verify-documents-auth-bootstrap-plan.md | 28 ++++ scripts/setup-scrape-auth.sh | 106 ++++++++++++++ scripts/tests/verify-documents-auth-smoke.sh | 80 +++++++++++ scripts/verify-documents-archives.sh | 129 ++++++++++++++++++ 5 files changed, 360 insertions(+) create mode 100644 docs/plans/2026-05-28-007-verify-documents-auth-bootstrap-plan.md create mode 100755 scripts/setup-scrape-auth.sh create mode 100755 scripts/tests/verify-documents-auth-smoke.sh create mode 100755 scripts/verify-documents-archives.sh diff --git a/.docs/Recurring-Scrape-Setup.md b/.docs/Recurring-Scrape-Setup.md index 51c4b061..f9bced1a 100644 --- a/.docs/Recurring-Scrape-Setup.md +++ b/.docs/Recurring-Scrape-Setup.md @@ -59,6 +59,23 @@ export DISCORD_TOKEN="your-token-here" # optional: export DISCORD_TOKEN_FILE=/path/to/token/file ``` +To materialize `scrape.env` from exported credentials (mode `600`, no manual editing): + +```bash +export DISCORD_TOKEN="your-token-here" +./scripts/setup-scrape-auth.sh +``` + +### 2b. Verify existing ~/Documents archives + +Before the first incremental run, confirm each enabled target points at the correct on-disk server folder and already has seeded channel JSON exports (the scraper appends in place and bootstraps `.dce-meta/channel-map.json` from these files): + +```bash +./scripts/verify-documents-archives.sh --config config/scrape-targets.json +``` + +Each enabled target should show a non-zero **JSON** count and **SEEDED** channel IDs under `/home/brunner56/Documents/<server-folder>/`. + ### 3. 
Run Preflight Validation Before installing cron, validate your setup: diff --git a/docs/plans/2026-05-28-007-verify-documents-auth-bootstrap-plan.md b/docs/plans/2026-05-28-007-verify-documents-auth-bootstrap-plan.md new file mode 100644 index 00000000..2c073602 --- /dev/null +++ b/docs/plans/2026-05-28-007-verify-documents-auth-bootstrap-plan.md @@ -0,0 +1,28 @@ +--- +title: "fix: Verify Documents targets and bootstrap scrape auth" +type: fix +status: completed +date: 2026-05-28 +origin: LFG repeat — confirm ~/Documents/** append paths and unblock auth setup +depends_on: docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md +--- + +# fix: Verify Documents targets and bootstrap scrape auth + +## Summary + +Plan 006 landed append-safe scraping. This pass adds operator tooling so you can (1) verify every enabled target maps to an on-disk `~/Documents/<server-folder>/` tree with seeded channel archives, and (2) create `scrape.env` without manual file editing when a token is already exported. + +## Requirements + +| ID | Requirement | Files | +|----|-------------|-------| +| V1 | `verify-documents-archives.sh` reports per-target output_dir, JSON count, seeded channel IDs, channel-map coverage | `scripts/verify-documents-archives.sh`, smoke test | +| V2 | `setup-scrape-auth.sh` writes `scrape.env` from `DISCORD_TOKEN` or `DISCORD_TOKEN_FILE` (chmod 600), idempotent | `scripts/setup-scrape-auth.sh`, smoke test | +| V3 | Document verify + auth bootstrap in setup guide | `.docs/Recurring-Scrape-Setup.md` | + +## Success Criteria + +- Verify script runs against real `config/scrape-targets.json` and exits 0 when enabled targets have archive dirs +- Auth bootstrap creates scrape.env when token env vars are set +- Smoke tests pass diff --git a/scripts/setup-scrape-auth.sh b/scripts/setup-scrape-auth.sh new file mode 100755 index 00000000..415cf034 --- /dev/null +++ b/scripts/setup-scrape-auth.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +SCRIPT_DIR=$(cd "$(dirname 
#######################################
# Render the scrape env file from the scrape.env.example template, filling in
# Discord credentials taken from the caller's environment.
# Globals:
#   ENV_FILE            (read/written) destination path; --env-file replaces it
#   EXAMPLE_FILE        (read) template that is copied line by line
#   DISCORD_TOKEN       (read, optional) token value to store
#   DISCORD_TOKEN_FILE  (read, optional) path to an existing token file
# Arguments:
#   --env-file PATH  write PATH instead of the default ENV_FILE
#   --force          overwrite an env file that already exists
#   --help, -h       print usage and exit 0
# Outputs:
#   Progress messages on stdout; failures are reported through die().
# Returns:
#   Exits 0 without writing when the env file exists and --force was not
#   given. Exits through die() on an unknown option, a missing template,
#   missing credentials, or a failure to install the rendered file.
#######################################
main() {
  local force=0

  while (($#)); do
    case "$1" in
      --env-file)
        [[ $# -ge 2 ]] || die "Missing value for --env-file."
        ENV_FILE=$2
        shift 2
        ;;
      --force)
        force=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  [[ -f "$EXAMPLE_FILE" ]] || die "Missing example env file: $EXAMPLE_FILE"

  # Idempotence: an existing env file is left untouched unless --force is set.
  if [[ -f "$ENV_FILE" && "$force" -eq 0 ]]; then
    printf 'scrape.env already exists at %s (use --force to overwrite).\n' "$ENV_FILE"
    exit 0
  fi

  if [[ -z "${DISCORD_TOKEN:-}" && -z "${DISCORD_TOKEN_FILE:-}" ]]; then
    die "Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in the environment, then rerun this script."
  fi

  if [[ -n "${DISCORD_TOKEN_FILE:-}" && ! -f "$DISCORD_TOKEN_FILE" ]]; then
    die "DISCORD_TOKEN_FILE does not exist: $DISCORD_TOKEN_FILE"
  fi

  local tmp_file line
  local wrote_token=0 wrote_token_file=0

  # Stage the file next to its destination so the final mv is a rename on the
  # same filesystem rather than a copy out of $TMPDIR.
  tmp_file=$(mktemp "${ENV_FILE}.XXXXXX") \
    || die "Could not create a temporary file next to $ENV_FILE"

  {
    # "|| [[ -n $line ]]" keeps a final template line that lacks a newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
      case "$line" in
        DISCORD_TOKEN=*)
          if [[ -n "${DISCORD_TOKEN:-}" ]]; then
            printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
            wrote_token=1
          else
            printf '%s\n' "$line"
          fi
          ;;
        DISCORD_TOKEN_FILE=*)
          if [[ -n "${DISCORD_TOKEN_FILE:-}" ]]; then
            printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
            wrote_token_file=1
          else
            printf '%s\n' "$line"
          fi
          ;;
        *)
          printf '%s\n' "$line"
          ;;
      esac
    done <"$EXAMPLE_FILE"

    # A template without an active assignment line (for example, one that is
    # commented out) must not cause the credential to be dropped silently.
    if [[ -n "${DISCORD_TOKEN:-}" ]] && ((wrote_token == 0)); then
      printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
    fi
    if [[ -n "${DISCORD_TOKEN_FILE:-}" ]] && ((wrote_token_file == 0)); then
      printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
    fi
  } >"$tmp_file"

  # Restrict permissions before the file appears under its final name, and
  # remove the staged copy if it cannot be installed.
  if ! chmod 600 "$tmp_file" || ! mv -f -- "$tmp_file" "$ENV_FILE"; then
    rm -f -- "$tmp_file"
    die "Could not install $ENV_FILE"
  fi

  printf 'Created %s (mode 600).\n' "$ENV_FILE"
  printf 'Next: ./scripts/verify-documents-archives.sh && ./scripts/run-discord-scrape-host.sh preflight\n'
}
b/scripts/tests/verify-documents-auth-smoke.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P) +TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-verify-auth-smoke.XXXXXX") + +cleanup() { + rm -rf "$TMP_DIR" +} +trap cleanup EXIT + +mkdir -p "$TMP_DIR/good-server" "$TMP_DIR/bad-server" +printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/good-server/Guild - general [111111111111111111].json" + +cat >"$TMP_DIR/config.json" </dev/null && { + echo "expected verify to fail when enabled target dir is missing" >&2 + exit 1 +} + +mkdir -p "$TMP_DIR/missing-server" +printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/missing-server/Guild - general [222222222222222222].json" +"$REPO_ROOT/scripts/verify-documents-archives.sh" --config "$TMP_DIR/config.json" >/dev/null + +ENV_OUT="$TMP_DIR/scrape.env" +DISCORD_TOKEN=smoke-token \ + "$REPO_ROOT/scripts/setup-scrape-auth.sh" --env-file "$ENV_OUT" + +grep -q '^DISCORD_TOKEN=smoke-token$' "$ENV_OUT" || { + echo "expected setup-scrape-auth to write DISCORD_TOKEN" >&2 + exit 1 +} +[[ "$(stat -c '%a' "$ENV_OUT")" == "600" ]] || { + echo "expected scrape.env mode 600" >&2 + exit 1 +} + +DISCORD_TOKEN_FILE="$TMP_DIR/token.txt" +printf 'file-token\n' >"$DISCORD_TOKEN_FILE" +DISCORD_TOKEN_FILE="$DISCORD_TOKEN_FILE" \ + "$REPO_ROOT/scripts/setup-scrape-auth.sh" --env-file "$ENV_OUT" --force +grep -q "^DISCORD_TOKEN_FILE=$DISCORD_TOKEN_FILE\$" "$ENV_OUT" || { + echo "expected setup-scrape-auth to write DISCORD_TOKEN_FILE" >&2 + exit 1 +} + +echo "verify-documents-auth-smoke: ok" diff --git a/scripts/verify-documents-archives.sh b/scripts/verify-documents-archives.sh new file mode 100755 index 00000000..33dbf12c --- /dev/null +++ b/scripts/verify-documents-archives.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P) +REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." 
#######################################
# Abort through die() unless the named command is available on PATH.
# Arguments: $1 - command name to look up
#######################################
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing."
}

#######################################
# Count *.json files under a target directory, ignoring anything inside a
# .dce-meta directory.
# Arguments: $1 - target output directory
# Outputs:   the count on stdout ("0" when the directory does not exist)
#######################################
count_archive_json() {
  local output_dir=$1
  local count=0 entry

  if [[ -d "$output_dir" ]]; then
    # NUL-delimited so unusual file names are counted once each. find's own
    # exit status is deliberately not propagated: an unreadable subdirectory
    # should lower the count, not abort the whole report.
    while IFS= read -r -d '' entry; do
      count=$((count + 1))
    done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
  fi

  printf '%s\n' "$count"
}

#######################################
# Count the distinct channel IDs that appear as a trailing "[digits].json"
# (16 to 22 digits) in archive file names under a target directory.
# Arguments: $1 - target output directory
# Outputs:   the count on stdout ("0" when the directory does not exist)
#######################################
count_seeded_channel_ids() {
  local output_dir=$1
  local file_path file_name
  local id_pattern='\[([0-9]{16,22})\]\.json$'

  if [[ ! -d "$output_dir" ]]; then
    printf '0\n'
    return 0
  fi

  while IFS= read -r -d '' file_path; do
    file_name=${file_path##*/}
    if [[ "$file_name" =~ $id_pattern ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null) \
    | sort -u | wc -l | tr -d ' '
}

#######################################
# Count the top-level keys of a channel-map JSON file.
# Arguments: $1 - path to channel-map.json
# Outputs:   the count on stdout ("0" when the file is missing or empty)
# Returns:   jq's status when the file cannot be parsed
#######################################
count_channel_map_entries() {
  local map_file=$1
  local count

  [[ -f "$map_file" ]] || { printf '0'; return 0; }
  count=$(jq -r 'keys | length' "$map_file") || return
  # jq prints nothing for an empty file; report that as zero entries.
  printf '%s\n' "${count:-0}"
}

# Print one row of the report table; expects exactly six column values.
_print_target_row() {
  printf '%-28s %-40s %8s %8s %8s %s\n' "$@"
}

#######################################
# Report, for every target in the config, whether its output directory exists
# and already holds seeded channel JSON exports.
# Globals:
#   CONFIG_PATH (read/written) config file; --config replaces it
# Arguments:
#   --config PATH  config file to verify
#   --help, -h     print usage and exit 0
# Outputs:
#   A table with one row per target on stdout; a warning on stderr when no
#   enabled target was found.
# Returns:
#   Exits through die() when the config is missing or malformed, or when any
#   enabled target lacks its directory, JSON archives, or seeded channel IDs.
#######################################
main() {
  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  require_command jq
  [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"

  local archive_root failures=0 checked=0
  archive_root=$(jq -r '.archive_root // empty' "$CONFIG_PATH") \
    || die "Could not parse config: $CONFIG_PATH"
  [[ -n "$archive_root" ]] || die "Config is missing archive_root."

  # The target list below is read through a process substitution, whose exit
  # status bash discards. Validate it here so a missing or malformed "targets"
  # cannot turn into an empty loop followed by a success message.
  jq -e '.targets | type == "array"' "$CONFIG_PATH" >/dev/null \
    || die "Config is missing a targets array: $CONFIG_PATH"

  printf 'Archive root: %s\n\n' "$archive_root"
  _print_target_row "TARGET" "OUTPUT_DIR" "JSON" "SEEDED" "MAP" "STATUS"
  _print_target_row "------" "----------" "----" "------" "-----" "------"

  local target_json name output_dir enabled
  local json_count seeded_count map_count status
  while IFS= read -r target_json; do
    name=$(jq -r '.name' <<<"$target_json")
    output_dir=$(jq -r '.output_dir' <<<"$target_json")
    # Targets without an "enabled" key count as enabled.
    enabled=$(jq -r 'if has("enabled") then .enabled else true end' <<<"$target_json")

    if [[ "$enabled" == "false" ]]; then
      _print_target_row "$name" "$output_dir" "-" "-" "-" "disabled"
      continue
    fi

    checked=$((checked + 1))
    json_count=0
    seeded_count=0
    map_count=0
    status="ok"

    if [[ ! -d "$output_dir" ]]; then
      status="missing output_dir"
      failures=$((failures + 1))
    else
      json_count=$(count_archive_json "$output_dir")
      seeded_count=$(count_seeded_channel_ids "$output_dir")
      map_count=$(count_channel_map_entries "$output_dir/.dce-meta/channel-map.json")
      if ((json_count == 0)); then
        status="no json archives"
        failures=$((failures + 1))
      elif ((seeded_count == 0)); then
        status="no seeded channel ids"
        failures=$((failures + 1))
      elif ((map_count == 0)); then
        # An empty or absent map is reported but is not counted as a failure.
        status="ok (map will bootstrap on first run)"
      fi
    fi

    _print_target_row "$name" "$output_dir" "$json_count" "$seeded_count" "$map_count" "$status"
  done < <(jq -c '.targets[]' "$CONFIG_PATH")

  printf '\n'
  if ((failures > 0)); then
    die "$failures enabled target(s) failed archive verification."
  fi

  if ((checked == 0)); then
    # Nothing was checked, so do not claim that anything was verified.
    printf 'No enabled targets found in %s; nothing was verified.\n' "$CONFIG_PATH" >&2
    return 0
  fi

  printf 'All enabled targets have archive directories with seeded channel exports.\n'
}