diff --git a/docs/plans/2026-06-04-075-feat-documents-scrape-json-summary-plan.md b/docs/plans/2026-06-04-075-feat-documents-scrape-json-summary-plan.md new file mode 100644 index 00000000..034b802d --- /dev/null +++ b/docs/plans/2026-06-04-075-feat-documents-scrape-json-summary-plan.md @@ -0,0 +1,53 @@ +--- +title: "feat: Auto JSON summary on documents scrape" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 074 deferred auto-enable JSON summary on bare scrape entrypoints; cron uses run-documents-scrape.sh +--- + +# feat: Auto JSON summary on documents scrape + +## Summary + +When `run-documents-scrape.sh` performs a live Discord scrape, auto-enable `DCE_RUN_SUMMARY_JSON=1` and write `logs/documents-scrape-<UTC>.summary.json` unless the operator has already set `DCE_RUN_SUMMARY_FILE` or passes `--summary-file`. + +## Problem Frame + +Validation and proof auto-export JSON summaries (plans 070–073). The primary incremental path — `run-documents-scrape.sh` and monthly cron — still requires manual env vars for machine-readable totals. Host runner recovery (plan 072) can populate the file from compose logs when env is set. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | Live scrape path exports `DCE_RUN_SUMMARY_JSON=1` when not dry-run/salvage-only | +| R2 | Default `DCE_RUN_SUMMARY_FILE` to `logs/documents-scrape-<UTC>.summary.json` when unset | +| R3 | Optional `--summary-file PATH` overrides default destination | +| R4 | Prints `JSON summary file:` before preflight/scrape | +| R5 | Dry-run and salvage-only do not enable JSON export | +| R6 | `documents-scrape-smoke.sh` asserts summary path on live scrape and absence on dry-run | +| R7 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 23/23 | + +## Implementation Units + +### U1. run-documents-scrape.sh + +**Files:** `scripts/run-documents-scrape.sh`, `scripts/tests/documents-scrape-smoke.sh` + +### U2. 
Docs + +**Files:** `docs/recurring-scrape-merge-readiness.md`, `scrape.env.example` + +## Verification + +```bash +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Per-target separate summary files in multi-target proof/validation loops +- Tee full documents-scrape stdout to a log file diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index a2217881..1bdd3e91 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -178,6 +178,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 074 (2026-06-04):** `print-scrape-summary.sh` pretty-prints `*.summary.json` (`--json`, `--oom-only`, stdin `-`). +**Plan 075 (2026-06-04):** `run-documents-scrape.sh` auto-writes `logs/documents-scrape-<UTC>.summary.json` on live scrapes. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. ## CI note (fork PRs) diff --git a/scrape.env.example b/scrape.env.example index 15283cb0..a7128253 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -27,6 +27,8 @@ DCE_USERNS_MODE= # DCE_CONTAINER_MEMORY=8g # Optional: machine-readable scrape summary (run-discord-scrape.sh). +# run-documents-scrape.sh, run-operator-validation.sh, and run-operator-proof.sh +# auto-enable summary export on live scrapes unless these are already set. # Host paths under logs/ map to /logs/ in the container (see docker-compose.yml). 
# DCE_RUN_SUMMARY_JSON=1 # DCE_RUN_SUMMARY_FILE=logs/scrape-summary.json diff --git a/scripts/run-documents-scrape.sh b/scripts/run-documents-scrape.sh index 81f67ed1..72236213 100755 --- a/scripts/run-documents-scrape.sh +++ b/scripts/run-documents-scrape.sh @@ -12,6 +12,7 @@ VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh" VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh" SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh" LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh" +LOG_DIR="${DCE_LOG_DIR:-$REPO_ROOT/logs}" # shellcheck source=lib/scrape-lock.sh source "$SCRIPT_DIR/lib/scrape-lock.sh" # shellcheck source=lib/scrape-run-plan.sh @@ -35,6 +36,7 @@ Options: --target NAME Limit preflight/scrape to one configured target --channel ID With exactly one --target, limit scrape to channel ID (repeatable) --config PATH Scrape target config (default: config/scrape-targets.json) + --summary-file PATH Machine-readable scrape summary JSON (default: logs/documents-scrape-UTC.summary.json) EOF } @@ -71,6 +73,7 @@ main() { local salvage_only=0 local salvage_before=0 local target="" + local summary_file="" local -a passthrough=() while (($#)); do @@ -104,6 +107,11 @@ main() { passthrough+=(--config "$2") shift 2 ;; + --summary-file) + [[ $# -ge 2 ]] || die "Missing value for --summary-file." 
+ summary_file=$2 + shift 2 + ;; --help|-h) usage exit 0 @@ -174,6 +182,17 @@ main() { "$SETUP_AUTH" 2>/dev/null || true fi + export DCE_RUN_SUMMARY_JSON=1 + if [[ -z "${DCE_RUN_SUMMARY_FILE:-}" ]]; then + if [[ -n "$summary_file" ]]; then + export DCE_RUN_SUMMARY_FILE="$summary_file" + else + mkdir -p "$LOG_DIR" + export DCE_RUN_SUMMARY_FILE="$LOG_DIR/documents-scrape-$(date -u +%Y%m%dT%H%M%SZ).summary.json" + fi + fi + printf 'JSON summary file: %s\n' "$DCE_RUN_SUMMARY_FILE" + "$HOST_RUNNER" preflight "${container_args[@]}" "$HOST_RUNNER" scrape "${container_args[@]}" } diff --git a/scripts/tests/documents-scrape-smoke.sh b/scripts/tests/documents-scrape-smoke.sh index 2b5103c9..3b2626fd 100755 --- a/scripts/tests/documents-scrape-smoke.sh +++ b/scripts/tests/documents-scrape-smoke.sh @@ -83,6 +83,10 @@ grep -q 'Documents scrape run plan' "$DOC_OUT" || { echo "expected Documents scrape run plan in dry-run output" >&2 exit 1 } +grep -q 'JSON summary file:' "$DOC_OUT" && { + echo "dry-run should not enable JSON summary export" >&2 + exit 1 +} CHANNEL_DRY="$TMP_DIR/channel-dry-run.log" "$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$CHANNEL_DRY" 2>&1 @@ -90,6 +94,10 @@ grep -q 'Documents scrape run plan' "$CHANNEL_DRY" || { echo "expected dry-run to accept --channel passthrough" >&2 exit 1 } +grep -q 'JSON summary file:' "$CHANNEL_DRY" && { + echo "dry-run with --channel should not enable JSON summary export" >&2 + exit 1 +} ARGS_LOG="$TMP_DIR/compose-args.log" cat >"$FAKE_DOCKER" <<'EOF' @@ -100,13 +108,20 @@ EOF chmod +x "$FAKE_DOCKER" printf 'DISCORD_TOKEN=dummy-token\n' >"$TMP_DIR/scrape.env" +LIVE_DOC_OUT="$TMP_DIR/documents-live.log" DCE_MIN_FREE_MB=0 \ DCE_SKIP_SCRAPE_LOCK=1 \ DCE_DOCKER_BIN="$FAKE_DOCKER" \ FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \ DCE_ENV_FILE="$TMP_DIR/scrape.env" \ - "$REPO_ROOT/scripts/run-documents-scrape.sh" --config "$TMP_DIR/config.json" --target demo 
--channel 111111111111111111 >/dev/null + DCE_LOG_DIR="$TMP_DIR/logs" \ + "$REPO_ROOT/scripts/run-documents-scrape.sh" --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$LIVE_DOC_OUT" 2>&1 +grep -q 'JSON summary file:' "$LIVE_DOC_OUT" || { + echo "expected live documents scrape to enable JSON summary export" >&2 + cat "$LIVE_DOC_OUT" >&2 + exit 1 +} grep -q '111111111111111111' "$ARGS_LOG" || { echo "expected --channel to reach container compose invocation" >&2 cat "$ARGS_LOG" >&2 @@ -128,6 +143,10 @@ grep -q 'salvage completed' "$SALVAGE_DOC_LOG" || { cat "$SALVAGE_DOC_LOG" >&2 exit 1 } +grep -q 'JSON summary file:' "$SALVAGE_DOC_LOG" && { + echo "salvage-only should not enable JSON summary export" >&2 + exit 1 +} SALVAGE_BEFORE_LOG="$TMP_DIR/salvage-before.log" : >"$ARGS_LOG" @@ -152,6 +171,11 @@ grep -q 'compose' "$ARGS_LOG" || { cat "$ARGS_LOG" >&2 exit 1 } +grep -q 'JSON summary file:' "$SALVAGE_BEFORE_LOG" || { + echo "expected --salvage-before-scrape live path to enable JSON summary export" >&2 + cat "$SALVAGE_BEFORE_LOG" >&2 + exit 1 +} command -v flock >/dev/null 2>&1 && { LOCK_FILE="$TMP_DIR/.dce-scrape.lock"