diff --git a/docs/plans/2026-06-04-069-feat-json-scrape-run-summary-plan.md b/docs/plans/2026-06-04-069-feat-json-scrape-run-summary-plan.md new file mode 100644 index 00000000..1d75124d --- /dev/null +++ b/docs/plans/2026-06-04-069-feat-json-scrape-run-summary-plan.md @@ -0,0 +1,44 @@ +--- +title: "feat: Optional JSON scrape run summary" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 038 deferred structured JSON run logs for operator validation and automation +--- + +# feat: Optional JSON scrape run summary + +## Summary + +When `DCE_RUN_SUMMARY_JSON=1` and/or `DCE_RUN_SUMMARY_FILE` is set, emit a machine-readable scrape summary alongside the existing human log summary in `run-discord-scrape.sh`. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | JSON includes version, finished_at, totals, and per-channel entries matching the text summary | +| R2 | `DCE_RUN_SUMMARY_JSON=1` logs one `DCE_JSON_SUMMARY:` line (compact JSON) | +| R3 | `DCE_RUN_SUMMARY_FILE` writes pretty-printed JSON to that path, creating the parent directory if it does not exist | +| R4 | `scrape.env.example` documents both env vars | +| R5 | `run-discord-scrape-smoke.sh` asserts a valid JSON file with at least one merged or unchanged channel | +| R6 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 21/21 | + +## Implementation Units + +### U1. 
JSON writer in run-discord-scrape.sh + +**Files:** `scripts/run-discord-scrape.sh`, `scripts/tests/run-discord-scrape-smoke.sh`, `scrape.env.example` + +## Verification + +```bash +./scripts/tests/run-discord-scrape-smoke.sh +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Host compose passthrough of summary file path (operators can grep `DCE_JSON_SUMMARY`) diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index 605228da..54b05e3a 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -164,6 +164,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 068 (2026-06-04):** `verify-documents-archives` MEM column and `verify-operator-ready` target memory hints when global cap unset. +**Plan 069 (2026-06-04):** Optional JSON scrape run summary via `DCE_RUN_SUMMARY_JSON` / `DCE_RUN_SUMMARY_FILE`. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. ## CI note (fork PRs) diff --git a/scrape.env.example b/scrape.env.example index 1dbefc2d..0c882261 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -25,3 +25,7 @@ DCE_USERNS_MODE= # Per-target: set container_memory on a target in config/scrape-targets.json (single --target runs). # Global override (wins over config): uncomment below. # DCE_CONTAINER_MEMORY=8g + +# Optional: machine-readable scrape summary (run-discord-scrape.sh). 
# Emit the optional machine-readable scrape summary.
#
# Does nothing unless DCE_RUN_SUMMARY_JSON=1 and/or DCE_RUN_SUMMARY_FILE is set.
#
# Globals:
#   SCRAPE_SUMMARY_ENTRIES (read) - one tab-separated record per channel:
#     target, channel_id, guild_label, file_path, action,
#     before_count, fetched_count, after_count
#   DCE_RUN_SUMMARY_JSON (read)   - "1" logs one compact DCE_JSON_SUMMARY: line
#   DCE_RUN_SUMMARY_FILE (read)   - path that receives the pretty-printed JSON
# Arguments:
#   $1 created, $2 merged, $3 unchanged, $4 skipped, $5 skipped_oom,
#   $6 messages appended (all integers)
# Returns:
#   0 on success or when disabled; 1 if the summary cannot be built or written.
write_scrape_summary_json() {
  local created=$1 merged=$2 unchanged=$3 skipped=$4 skipped_oom=$5 appended=$6
  local summary_json summary_dir

  [[ "${DCE_RUN_SUMMARY_JSON:-0}" == "1" || -n "${DCE_RUN_SUMMARY_FILE:-}" ]] || return 0

  # Build the whole document in a single jq pass over stdin instead of one jq
  # process per channel: no quadratic re-parsing and no argv size limit.
  # Fields are split inside jq because a tab is IFS *whitespace* for `read`,
  # so an empty field (e.g. a blank guild_label) would collapse and shift
  # every later column. The ${arr[@]+...} form keeps an empty array safe
  # under `set -u` on older Bash; the resulting blank line is filtered out.
  summary_json=$(
    printf '%s\n' ${SCRAPE_SUMMARY_ENTRIES[@]+"${SCRAPE_SUMMARY_ENTRIES[@]}"} |
      jq -Rnc \
        --arg finished_at "$(timestamp)" \
        --argjson created "$created" \
        --argjson merged "$merged" \
        --argjson unchanged "$unchanged" \
        --argjson skipped "$skipped" \
        --argjson skipped_oom "$skipped_oom" \
        --argjson messages_appended "$appended" \
        '
        # Missing or non-numeric counts become 0, as shell arithmetic treats them.
        def num: (tonumber? // 0);
        # Missing trailing fields become "", as `read` leaves unset variables.
        def str: (. // "");
        {
          version: 1,
          finished_at: $finished_at,
          totals: {
            created: $created,
            merged: $merged,
            unchanged: $unchanged,
            skipped: $skipped,
            skipped_oom: $skipped_oom,
            messages_appended: $messages_appended
          },
          channels: [
            inputs
            | select(length > 0)
            | split("\t") as $f
            | {
                target: ($f[0] | str),
                channel_id: ($f[1] | str),
                guild_label: ($f[2] | str),
                file_path: ($f[3] | str),
                action: ($f[4] | str),
                before_count: ($f[5] | num),
                fetched_count: ($f[6] | num),
                after_count: ($f[7] | num)
              }
            | .delta = (.after_count - .before_count)
          ]
        }'
  ) || {
    log "ERROR: failed to build JSON scrape summary"
    return 1
  }

  if [[ "${DCE_RUN_SUMMARY_JSON:-0}" == "1" ]]; then
    # jq -c already produced compact output; no second jq pass is needed.
    log "DCE_JSON_SUMMARY: $summary_json"
  fi

  if [[ -n "${DCE_RUN_SUMMARY_FILE:-}" ]]; then
    summary_dir=$(dirname -- "$DCE_RUN_SUMMARY_FILE")
    if ! mkdir -p -- "$summary_dir"; then
      log "ERROR: cannot create JSON summary directory: $summary_dir"
      return 1
    fi
    # Only report success once the file has actually been written.
    if ! jq . <<<"$summary_json" >"$DCE_RUN_SUMMARY_FILE"; then
      log "ERROR: failed to write JSON summary: $DCE_RUN_SUMMARY_FILE"
      return 1
    fi
    log "JSON summary written: $DCE_RUN_SUMMARY_FILE"
  fi
}
echo "expected valid JSON scrape summary file" >&2 + cat "$SUMMARY_JSON" >&2 + exit 1 +} +DCE_RUN_SUMMARY_JSON=1 \ + DCE_RUN_SUMMARY_FILE="$TMP_DIR/scrape-summary-inline.json" \ + DISCORD_TOKEN=dummy \ + DCE_CLI_BIN="$FAKE_CLI" \ + DCE_PRIMARY_CONFIG="$CONFIG_PATH" \ + DCE_FALLBACK_CONFIG="$CONFIG_PATH" \ + FAKE_DCE_FIXTURE_DIR="$FIXTURE_DIR" \ + FAKE_DCE_MODE=append \ + "$REPO_ROOT/scripts/run-discord-scrape.sh" scrape --target idempotent 2>"$TMP_DIR/json-inline.log" +grep -q 'DCE_JSON_SUMMARY:' "$TMP_DIR/json-inline.log" || { + echo "expected DCE_JSON_SUMMARY line when DCE_RUN_SUMMARY_JSON=1" >&2 + exit 1 +} + echo "U1: append-only merge test coverage passed"