diff --git a/docs/plans/2026-06-04-074-feat-print-scrape-summary-cli-plan.md b/docs/plans/2026-06-04-074-feat-print-scrape-summary-cli-plan.md new file mode 100644 index 00000000..32b71cda --- /dev/null +++ b/docs/plans/2026-06-04-074-feat-print-scrape-summary-cli-plan.md @@ -0,0 +1,57 @@ +--- +title: "feat: print-scrape-summary CLI for JSON run artifacts" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 073 deferred print-scrape-summary.sh to pretty-print machine-readable scrape totals +--- + +# feat: print-scrape-summary CLI for JSON run artifacts + +## Summary + +Add `scripts/print-scrape-summary.sh` to render human-readable scrape totals from `*.summary.json` files produced by validation, proof, and host runs. + +## Problem Frame + +Plans 069–073 emit JSON summaries beside operator logs. Operators still need `jq` one-liners to inspect OOM skips, per-channel deltas, and appended message counts. A small read-only CLI closes the loop without opening raw JSON. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | Accept summary file path argument; read stdin when path is `-` | +| R2 | Validate `version == 1` and required `totals` fields with `jq` | +| R3 | Default output: finished_at, totals line, and per-channel table (ACTION, CHANNEL, LABEL, DELTA, FILE) | +| R4 | `--json` prints raw file unchanged | +| R5 | `--oom-only` lists only channels whose action is `SKIPPED_OOM` | +| R6 | Exit non-zero on missing/invalid file | +| R7 | Offline smoke with fixture JSON; `run-all-smokes.sh` → 23/23 | + +## Implementation Units + +### U1. print-scrape-summary.sh + +**Files:** `scripts/print-scrape-summary.sh`, `scripts/tests/print-scrape-summary-smoke.sh` + +**Approach:** jq for parsing/formatting; match action labels from `run-discord-scrape.sh` summary text where practical. + +### U2. 
Docs + +**Files:** `docs/recurring-scrape-merge-readiness.md`, `docs/recurring-scrape-operator-checklist.md` + +**Approach:** Plan 074 stamp; one-line usage beside summary.json examples. + +## Verification + +```bash +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Per-target separate summary files in multi-target proof loops +- Auto-enable JSON summary on bare `host.sh scrape` without env vars diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index 753e59fa..a2217881 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -176,6 +176,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 073 (2026-06-04):** Operator proof auto-writes `*.summary.json` beside proof log with tee-log recovery (parity with validation). +**Plan 074 (2026-06-04):** `print-scrape-summary.sh` pretty-prints `*.summary.json` (`--json`, `--oom-only`, stdin `-`). + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. 
## CI note (fork PRs) diff --git a/docs/recurring-scrape-operator-checklist.md b/docs/recurring-scrape-operator-checklist.md index 92722aad..e398acda 100644 --- a/docs/recurring-scrape-operator-checklist.md +++ b/docs/recurring-scrape-operator-checklist.md @@ -66,6 +66,7 @@ Salvage then incremental scrape: --target KotOR_discord_msgs --channel 221726893064454144 \ --log-file logs/kotor-yes-general.log # Also writes logs/kotor-yes-general.summary.json (machine-readable scrape totals) +# Inspect: ./scripts/print-scrape-summary.sh logs/kotor-yes-general.summary.json ./scripts/prove-incremental-append.sh \ --target KotOR_discord_msgs --channel 221726893064454144 diff --git a/scripts/print-scrape-summary.sh b/scripts/print-scrape-summary.sh new file mode 100755 index 00000000..74cbf61b --- /dev/null +++ b/scripts/print-scrape-summary.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +OOM_ONLY=0 +RAW_JSON=0 + +usage() { + cat <&2 + exit 1 +} + +require_command() { + command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing." 
+}
+
+# Verify that FILE is a version-1 scrape summary before anything is printed.
+# Arguments: $1 - path to a summary JSON file
+# Calls die when `version` is not 1, when `totals` is not an object or lacks
+# one of the counters the totals line prints, or when `channels` is not an
+# array. The type checks come first so `has` is never applied to a non-object.
+validate_summary() {
+  local file=$1
+  jq -e '
+    .version == 1
+    and (.totals | type) == "object"
+    and (.channels | type) == "array"
+    and (.totals
+         | has("created") and has("merged") and has("unchanged")
+           and has("skipped") and has("skipped_oom") and has("messages_appended"))
+  ' "$file" >/dev/null \
+    || die "Invalid or unsupported summary JSON: $file"
+}
+
+# Render one summary file on stdout.
+# Globals:   RAW_JSON (read) - when non-zero, emit the file unchanged
+#            OOM_ONLY (read) - when non-zero, list only SKIPPED_OOM channels
+# Arguments: $1 - path to a summary JSON file (validated first)
+# Outputs:   header with finished_at, a totals line, then either the OOM list
+#            or the per-channel table (ACTION, CHANNEL, LABEL, DELTA, FILE)
+print_summary() {
+  local file=$1
+  local finished_at channel_count
+
+  validate_summary "$file"
+
+  if (( RAW_JSON )); then
+    cat "$file"
+    return 0
+  fi
+
+  finished_at=$(jq -r '.finished_at // "unknown"' "$file")
+  printf 'Scrape summary (finished %s)\n' "$finished_at"
+  jq -r '
+    .totals
+    | "Totals: \(.created) created, \(.merged) merged, \(.unchanged) unchanged, \(.skipped) skipped (\(.skipped_oom) OOM); +\(.messages_appended) messages appended"
+  ' "$file"
+
+  channel_count=$(jq -r '.channels | length' "$file")
+  if (( channel_count == 0 )); then
+    printf '\nNo channel activity recorded.\n'
+    return 0
+  fi
+
+  if (( OOM_ONLY )); then
+    local oom_count
+    oom_count=$(jq -r '[.channels[] | select(.action == "SKIPPED_OOM")] | length' "$file")
+    if (( oom_count == 0 )); then
+      printf '\nNo OOM/aborted channel skips recorded.\n'
+      return 0
+    fi
+    printf '\nOOM/aborted skips:\n'
+    jq -r '
+      .channels[]
+      | select(.action == "SKIPPED_OOM")
+      | " channel \(.channel_id) \(.guild_label) target=\(.target)"
+    ' "$file"
+    return 0
+  fi
+
+  printf '\n%-12s %-20s %-20s %8s %s\n' ACTION CHANNEL LABEL DELTA FILE
+  # @tsv still does the escaping, but the separator is swapped to the unit
+  # separator (0x1f) before the shell reads it: tab is IFS *whitespace*, so
+  # `read` would collapse an empty middle field (e.g. an empty guild_label)
+  # and shift every later column to the left.
+  jq -r '
+    .channels[]
+    | [
+        (if .action == "SKIPPED_OOM" then "SKIPPED(OOM)" else .action end),
+        .channel_id,
+        .guild_label,
+        (if .delta >= 0 then "+" + (.delta | tostring) else (.delta | tostring) end),
+        (.file_path // "")
+      ]
+    | @tsv
+    | split("\t") | join("\u001f")
+  ' "$file" | while IFS=$'\037' read -r action channel_id guild_label delta file_path; do
+    printf '%-12s %-20s %-20s %8s %s\n' "$action" "$channel_id" "$guild_label" "$delta" "$file_path"
+  done
+}
+
+# Parse the command line and print the requested summary.
+# Globals:   RAW_JSON, OOM_ONLY (written by --json / --oom-only)
+# Arguments: [--json] [--oom-only] [--] <summary.json | ->
+#            A path of `-` reads the summary from stdin.
+# Exits non-zero (via die) on unknown options, extra or missing arguments,
+# an unreadable file, or a summary that fails validation.
+main() {
+  local summary_file=""
+
+  while (($#)); do
+    case "$1" in
+      --json)
+        RAW_JSON=1
+        shift
+        ;;
+      --oom-only)
+        OOM_ONLY=1
+        shift
+        ;;
+      --help|-h)
+        usage
+        exit 0
+        ;;
+      -)
+        [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
+        summary_file=-
+        shift
+        ;;
+      --)
+        shift
+        break
+        ;;
+      -*)
+        die "Unknown option: $1"
+        ;;
+      *)
+        [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
+        summary_file=$1
+        shift
+        ;;
+    esac
+  done
+
+  # Anything left over followed `--`: treat it as positional even when it
+  # starts with a dash, instead of silently dropping it.
+  while (($#)); do
+    [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
+    summary_file=$1
+    shift
+  done
+
+  [[ -n "$summary_file" ]] || {
+    usage >&2
+    die "Missing summary file path."
+  }
+
+  require_command jq
+
+  local input_file
+  if [[ "$summary_file" == "-" ]]; then
+    # Keep the X's at the end of the template: BSD/macOS mktemp only
+    # substitutes trailing X's, so a ".json" suffix would yield a fixed name.
+    input_file=$(mktemp "${TMPDIR:-/tmp}/dce-summary-in.XXXXXX") \
+      || die "Could not create a temporary file for stdin input."
+    # input_file is local and gone by the time the EXIT trap fires, so the
+    # path is expanded now; %q keeps it intact whatever characters it holds.
+    trap "rm -f -- $(printf '%q' "$input_file")" EXIT
+    cat >"$input_file"
+    print_summary "$input_file"
+  else
+    [[ -f "$summary_file" && -r "$summary_file" ]] || die "Summary file not found: $summary_file"
+    print_summary "$summary_file"
+  fi
+}
+
+main "$@"
diff --git a/scripts/tests/print-scrape-summary-smoke.sh b/scripts/tests/print-scrape-summary-smoke.sh
new file mode 100755
index 00000000..448d02f5
--- /dev/null
+++ b/scripts/tests/print-scrape-summary-smoke.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+
+set -Eeuo pipefail
+
+REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.."
&& pwd -P)
+PRINT="$REPO_ROOT/scripts/print-scrape-summary.sh"
+TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-print-summary-smoke.XXXXXX")
+trap 'rm -rf "$TMP_DIR"' EXIT
+
+# Print MESSAGE on stderr and abort the smoke run.
+fail() {
+  printf '%s\n' "$1" >&2
+  exit 1
+}
+
+# Abort with MESSAGE ($3) unless TEXT ($1) has a line matching PATTERN ($2).
+expect_contains() {
+  if ! grep -q "$2" <<<"$1"; then
+    fail "$3"
+  fi
+}
+
+# Fixture: one merged, one unchanged and one OOM-skipped channel.
+FIXTURE="$TMP_DIR/fixture.summary.json"
+cat >"$FIXTURE" <<'JSON'
+{
+  "version": 1,
+  "finished_at": "2026-06-04T12:00:00Z",
+  "totals": {
+    "created": 0,
+    "merged": 1,
+    "unchanged": 1,
+    "skipped": 1,
+    "skipped_oom": 1,
+    "messages_appended": 5
+  },
+  "channels": [
+    {
+      "target": "demo",
+      "channel_id": "111111111111111111",
+      "guild_label": "general",
+      "file_path": "/tmp/archive/demo/general.json",
+      "action": "MERGED",
+      "before_count": 10,
+      "fetched_count": 3,
+      "after_count": 13,
+      "delta": 3
+    },
+    {
+      "target": "demo",
+      "channel_id": "222222222222222222",
+      "guild_label": "other",
+      "file_path": "/tmp/archive/demo/other.json",
+      "action": "UNCHANGED",
+      "before_count": 5,
+      "fetched_count": 0,
+      "after_count": 5,
+      "delta": 0
+    },
+    {
+      "target": "demo",
+      "channel_id": "333333333333333333",
+      "guild_label": "big-channel",
+      "file_path": "",
+      "action": "SKIPPED_OOM",
+      "before_count": 0,
+      "fetched_count": 0,
+      "after_count": 0,
+      "delta": 0
+    }
+  ]
+}
+JSON
+
+chmod +x "$PRINT"
+
+# Default view: header, totals line and per-channel rows.
+default_view=$("$PRINT" "$FIXTURE")
+if ! grep -q 'Scrape summary (finished 2026-06-04T12:00:00Z)' <<<"$default_view"; then
+  # Only this first check dumps the captured output to help diagnosis.
+  printf 'ERROR: missing finished_at header\n' >&2
+  printf '%s\n' "$default_view" >&2
+  exit 1
+fi
+expect_contains "$default_view" '+5 messages appended' 'ERROR: missing totals line'
+expect_contains "$default_view" 'MERGED' 'ERROR: missing MERGED channel row'
+expect_contains "$default_view" 'SKIPPED(OOM)' 'ERROR: missing SKIPPED(OOM) channel row'
+
+# --json must hand back the file as-is.
+raw_view=$("$PRINT" --json "$FIXTURE")
+if ! diff -q "$FIXTURE" <(printf '%s\n' "$raw_view") >/dev/null; then
+  fail 'ERROR: --json output differs from source file'
+fi
+
+# --oom-only keeps the skipped channel and drops the others.
+oom_view=$("$PRINT" --oom-only "$FIXTURE")
+expect_contains "$oom_view" '333333333333333333' 'ERROR: --oom-only missing OOM channel'
+if grep -q '111111111111111111' <<<"$oom_view"; then
+  fail 'ERROR: --oom-only should exclude non-OOM channels'
+fi
+
+# "-" reads the summary from stdin.
+piped_view=$("$PRINT" - <"$FIXTURE")
+expect_contains "$piped_view" 'MERGED' 'ERROR: stdin (-) mode failed'
+
+# Failure paths: a missing file and an unsupported schema must both exit non-zero.
+! "$PRINT" "$TMP_DIR/missing.json" >/dev/null 2>&1 \
+  || fail 'ERROR: expected non-zero exit for missing file'
+
+printf '{"version":2}\n' >"$TMP_DIR/bad.summary.json"
+! "$PRINT" "$TMP_DIR/bad.summary.json" >/dev/null 2>&1 \
+  || fail 'ERROR: expected non-zero exit for invalid schema'
+
+printf 'print-scrape-summary-smoke: ok\n'