mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
feat(scrape): add print-scrape-summary CLI for JSON artifacts
Pretty-print version-1 scrape summary files with totals table, --oom-only filter, and stdin support for operator validation/proof outputs.
This commit is contained in:
parent
dbc887d81c
commit
a929be48e8
|
|
@ -0,0 +1,57 @@
|
||||||
|
---
|
||||||
|
title: "feat: print-scrape-summary CLI for JSON run artifacts"
|
||||||
|
type: feat
|
||||||
|
status: complete
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — plan 073 deferred print-scrape-summary.sh to pretty-print machine-readable scrape totals
|
||||||
|
---
|
||||||
|
|
||||||
|
# feat: print-scrape-summary CLI for JSON run artifacts
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Add `scripts/print-scrape-summary.sh` to render human-readable scrape totals from `*.summary.json` files produced by validation, proof, and host runs.
|
||||||
|
|
||||||
|
## Problem Frame
|
||||||
|
|
||||||
|
Plans 069–073 emit JSON summaries beside operator logs. Operators still need `jq` one-liners to inspect OOM skips, per-channel deltas, and appended message counts. A small read-only CLI closes the loop without opening raw JSON.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | Accept summary file path argument; read stdin when path is `-` |
|
||||||
|
| R2 | Validate `version == 1` and required `totals` fields with `jq` |
|
||||||
|
| R3 | Default output: finished_at, totals line, and per-channel table (ACTION, CHANNEL, LABEL, DELTA, FILE) |
|
||||||
|
| R4 | `--json` prints raw file unchanged |
|
||||||
|
| R5 | `--oom-only` lists only channels whose action is `SKIPPED_OOM` |
|
||||||
|
| R6 | Exit non-zero on missing/invalid file |
|
||||||
|
| R7 | Offline smoke with fixture JSON; `run-all-smokes.sh` → 23/23 |
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. print-scrape-summary.sh
|
||||||
|
|
||||||
|
**Files:** `scripts/print-scrape-summary.sh`, `scripts/tests/print-scrape-summary-smoke.sh`
|
||||||
|
|
||||||
|
**Approach:** jq for parsing/formatting; match action labels from `run-discord-scrape.sh` summary text where practical.
|
||||||
|
|
||||||
|
### U2. Docs
|
||||||
|
|
||||||
|
**Files:** `docs/recurring-scrape-merge-readiness.md`, `docs/recurring-scrape-operator-checklist.md`
|
||||||
|
|
||||||
|
**Approach:** Plan 074 stamp; one-line usage beside summary.json examples.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### Deferred
|
||||||
|
|
||||||
|
- Live KotOR catch-up on host
|
||||||
|
- Per-target separate summary files in multi-target proof loops
|
||||||
|
- Auto-enable JSON summary on bare `host.sh scrape` without env vars
|
||||||
|
|
@ -176,6 +176,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
|
||||||
|
|
||||||
**Plan 073 (2026-06-04):** Operator proof auto-writes `*.summary.json` beside proof log with tee-log recovery (parity with validation).
|
**Plan 073 (2026-06-04):** Operator proof auto-writes `*.summary.json` beside proof log with tee-log recovery (parity with validation).
|
||||||
|
|
||||||
|
**Plan 074 (2026-06-04):** `print-scrape-summary.sh` pretty-prints `*.summary.json` (`--json`, `--oom-only`, stdin `-`).
|
||||||
|
|
||||||
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
||||||
|
|
||||||
## CI note (fork PRs)
|
## CI note (fork PRs)
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,7 @@ Salvage then incremental scrape:
|
||||||
--target KotOR_discord_msgs --channel 221726893064454144 \
|
--target KotOR_discord_msgs --channel 221726893064454144 \
|
||||||
--log-file logs/kotor-yes-general.log
|
--log-file logs/kotor-yes-general.log
|
||||||
# Also writes logs/kotor-yes-general.summary.json (machine-readable scrape totals)
|
# Also writes logs/kotor-yes-general.summary.json (machine-readable scrape totals)
|
||||||
|
# Inspect: ./scripts/print-scrape-summary.sh logs/kotor-yes-general.summary.json
|
||||||
|
|
||||||
./scripts/prove-incremental-append.sh \
|
./scripts/prove-incremental-append.sh \
|
||||||
--target KotOR_discord_msgs --channel 221726893064454144
|
--target KotOR_discord_msgs --channel 221726893064454144
|
||||||
|
|
|
||||||
149
scripts/print-scrape-summary.sh
Executable file
149
scripts/print-scrape-summary.sh
Executable file
|
|
@ -0,0 +1,149 @@
|
||||||
|
#!/usr/bin/env bash
#
# print-scrape-summary.sh - pretty-print a version-1 scrape run summary JSON
# file as a totals line plus a per-channel table.
#
# Strict mode: abort on errors (errexit), on unset variables (nounset), on a
# failure in any pipeline stage (pipefail), and let functions inherit ERR
# traps (errtrace).
set -o errtrace -o errexit -o nounset -o pipefail

# Option flags, switched to 1 by main() while parsing the command line.
RAW_JSON=0 # --json: print the file unchanged
OOM_ONLY=0 # --oom-only: list only SKIPPED_OOM channels
|
||||||
|
|
||||||
|
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout (callers redirect it when needed)
#######################################
usage() {
  local prog
  prog=$(basename "$0")

  cat <<EOF
Usage:
${prog} [--json] [--oom-only] FILE|-

Pretty-print a scrape run summary JSON file (version 1) from operator validation,
operator proof, or DCE_RUN_SUMMARY_FILE exports.

--json Print raw JSON unchanged
--oom-only List only SKIPPED_OOM channels
FILE Path to *.summary.json (use - for stdin)
EOF
}
|
||||||
|
|
||||||
|
#######################################
# Report a fatal error and terminate the script.
# Arguments: $* - message words, joined with spaces
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  local message="$*"

  printf 'ERROR: %s\n' "$message" >&2
  exit 1
}
|
||||||
|
|
||||||
|
#######################################
# Abort unless the named command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Returns:   0 when the command exists; otherwise exits through die()
#######################################
require_command() {
  local tool=$1

  if ! command -v "$tool" >/dev/null 2>&1; then
    die "Required command '$tool' is missing."
  fi
}
|
||||||
|
|
||||||
|
#######################################
# Check that a file is a version-1 scrape summary this tool can render.
#
# Beyond the container shapes (.totals object, .channels array), every
# counter interpolated into the "Totals:" line must be a number; otherwise a
# truncated summary would pass validation and be rendered as "null created,
# null merged, ...".
#
# Arguments: $1 - path to the summary JSON file
# Outputs:   jq's own diagnostics on stderr when the file is not valid JSON
# Returns:   0 when valid; otherwise exits through die()
#######################################
validate_summary() {
  local file=$1

  jq -e '
    .version == 1
    and (.channels | type) == "array"
    and (.totals | type) == "object"
    and (
      [
        .totals
        | (.created, .merged, .unchanged, .skipped, .skipped_oom, .messages_appended)
        | type == "number"
      ]
      | all
    )
  ' "$file" >/dev/null \
    || die "Invalid or unsupported summary JSON: $file"
}
|
||||||
|
|
||||||
|
#######################################
# Render a validated scrape summary in the mode selected by the flags.
#
# Globals:   RAW_JSON (read) - when non-zero, print the file unchanged
#            OOM_ONLY (read) - when non-zero, list only SKIPPED_OOM channels
# Arguments: $1 - path to the summary JSON file
# Outputs:   the summary on stdout
# Returns:   0 on success; exits through validate_summary/die on bad input
#######################################
print_summary() {
  local file=$1
  local finished_at channel_count oom_count
  local action channel_id guild_label delta file_path
  # ASCII unit separator. Tab cannot be used to split the table rows: it is
  # IFS *whitespace*, so `read` collapses consecutive tabs and an empty middle
  # field (e.g. an empty guild_label) would shift every later column left.
  local -r field_sep=$'\x1f'

  validate_summary "$file"

  if ((RAW_JSON)); then
    cat -- "$file"
    return 0
  fi

  finished_at=$(jq -r '.finished_at // "unknown"' "$file")
  printf 'Scrape summary (finished %s)\n' "$finished_at"
  jq -r '
    .totals
    | "Totals: \(.created) created, \(.merged) merged, \(.unchanged) unchanged, \(.skipped) skipped (\(.skipped_oom) OOM); +\(.messages_appended) messages appended"
  ' "$file"

  channel_count=$(jq -r '.channels | length' "$file")
  if ((channel_count == 0)); then
    printf '\nNo channel activity recorded.\n'
    return 0
  fi

  if ((OOM_ONLY)); then
    oom_count=$(jq -r '[.channels[] | select(.action == "SKIPPED_OOM")] | length' "$file")
    if ((oom_count == 0)); then
      printf '\nNo OOM/aborted channel skips recorded.\n'
      return 0
    fi
    printf '\nOOM/aborted skips:\n'
    jq -r '
      .channels[]
      | select(.action == "SKIPPED_OOM")
      | "  channel \(.channel_id) \(.guild_label) target=\(.target)"
    ' "$file"
    return 0
  fi

  printf '\n%-12s %-20s %-20s %8s %s\n' ACTION CHANNEL LABEL DELTA FILE
  # One row per channel, fields joined by the unit separator. null becomes an
  # empty field; embedded control characters are turned into spaces so that a
  # value can never break a row or inject an extra field.
  jq -r --arg sep "$field_sep" '
    .channels[]
    | [
        (if .action == "SKIPPED_OOM" then "SKIPPED(OOM)" else .action end),
        .channel_id,
        .guild_label,
        (if .delta >= 0 then "+" + (.delta | tostring) else (.delta | tostring) end),
        (.file_path // "")
      ]
    | map(
        if . == null then ""
        else tostring | gsub("[\n\r\t\u001f]"; " ")
        end
      )
    | join($sep)
  ' "$file" | while IFS=$field_sep read -r action channel_id guild_label delta file_path; do
    printf '%-12s %-20s %-20s %8s %s\n' "$action" "$channel_id" "$guild_label" "$delta" "$file_path"
  done
}
|
||||||
|
|
||||||
|
#######################################
# Parse the command line and print the requested summary.
#
# Globals:   RAW_JSON, OOM_ONLY (written while parsing options)
# Arguments: "$@" - [--json] [--oom-only] [--] FILE|-
# Outputs:   the rendered summary on stdout; usage/errors on stderr
# Returns:   0 on success; exits 1 through die() on any usage or input error
#######################################
main() {
  local summary_file=""
  local input_file cleanup_cmd

  while (($#)); do
    case "$1" in
      --json)
        RAW_JSON=1
        shift
        ;;
      --oom-only)
        OOM_ONLY=1
        shift
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      -)
        [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
        summary_file=-
        shift
        ;;
      --)
        shift
        break
        ;;
      -*)
        die "Unknown option: $1"
        ;;
      *)
        [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
        summary_file=$1
        shift
        ;;
    esac
  done

  # Operands left over after `--` are positional, even when they start with
  # a dash. Without this loop `-- FILE` would be rejected as a missing path.
  while (($#)); do
    [[ -z "$summary_file" ]] || die "Unexpected extra argument: $1"
    summary_file=$1
    shift
  done

  [[ -n "$summary_file" ]] || {
    usage >&2
    die "Missing summary file path."
  }

  require_command jq

  if [[ "$summary_file" == "-" ]]; then
    # Spool stdin to a file because the summary is read by several jq passes.
    # The template ends in X's: BSD mktemp only substitutes trailing X's.
    input_file=$(mktemp "${TMPDIR:-/tmp}/dce-summary-in.XXXXXX") \
      || die "Could not create a temporary file for stdin input."
    # The path is expanded now on purpose (input_file is local to main) and
    # shell-quoted with %q so any character in TMPDIR is handled safely.
    printf -v cleanup_cmd 'rm -f -- %q' "$input_file"
    # shellcheck disable=SC2064
    trap "$cleanup_cmd" EXIT
    cat >"$input_file"
    print_summary "$input_file"
  else
    [[ -f "$summary_file" && -r "$summary_file" ]] || die "Summary file not found: $summary_file"
    print_summary "$summary_file"
  fi
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
115
scripts/tests/print-scrape-summary-smoke.sh
Executable file
115
scripts/tests/print-scrape-summary-smoke.sh
Executable file
|
|
@ -0,0 +1,115 @@
|
||||||
|
#!/usr/bin/env bash
#
# Offline smoke test for scripts/print-scrape-summary.sh.

set -o errtrace -o errexit -o nounset -o pipefail

# Resolve the repository root from this script's location (two levels up).
script_dir=$(dirname "${BASH_SOURCE[0]}")
REPO_ROOT=$(cd "$script_dir/../.." && pwd -P)
PRINT="$REPO_ROOT/scripts/print-scrape-summary.sh"

# Private scratch directory, removed on every exit path.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-print-summary-smoke.XXXXXX")
cleanup() {
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT
|
||||||
|
|
||||||
|
# Version-1 summary fixture: one MERGED, one UNCHANGED and one SKIPPED_OOM
# channel, with totals that match those three rows.
FIXTURE=$TMP_DIR/fixture.summary.json
cat >"$FIXTURE" <<'JSON'
{
  "version": 1,
  "finished_at": "2026-06-04T12:00:00Z",
  "totals": {
    "created": 0,
    "merged": 1,
    "unchanged": 1,
    "skipped": 1,
    "skipped_oom": 1,
    "messages_appended": 5
  },
  "channels": [
    {
      "target": "demo",
      "channel_id": "111111111111111111",
      "guild_label": "general",
      "file_path": "/tmp/archive/demo/general.json",
      "action": "MERGED",
      "before_count": 10,
      "fetched_count": 3,
      "after_count": 13,
      "delta": 3
    },
    {
      "target": "demo",
      "channel_id": "222222222222222222",
      "guild_label": "other",
      "file_path": "/tmp/archive/demo/other.json",
      "action": "UNCHANGED",
      "before_count": 5,
      "fetched_count": 0,
      "after_count": 5,
      "delta": 0
    },
    {
      "target": "demo",
      "channel_id": "333333333333333333",
      "guild_label": "big-channel",
      "file_path": "",
      "action": "SKIPPED_OOM",
      "before_count": 0,
      "fetched_count": 0,
      "after_count": 0,
      "delta": 0
    }
  ]
}
JSON
|
||||||
|
|
||||||
|
# Make sure the script under test is executable before invoking it directly.
chmod +x "$PRINT"

# Default mode: header, totals line and per-channel rows must all be present.
output=$("$PRINT" "$FIXTURE")

if [[ "$output" != *'Scrape summary (finished 2026-06-04T12:00:00Z)'* ]]; then
  printf 'ERROR: missing finished_at header\n' >&2
  printf '%s\n' "$output" >&2
  exit 1
fi

if [[ "$output" != *'+5 messages appended'* ]]; then
  printf 'ERROR: missing totals line\n' >&2
  exit 1
fi

if [[ "$output" != *'MERGED'* ]]; then
  printf 'ERROR: missing MERGED channel row\n' >&2
  exit 1
fi

if [[ "$output" != *'SKIPPED(OOM)'* ]]; then
  printf 'ERROR: missing SKIPPED(OOM) channel row\n' >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# --json must echo the source file unchanged. Command substitution strips the
# trailing newline, so one is added back before comparing.
json_out=$("$PRINT" --json "$FIXTURE")
if ! diff -q "$FIXTURE" <(printf '%s\n' "$json_out") >/dev/null; then
  printf 'ERROR: --json output differs from source file\n' >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# --oom-only must list the SKIPPED_OOM channel and leave out the MERGED one.
oom_out=$("$PRINT" --oom-only "$FIXTURE")

if [[ "$oom_out" != *'333333333333333333'* ]]; then
  printf 'ERROR: --oom-only missing OOM channel\n' >&2
  exit 1
fi

if [[ "$oom_out" == *'111111111111111111'* ]]; then
  printf 'ERROR: --oom-only should exclude non-OOM channels\n' >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# A lone "-" operand makes the script read the summary from standard input.
stdin_out=$("$PRINT" - <"$FIXTURE")
if [[ "$stdin_out" != *'MERGED'* ]]; then
  printf 'ERROR: stdin (-) mode failed\n' >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# A path that does not exist must produce a non-zero exit status.
rc=0
"$PRINT" "$TMP_DIR/missing.json" >/dev/null 2>&1 || rc=$?
if ((rc == 0)); then
  printf 'ERROR: expected non-zero exit for missing file\n' >&2
  exit 1
fi

# So must a summary with an unsupported schema version.
printf '{"version":2}\n' >"$TMP_DIR/bad.summary.json"
rc=0
"$PRINT" "$TMP_DIR/bad.summary.json" >/dev/null 2>&1 || rc=$?
if ((rc == 0)); then
  printf 'ERROR: expected non-zero exit for invalid schema\n' >&2
  exit 1
fi

printf 'print-scrape-summary-smoke: ok\n'
|
||||||
Loading…
Reference in a new issue