feat(scrape): optional JSON run summary for automation

Emit DCE_JSON_SUMMARY log line and/or write DCE_RUN_SUMMARY_FILE
with per-channel actions and totals after scrape completes.
This commit is contained in:
Copilot 2026-06-03 10:08:44 -05:00
parent aa85fe50fa
commit 1dda40ae1b
5 changed files with 151 additions and 1 deletions

View file

@ -0,0 +1,44 @@
---
title: "feat: Optional JSON scrape run summary"
type: feat
status: complete
date: 2026-06-04
origin: /lfg — plan 038 deferred structured JSON run logs for operator validation and automation
---
# feat: Optional JSON scrape run summary
## Summary
When `DCE_RUN_SUMMARY_JSON=1` and/or `DCE_RUN_SUMMARY_FILE` is set, emit a machine-readable scrape summary alongside the existing human log summary in `run-discord-scrape.sh`.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | JSON includes version, finished_at, totals, and per-channel entries matching the text summary |
| R2 | `DCE_RUN_SUMMARY_JSON=1` logs one `DCE_JSON_SUMMARY:` line (compact JSON) |
| R3 | `DCE_RUN_SUMMARY_FILE` writes pretty-printed JSON to that path (the parent directory is created if missing) |
| R4 | `scrape.env.example` documents both env vars |
| R5 | `run-discord-scrape-smoke.sh` asserts valid JSON file with merged channel |
| R6 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 21/21 |
## Implementation Units
### U1. JSON writer in run-discord-scrape.sh
**Files:** `scripts/run-discord-scrape.sh`, `scripts/tests/run-discord-scrape-smoke.sh`, `scrape.env.example`
## Verification
```bash
./scripts/tests/run-discord-scrape-smoke.sh
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
```
## Scope Boundaries
### Deferred
- Live KotOR catch-up on host
- Host compose passthrough of summary file path (operators can grep `DCE_JSON_SUMMARY`)

View file

@ -164,6 +164,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
**Plan 068 (2026-06-04):** `verify-documents-archives` MEM column and `verify-operator-ready` target memory hints when global cap unset.
**Plan 069 (2026-06-04):** Optional JSON scrape run summary via `DCE_RUN_SUMMARY_JSON` / `DCE_RUN_SUMMARY_FILE`.
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
## CI note (fork PRs)

View file

@ -25,3 +25,7 @@ DCE_USERNS_MODE=
# Per-target: set container_memory on a target in config/scrape-targets.json (single --target runs).
# Global override (wins over config): uncomment below.
# DCE_CONTAINER_MEMORY=8g
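# Optional: machine-readable scrape summary (run-discord-scrape.sh).
# DCE_RUN_SUMMARY_JSON=1 logs one compact `DCE_JSON_SUMMARY:` line; DCE_RUN_SUMMARY_FILE writes pretty-printed JSON to that path.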
# DCE_RUN_SUMMARY_JSON=1
# DCE_RUN_SUMMARY_FILE=/path/to/scrape-summary.json

View file

@ -211,7 +211,79 @@ print_scrape_summary() {
log "Totals: $created created, $merged merged, $unchanged unchanged, $skipped skipped; +$appended messages appended"
if (( skipped_oom > 0 )); then
log "Hint: for OOM/aborted channels, set DCE_CONTAINER_MEMORY=8g in scrape.env, run --salvage-before-scrape, then retry with --channel."
log "Hint: for OOM/aborted channels, raise container memory (target container_memory in config or DCE_CONTAINER_MEMORY in scrape.env), run --salvage-before-scrape, then retry with --channel."
fi
write_scrape_summary_json "$created" "$merged" "$unchanged" "$skipped" "$skipped_oom" "$appended"
}
#######################################
# Emit the optional machine-readable scrape summary.
# Does nothing unless DCE_RUN_SUMMARY_JSON=1 and/or DCE_RUN_SUMMARY_FILE is set.
# Globals:
#   SCRAPE_SUMMARY_ENTRIES (read) - tab-separated records, in order:
#     target, channel id, guild label, file path, action,
#     before count, fetched count, after count
#   DCE_RUN_SUMMARY_JSON (read)   - "1" logs one compact DCE_JSON_SUMMARY: line
#   DCE_RUN_SUMMARY_FILE (read)   - path that receives pretty-printed JSON
# Arguments:
#   $1 created  $2 merged  $3 unchanged  $4 skipped  $5 skipped_oom
#   $6 appended (reported as totals.messages_appended)
# Outputs:
#   Log lines via log(); optionally writes DCE_RUN_SUMMARY_FILE.
# Returns:
#   0 when disabled or on success; 1 if jq, mkdir, or the file write fails.
#######################################
write_scrape_summary_json() {
  local created=$1 merged=$2 unchanged=$3 skipped=$4 skipped_oom=$5 appended=$6
  local entry record target_name channel_id guild_label file_path action
  local before_count fetched_count after_count delta count_var
  local channels_json='[]' summary_json
  local -r tab=$'\t' field_sep=$'\x1f'

  [[ "${DCE_RUN_SUMMARY_JSON:-0}" == "1" || -n "${DCE_RUN_SUMMARY_FILE:-}" ]] || return 0

  # The ${arr[@]+...} form keeps an empty or unset array from tripping
  # `set -u` on bash releases older than 4.4.
  for entry in ${SCRAPE_SUMMARY_ENTRIES[@]+"${SCRAPE_SUMMARY_ENTRIES[@]}"}; do
    # Tab counts as IFS *whitespace*, so `IFS=$'\t' read` merges consecutive
    # tabs and an empty field (e.g. a blank guild label) would shift every
    # later column. Splitting on a non-whitespace separator keeps empty
    # fields in place.
    record=${entry//$tab/$field_sep}
    IFS="$field_sep" read -r target_name channel_id guild_label file_path action \
      before_count fetched_count after_count <<<"$record"

    # --argjson needs valid JSON numbers. Empty or non-integer counts are
    # emitted as 0 so one malformed entry cannot abort the whole summary;
    # 10# forces base 10 so leading zeros are not read as octal.
    for count_var in before_count fetched_count after_count; do
      if [[ "${!count_var}" =~ ^[0-9]+$ ]]; then
        printf -v "$count_var" '%d' "$((10#${!count_var}))"
      else
        printf -v "$count_var" '%d' 0
      fi
    done
    delta=$((after_count - before_count))

    channels_json=$(
      jq -cn \
        --argjson arr "$channels_json" \
        --arg target "$target_name" \
        --arg channel_id "$channel_id" \
        --arg guild_label "$guild_label" \
        --arg file_path "$file_path" \
        --arg action "$action" \
        --argjson before_count "$before_count" \
        --argjson fetched_count "$fetched_count" \
        --argjson after_count "$after_count" \
        --argjson delta "$delta" \
        '$arr + [{
          target: $target,
          channel_id: $channel_id,
          guild_label: $guild_label,
          file_path: $file_path,
          action: $action,
          before_count: $before_count,
          fetched_count: $fetched_count,
          after_count: $after_count,
          delta: $delta
        }]'
    ) || return 1
  done

  summary_json=$(
    jq -cn \
      --arg finished_at "$(timestamp)" \
      --argjson channels "$channels_json" \
      --argjson created "$created" \
      --argjson merged "$merged" \
      --argjson unchanged "$unchanged" \
      --argjson skipped "$skipped" \
      --argjson skipped_oom "$skipped_oom" \
      --argjson messages_appended "$appended" \
      '{
        version: 1,
        finished_at: $finished_at,
        totals: {
          created: $created,
          merged: $merged,
          unchanged: $unchanged,
          skipped: $skipped,
          skipped_oom: $skipped_oom,
          messages_appended: $messages_appended
        },
        channels: $channels
      }'
  ) || return 1

  if [[ "${DCE_RUN_SUMMARY_JSON:-0}" == "1" ]]; then
    # summary_json is already compact (built with jq -c), so log it as-is.
    log "DCE_JSON_SUMMARY: $summary_json"
  fi

  if [[ -n "${DCE_RUN_SUMMARY_FILE:-}" ]]; then
    mkdir -p -- "$(dirname -- "$DCE_RUN_SUMMARY_FILE")" || return 1
    jq . <<<"$summary_json" >"$DCE_RUN_SUMMARY_FILE" || return 1
    log "JSON summary written: $DCE_RUN_SUMMARY_FILE"
  fi
}

View file

@ -502,5 +502,33 @@ if ( commit_merged_export "$SHRINK_EXISTING" "$SHRINK_MERGED" >/dev/null 2>&1 );
fi
[[ "$(jq -r '.messages | length' "$SHRINK_EXISTING")" == "2" ]] || { echo "existing archive changed after rejected shrink merge" >&2; exit 1; }
# --- JSON run summary: file output -------------------------------------------
# Run one scrape of the "idempotent" target against the fake CLI in append mode
# with only DCE_RUN_SUMMARY_FILE set; stderr is captured to json-scrape.log.
SUMMARY_JSON="$TMP_DIR/scrape-summary.json"
DCE_RUN_SUMMARY_FILE="$SUMMARY_JSON" \
DISCORD_TOKEN=dummy \
DCE_CLI_BIN="$FAKE_CLI" \
DCE_PRIMARY_CONFIG="$CONFIG_PATH" \
DCE_FALLBACK_CONFIG="$CONFIG_PATH" \
FAKE_DCE_FIXTURE_DIR="$FIXTURE_DIR" \
FAKE_DCE_MODE=append \
"$REPO_ROOT/scripts/run-discord-scrape.sh" scrape --target idempotent 2>"$TMP_DIR/json-scrape.log"
# The file must parse as JSON, carry schema version 1, list at least one
# channel, and count at least one merged-or-unchanged channel in the totals.
# On failure the file is dumped to stderr for debugging.
jq -e '.version == 1 and (.channels | length) >= 1 and (.totals.merged + .totals.unchanged) >= 1' "$SUMMARY_JSON" >/dev/null || {
echo "expected valid JSON scrape summary file" >&2
cat "$SUMMARY_JSON" >&2
exit 1
}
# --- JSON run summary: inline log line ---------------------------------------
# Same scrape with DCE_RUN_SUMMARY_JSON=1 as well; stderr goes to json-inline.log.
# NOTE(review): the file written by this second run (scrape-summary-inline.json)
# is not inspected here; only the log marker is checked.
DCE_RUN_SUMMARY_JSON=1 \
DCE_RUN_SUMMARY_FILE="$TMP_DIR/scrape-summary-inline.json" \
DISCORD_TOKEN=dummy \
DCE_CLI_BIN="$FAKE_CLI" \
DCE_PRIMARY_CONFIG="$CONFIG_PATH" \
DCE_FALLBACK_CONFIG="$CONFIG_PATH" \
FAKE_DCE_FIXTURE_DIR="$FIXTURE_DIR" \
FAKE_DCE_MODE=append \
"$REPO_ROOT/scripts/run-discord-scrape.sh" scrape --target idempotent 2>"$TMP_DIR/json-inline.log"
# Only the presence of the marker is asserted; the JSON payload after it is
# not parsed.
grep -q 'DCE_JSON_SUMMARY:' "$TMP_DIR/json-inline.log" || {
echo "expected DCE_JSON_SUMMARY line when DCE_RUN_SUMMARY_JSON=1" >&2
exit 1
}
# Final success marker for this smoke script.
echo "U1: append-only merge test coverage passed"