mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
feat(scrape): auto JSON summary on documents scrape runs
Enable DCE_RUN_SUMMARY_JSON by default for live run-documents-scrape paths with optional --summary-file override; skip on dry-run/salvage-only.
This commit is contained in:
parent
a929be48e8
commit
8c36fdbdda
|
|
@ -0,0 +1,53 @@
|
||||||
|
---
|
||||||
|
title: "feat: Auto JSON summary on documents scrape"
|
||||||
|
type: feat
|
||||||
|
status: complete
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — plan 074 deferred auto-enable JSON summary on bare scrape entrypoints; cron uses run-documents-scrape.sh
|
||||||
|
---
|
||||||
|
|
||||||
|
# feat: Auto JSON summary on documents scrape
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
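When `run-documents-scrape.sh` performs a live Discord scrape, it auto-enables `DCE_RUN_SUMMARY_JSON=1` and writes the summary to `logs/documents-scrape-<UTC>.summary.json`, unless the operator has already set `DCE_RUN_SUMMARY_FILE` or passes `--summary-file`, in which case that path is used instead. A pre-set `DCE_RUN_SUMMARY_FILE` takes precedence over `--summary-file`.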
|
||||||
|
|
||||||
|
## Problem Frame
|
||||||
|
|
||||||
|
The operator validation and proof runs already auto-export JSON summaries (plans 070–073). The primary incremental path — `run-documents-scrape.sh` and the monthly cron — still requires manually set env vars to get machine-readable totals. Host runner recovery (plan 072) can populate the summary file from compose logs when those env vars are set.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | Live scrape path exports `DCE_RUN_SUMMARY_JSON=1` when not dry-run/salvage-only |
|
||||||
|
| R2 | Default `DCE_RUN_SUMMARY_FILE` to `logs/documents-scrape-<UTC>.summary.json` when unset |
|
||||||
|
| R3 | Optional `--summary-file PATH` overrides the default destination (ignored when `DCE_RUN_SUMMARY_FILE` is already set) |
|
||||||
|
| R4 | Prints `JSON summary file:` before preflight/scrape |
|
||||||
|
| R5 | Dry-run and salvage-only do not enable JSON export |
|
||||||
|
| R6 | `documents-scrape-smoke.sh` asserts summary path on live scrape and absence on dry-run |
|
||||||
|
| R7 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 23/23 |
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. run-documents-scrape.sh
|
||||||
|
|
||||||
|
**Files:** `scripts/run-documents-scrape.sh`, `scripts/tests/documents-scrape-smoke.sh`
|
||||||
|
|
||||||
|
### U2. Docs
|
||||||
|
|
||||||
|
**Files:** `docs/recurring-scrape-merge-readiness.md`, `scrape.env.example`
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### Deferred
|
||||||
|
|
||||||
|
- Live KotOR catch-up on host
|
||||||
|
- Per-target separate summary files in multi-target proof/validation loops
|
||||||
|
- Tee full documents-scrape stdout to a log file
|
||||||
|
|
@ -178,6 +178,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
|
||||||
|
|
||||||
**Plan 074 (2026-06-04):** `print-scrape-summary.sh` pretty-prints `*.summary.json` (`--json`, `--oom-only`, stdin `-`).
|
**Plan 074 (2026-06-04):** `print-scrape-summary.sh` pretty-prints `*.summary.json` (`--json`, `--oom-only`, stdin `-`).
|
||||||
|
|
||||||
|
**Plan 075 (2026-06-04):** `run-documents-scrape.sh` auto-writes `logs/documents-scrape-<UTC>.summary.json` on live scrapes.
|
||||||
|
|
||||||
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
||||||
|
|
||||||
## CI note (fork PRs)
|
## CI note (fork PRs)
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,8 @@ DCE_USERNS_MODE=
|
||||||
# DCE_CONTAINER_MEMORY=8g
|
# DCE_CONTAINER_MEMORY=8g
|
||||||
|
|
||||||
# Optional: machine-readable scrape summary (run-discord-scrape.sh).
|
# Optional: machine-readable scrape summary (run-discord-scrape.sh).
|
||||||
|
# run-documents-scrape.sh, run-operator-validation.sh, and run-operator-proof.sh
|
||||||
|
# auto-enable summary export on live scrapes unless these are already set.
|
||||||
# Host paths under logs/ map to /logs/ in the container (see docker-compose.yml).
|
# Host paths under logs/ map to /logs/ in the container (see docker-compose.yml).
|
||||||
# DCE_RUN_SUMMARY_JSON=1
|
# DCE_RUN_SUMMARY_JSON=1
|
||||||
# DCE_RUN_SUMMARY_FILE=logs/scrape-summary.json
|
# DCE_RUN_SUMMARY_FILE=logs/scrape-summary.json
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
|
||||||
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
||||||
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
|
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
|
||||||
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
|
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
|
||||||
|
LOG_DIR="${DCE_LOG_DIR:-$REPO_ROOT/logs}"
|
||||||
# shellcheck source=lib/scrape-lock.sh
|
# shellcheck source=lib/scrape-lock.sh
|
||||||
source "$SCRIPT_DIR/lib/scrape-lock.sh"
|
source "$SCRIPT_DIR/lib/scrape-lock.sh"
|
||||||
# shellcheck source=lib/scrape-run-plan.sh
|
# shellcheck source=lib/scrape-run-plan.sh
|
||||||
|
|
@ -35,6 +36,7 @@ Options:
|
||||||
--target NAME Limit preflight/scrape to one configured target
|
--target NAME Limit preflight/scrape to one configured target
|
||||||
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
|
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
|
||||||
--config PATH Scrape target config (default: config/scrape-targets.json)
|
--config PATH Scrape target config (default: config/scrape-targets.json)
|
||||||
|
--summary-file PATH Machine-readable scrape summary JSON (default: logs/documents-scrape-UTC.summary.json)
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -71,6 +73,7 @@ main() {
|
||||||
local salvage_only=0
|
local salvage_only=0
|
||||||
local salvage_before=0
|
local salvage_before=0
|
||||||
local target=""
|
local target=""
|
||||||
|
local summary_file=""
|
||||||
local -a passthrough=()
|
local -a passthrough=()
|
||||||
|
|
||||||
while (($#)); do
|
while (($#)); do
|
||||||
|
|
@ -104,6 +107,11 @@ main() {
|
||||||
passthrough+=(--config "$2")
|
passthrough+=(--config "$2")
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--summary-file)
|
||||||
|
[[ $# -ge 2 ]] || die "Missing value for --summary-file."
|
||||||
|
summary_file=$2
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
usage
|
usage
|
||||||
exit 0
|
exit 0
|
||||||
|
|
@ -174,6 +182,17 @@ main() {
|
||||||
"$SETUP_AUTH" 2>/dev/null || true
|
"$SETUP_AUTH" 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
export DCE_RUN_SUMMARY_JSON=1
|
||||||
|
if [[ -z "${DCE_RUN_SUMMARY_FILE:-}" ]]; then
|
||||||
|
if [[ -n "$summary_file" ]]; then
|
||||||
|
export DCE_RUN_SUMMARY_FILE="$summary_file"
|
||||||
|
else
|
||||||
|
mkdir -p "$LOG_DIR"
|
||||||
|
export DCE_RUN_SUMMARY_FILE="$LOG_DIR/documents-scrape-$(date -u +%Y%m%dT%H%M%SZ).summary.json"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
printf 'JSON summary file: %s\n' "$DCE_RUN_SUMMARY_FILE"
|
||||||
|
|
||||||
"$HOST_RUNNER" preflight "${container_args[@]}"
|
"$HOST_RUNNER" preflight "${container_args[@]}"
|
||||||
"$HOST_RUNNER" scrape "${container_args[@]}"
|
"$HOST_RUNNER" scrape "${container_args[@]}"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -83,6 +83,10 @@ grep -q 'Documents scrape run plan' "$DOC_OUT" || {
|
||||||
echo "expected Documents scrape run plan in dry-run output" >&2
|
echo "expected Documents scrape run plan in dry-run output" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'JSON summary file:' "$DOC_OUT" && {
|
||||||
|
echo "dry-run should not enable JSON summary export" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
CHANNEL_DRY="$TMP_DIR/channel-dry-run.log"
|
CHANNEL_DRY="$TMP_DIR/channel-dry-run.log"
|
||||||
"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$CHANNEL_DRY" 2>&1
|
"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$CHANNEL_DRY" 2>&1
|
||||||
|
|
@ -90,6 +94,10 @@ grep -q 'Documents scrape run plan' "$CHANNEL_DRY" || {
|
||||||
echo "expected dry-run to accept --channel passthrough" >&2
|
echo "expected dry-run to accept --channel passthrough" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'JSON summary file:' "$CHANNEL_DRY" && {
|
||||||
|
echo "dry-run with --channel should not enable JSON summary export" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
ARGS_LOG="$TMP_DIR/compose-args.log"
|
ARGS_LOG="$TMP_DIR/compose-args.log"
|
||||||
cat >"$FAKE_DOCKER" <<'EOF'
|
cat >"$FAKE_DOCKER" <<'EOF'
|
||||||
|
|
@ -100,13 +108,20 @@ EOF
|
||||||
chmod +x "$FAKE_DOCKER"
|
chmod +x "$FAKE_DOCKER"
|
||||||
printf 'DISCORD_TOKEN=dummy-token\n' >"$TMP_DIR/scrape.env"
|
printf 'DISCORD_TOKEN=dummy-token\n' >"$TMP_DIR/scrape.env"
|
||||||
|
|
||||||
|
LIVE_DOC_OUT="$TMP_DIR/documents-live.log"
|
||||||
DCE_MIN_FREE_MB=0 \
|
DCE_MIN_FREE_MB=0 \
|
||||||
DCE_SKIP_SCRAPE_LOCK=1 \
|
DCE_SKIP_SCRAPE_LOCK=1 \
|
||||||
DCE_DOCKER_BIN="$FAKE_DOCKER" \
|
DCE_DOCKER_BIN="$FAKE_DOCKER" \
|
||||||
FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \
|
FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \
|
||||||
DCE_ENV_FILE="$TMP_DIR/scrape.env" \
|
DCE_ENV_FILE="$TMP_DIR/scrape.env" \
|
||||||
"$REPO_ROOT/scripts/run-documents-scrape.sh" --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >/dev/null
|
DCE_LOG_DIR="$TMP_DIR/logs" \
|
||||||
|
"$REPO_ROOT/scripts/run-documents-scrape.sh" --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$LIVE_DOC_OUT" 2>&1
|
||||||
|
|
||||||
|
grep -q 'JSON summary file:' "$LIVE_DOC_OUT" || {
|
||||||
|
echo "expected live documents scrape to enable JSON summary export" >&2
|
||||||
|
cat "$LIVE_DOC_OUT" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
grep -q '111111111111111111' "$ARGS_LOG" || {
|
grep -q '111111111111111111' "$ARGS_LOG" || {
|
||||||
echo "expected --channel to reach container compose invocation" >&2
|
echo "expected --channel to reach container compose invocation" >&2
|
||||||
cat "$ARGS_LOG" >&2
|
cat "$ARGS_LOG" >&2
|
||||||
|
|
@ -128,6 +143,10 @@ grep -q 'salvage completed' "$SALVAGE_DOC_LOG" || {
|
||||||
cat "$SALVAGE_DOC_LOG" >&2
|
cat "$SALVAGE_DOC_LOG" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'JSON summary file:' "$SALVAGE_DOC_LOG" && {
|
||||||
|
echo "salvage-only should not enable JSON summary export" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
SALVAGE_BEFORE_LOG="$TMP_DIR/salvage-before.log"
|
SALVAGE_BEFORE_LOG="$TMP_DIR/salvage-before.log"
|
||||||
: >"$ARGS_LOG"
|
: >"$ARGS_LOG"
|
||||||
|
|
@ -152,6 +171,11 @@ grep -q 'compose' "$ARGS_LOG" || {
|
||||||
cat "$ARGS_LOG" >&2
|
cat "$ARGS_LOG" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'JSON summary file:' "$SALVAGE_BEFORE_LOG" || {
|
||||||
|
echo "expected --salvage-before-scrape live path to enable JSON summary export" >&2
|
||||||
|
cat "$SALVAGE_BEFORE_LOG" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
command -v flock >/dev/null 2>&1 && {
|
command -v flock >/dev/null 2>&1 && {
|
||||||
LOCK_FILE="$TMP_DIR/.dce-scrape.lock"
|
LOCK_FILE="$TMP_DIR/.dce-scrape.lock"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue