feat(scrape): offline prove smoke and snapshot-only mode

Add --snapshot-only and --compare-snapshots to prove-incremental-append,
fix EXIT trap status, wire prove smoke into CI, and document LFG closure plan.
This commit is contained in:
Boden 2026-05-29 14:12:18 -05:00
parent a2aeaaab9c
commit 10cd2a534d
4 changed files with 174 additions and 2 deletions

View file

@ -79,6 +79,7 @@ jobs:
./scripts/tests/scrape-here-smoke.sh ./scripts/tests/scrape-here-smoke.sh
./scripts/tests/bootstrap-recurring-scrape-smoke.sh ./scripts/tests/bootstrap-recurring-scrape-smoke.sh
./scripts/tests/audit-archive-json-smoke.sh ./scripts/tests/audit-archive-json-smoke.sh
./scripts/tests/prove-incremental-append-smoke.sh
test: test:
# Tests need access to secrets, so we can't run them against PRs because of limited trust # Tests need access to secrets, so we can't run them against PRs because of limited trust

View file

@ -0,0 +1,52 @@
---
title: "feat: LFG closure — prove smoke and workspace bridge"
type: feat
status: complete
date: 2026-05-29
origin: Repeated /lfg — recurring scrape stack complete; close gaps for operators and CI
---
# feat: LFG closure — prove smoke and workspace bridge
## Summary
Recurring scrape is feature-complete on `feat/recurring-cli-scrape`. This slice adds an offline prove smoke test, documents audit/salvage in the GUI zip bridge, and refreshes the open PR summary.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | `prove-incremental-append.sh` supports `--snapshot-only` for offline verification |
| R2 | `scripts/tests/prove-incremental-append-smoke.sh` validates that invalid JSON archives are skipped and that the grow-only comparison passes |
| R3 | CI `recurring-scrape-smoke` job runs prove smoke |
| R4 | `DiscordChatExporter.linux-x64/RECURRING-SCRAPE.md` mentions audit/salvage |
| R5 | PR #1538 body includes plan 018 audit/salvage summary |
## Implementation Units
### U1. Prove snapshot-only mode
**Files:** `scripts/prove-incremental-append.sh`
Add `--snapshot-only` that writes snapshot TSV and exits (no Discord scrape).
### U2. Prove smoke test
**Files:** `scripts/tests/prove-incremental-append-smoke.sh`
Fixture archives: valid JSON, invalid JSON (skipped), then simulate grow-only compare.
### U3. Workspace bridge
**Files:** `../DiscordChatExporter.linux-x64/RECURRING-SCRAPE.md` (a sibling of the repo checkout, recorded in this plan as the operator copy target; update it via `scripts/sync-workspace-bridge.sh`, or edit it directly if the path exists)
Refer to the bridge file with a repo-relative note: it lives beside the repo, at `DiscordChatExporter.linux-x64/RECURRING-SCRAPE.md`.
### U4. PR body refresh
Update PR #1538 via `gh pr edit` with Latest section for `a2aeaaa` and plan 019.
## Verification
- `./scripts/tests/prove-incremental-append-smoke.sh`
- All existing `scripts/tests/*.sh` pass

View file

@ -13,6 +13,8 @@ usage() {
cat <<EOF cat <<EOF
Usage: Usage:
$(basename "$0") --target NAME [--config PATH] $(basename "$0") --target NAME [--config PATH]
$(basename "$0") --target NAME --snapshot-only --snapshot-file PATH [--config PATH]
$(basename "$0") --compare-snapshots BEFORE.tsv AFTER.tsv
Record message counts for every JSON archive under the target's output_dir, Record message counts for every JSON archive under the target's output_dir,
run one incremental scrape, then assert: run one incremental scrape, then assert:
@ -29,7 +31,10 @@ die() {
} }
cleanup() { cleanup() {
[[ -n "$SNAPSHOT_DIR" && -d "$SNAPSHOT_DIR" ]] && rm -rf "$SNAPSHOT_DIR" if [[ -n "${SNAPSHOT_DIR:-}" && -d "$SNAPSHOT_DIR" ]]; then
rm -rf "$SNAPSHOT_DIR"
fi
return 0
} }
require_command() { require_command() {
@ -110,6 +115,10 @@ compare_snapshots() {
main() { main() {
local target="" local target=""
local snapshot_only=0
local snapshot_file=""
local compare_before=""
local compare_after=""
trap cleanup EXIT trap cleanup EXIT
@ -125,6 +134,21 @@ main() {
CONFIG_PATH=$2 CONFIG_PATH=$2
shift 2 shift 2
;; ;;
--snapshot-only)
snapshot_only=1
shift
;;
--snapshot-file)
[[ $# -ge 2 ]] || die "Missing value for --snapshot-file."
snapshot_file=$2
shift 2
;;
--compare-snapshots)
[[ $# -ge 3 ]] || die "Missing paths for --compare-snapshots."
compare_before=$2
compare_after=$3
shift 3
;;
--help|-h) --help|-h)
usage usage
exit 0 exit 0
@ -135,15 +159,32 @@ main() {
esac esac
done done
require_command jq
if [[ -n "$compare_before" ]]; then
[[ -f "$compare_before" ]] || die "Missing snapshot: $compare_before"
[[ -f "$compare_after" ]] || die "Missing snapshot: $compare_after"
compare_snapshots "$compare_before" "$compare_after"
printf 'Snapshot comparison passed.\n'
exit 0
fi
[[ -n "$target" ]] || die "--target is required." [[ -n "$target" ]] || die "--target is required."
require_command jq
[[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH" [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"
local output_dir local output_dir
output_dir=$(target_output_dir "$target") output_dir=$(target_output_dir "$target")
[[ -n "$output_dir" && "$output_dir" != "null" ]] || die "Unknown target: $target" [[ -n "$output_dir" && "$output_dir" != "null" ]] || die "Unknown target: $target"
if (( snapshot_only )); then
[[ -n "$snapshot_file" ]] || die "--snapshot-file is required with --snapshot-only."
snapshot_archives "$output_dir" "$snapshot_file"
[[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir"
printf 'Snapshot written: %s\n' "$snapshot_file"
exit 0
fi
SNAPSHOT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-prove-append.XXXXXX") SNAPSHOT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-prove-append.XXXXXX")
local before_file="$SNAPSHOT_DIR/before.tsv" local before_file="$SNAPSHOT_DIR/before.tsv"
local after_file="$SNAPSHOT_DIR/after.tsv" local after_file="$SNAPSHOT_DIR/after.tsv"

View file

@ -0,0 +1,78 @@
#!/usr/bin/env bash
#
# Offline smoke test for scripts/prove-incremental-append.sh.
# Builds throwaway fixture archives in a scratch directory and drives the
# prove script's --snapshot-only and --compare-snapshots modes against them.
set -Eeuo pipefail

# This script sits two directories below the repository root; resolve that
# root to a physical (symlink-free) absolute path.
REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)

# Script under test.
PROVE=${REPO_ROOT}/scripts/prove-incremental-append.sh

# Per-run scratch space; every other path used by the test hangs off it.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-prove-smoke.XXXXXX")
ARCHIVE_ROOT=${TMP_DIR}/archive
CONFIG_PATH=${TMP_DIR}/config.json
BEFORE=${TMP_DIR}/before.tsv
AFTER=${TMP_DIR}/after.tsv
#######################################
# Remove this run's scratch directory. Registered as the EXIT trap.
# Guarded the same way as cleanup() in scripts/prove-incremental-append.sh
# so the two scripts behave alike.
# Globals:   TMP_DIR (read)
# Arguments: none
# Returns:   0 when there is nothing to remove or the removal succeeded
#######################################
cleanup() {
  # ${TMP_DIR:-} keeps the expansion legal under `set -u` when TMP_DIR is
  # unset; the -d test skips rm when the directory is already gone.
  if [[ -n "${TMP_DIR:-}" && -d "$TMP_DIR" ]]; then
    # `--` ends option parsing so a path starting with '-' is never taken
    # as an rm flag.
    rm -rf -- "$TMP_DIR"
  fi
  return 0
}
trap cleanup EXIT
# Create the output directory for the single target ("demo") used by this test.
mkdir -p "$ARCHIVE_ROOT/demo"
# Fixture 1: a well-formed archive for channel 111111111111111111 holding one
# message. The quoted 'JSON' delimiter writes the body literally (no expansion).
cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
{
"guild": {"id": "1", "name": "Guild"},
"channel": {"id": "111111111111111111", "name": "general"},
"messages": [
{"id": "1", "timestamp": "2020-01-01T00:00:00+00:00", "type": "Default", "content": "one"}
],
"messageCount": 1
}
JSON
# Fixture 2: deliberately truncated (invalid) JSON for channel
# 222222222222222222; the assertions further down expect the snapshot to omit it.
printf '{"messages":[\n' >"$ARCHIVE_ROOT/demo/truncated [222222222222222222].json"
# Config with one enabled guild target named "demo" whose output_dir is the
# fixture directory. The delimiter is unquoted so $ARCHIVE_ROOT expands.
# NOTE(review): the path is interpolated into JSON without escaping, so a
# TMPDIR containing '"' or '\' would yield an invalid config — confirm this is
# acceptable for CI/operator environments.
cat >"$CONFIG_PATH" <<JSON
{
"archive_root": "$ARCHIVE_ROOT",
"targets": [
{
"name": "demo",
"kind": "guild",
"output_dir": "$ARCHIVE_ROOT/demo",
"enabled": true
}
]
}
JSON
# Baseline snapshot of the demo target; the fixture config is handed to the
# prove script through the DCE_PRIMARY_CONFIG environment variable.
DCE_PRIMARY_CONFIG="$CONFIG_PATH" "$PROVE" \
  --target demo \
  --snapshot-only \
  --snapshot-file "$BEFORE"

# The well-formed archive must be listed in the snapshot.
grep -q '111111111111111111' "$BEFORE" || {
  printf 'ERROR: snapshot missing valid channel archive\n' >&2
  exit 1
}

# The truncated archive must not be listed in the snapshot.
! grep -q '222222222222222222' "$BEFORE" || {
  printf 'ERROR: invalid JSON file should be skipped in snapshot\n' >&2
  exit 1
}
# Simulate an incremental append: rewrite the valid archive with a second
# message while keeping the first one intact.
cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
{
"guild": {"id": "1", "name": "Guild"},
"channel": {"id": "111111111111111111", "name": "general"},
"messages": [
{"id": "1", "timestamp": "2020-01-01T00:00:00+00:00", "type": "Default", "content": "one"},
{"id": "2", "timestamp": "2020-01-02T00:00:00+00:00", "type": "Default", "content": "two"}
],
"messageCount": 2
}
JSON

# Second snapshot, taken after the simulated growth.
DCE_PRIMARY_CONFIG="$CONFIG_PATH" "$PROVE" \
  --target demo \
  --snapshot-only \
  --snapshot-file "$AFTER"

# Forward direction (before -> after) has to pass; under `set -e` a non-zero
# exit here aborts the test.
"$PROVE" --compare-snapshots "$BEFORE" "$AFTER"

# Reverse direction (after -> before) looks like a shrink and has to be
# rejected. Its stderr is discarded because a failure is the expected outcome.
! "$PROVE" --compare-snapshots "$AFTER" "$BEFORE" 2>/dev/null || {
  printf 'ERROR: shrink comparison should have failed\n' >&2
  exit 1
}

printf 'prove-incremental-append-smoke: ok\n'