diff --git a/docs/plans/2026-06-04-066-feat-prove-channel-filtered-snapshots-plan.md b/docs/plans/2026-06-04-066-feat-prove-channel-filtered-snapshots-plan.md new file mode 100644 index 00000000..53bf2965 --- /dev/null +++ b/docs/plans/2026-06-04-066-feat-prove-channel-filtered-snapshots-plan.md @@ -0,0 +1,48 @@ +--- +title: "feat: Channel-filtered prove-incremental-append snapshots" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — prove-incremental-append accepts --channel for scrape but snapshots all archives; yes_general proof should assert only the target channel +--- + +# feat: Channel-filtered prove-incremental-append snapshots + +## Summary + +When `prove-incremental-append.sh` is invoked with `--channel`, limit before/after snapshots and grow-only comparison to those channel IDs only. + +## Problem + +KotOR targets have dozens of channel JSON files. A yes_general-only proof run still snapshots and compares every archive, making failures harder to interpret and unrelated channels part of the pass/fail surface. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `snapshot_archives` skips archives whose channel ID is not in the `--channel` filter when filter is non-empty | +| R2 | Full prove flow applies the same filter to before and after snapshots | +| R3 | `--snapshot-only` honors `--channel` filter | +| R4 | Usage documents channel-scoped snapshot behavior | +| R5 | Smoke asserts filtered snapshot excludes other valid channels | +| R6 | `run-all-smokes.sh` → 21/21 | + +## Implementation Units + +### U1. Filtered snapshots + +**Files:** `scripts/prove-incremental-append.sh`, `scripts/tests/prove-incremental-append-smoke.sh` + +## Verification + +```bash +./scripts/tests/prove-incremental-append-smoke.sh +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Per-target memory in config JSON diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index aeb52e9c..8e1de662 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -158,6 +158,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 065 (2026-06-04):** Scrape summary labels OOM skips as `SKIPPED (OOM/aborted)` with operator hint; `verify-operator-ready` prints configured container memory. +**Plan 066 (2026-06-04):** `prove-incremental-append --channel` filters snapshots and grow-only comparison to selected channels. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. ## CI note (fork PRs) diff --git a/scripts/prove-incremental-append.sh b/scripts/prove-incremental-append.sh index 543308dc..f11e7b94 100755 --- a/scripts/prove-incremental-append.sh +++ b/scripts/prove-incremental-append.sh @@ -16,12 +16,12 @@ Usage: $(basename "$0") --target NAME --snapshot-only --snapshot-file PATH [--config PATH] $(basename "$0") --compare-snapshots BEFORE.tsv AFTER.tsv -Record message counts for every JSON archive under the target's output_dir, +Record message counts for JSON archives under the target's output_dir, run one incremental scrape, then assert: - archive file paths are unchanged (no parallel channels/ fallbacks) - message counts never shrink - --channel ID Limit incremental scrape to channel ID (repeatable; requires --target) + --channel ID Limit scrape and snapshot/compare to channel ID (repeatable) Requires valid Discord auth (scrape.env, exported DISCORD_TOKEN, or token file). EOF @@ -52,9 +52,24 @@ target_output_dir() { ' "$CONFIG_PATH" } +snapshot_channel_allowed() { + local channel_id=$1 + shift + local -a filter_ids=("$@") + local id + + ((${#filter_ids[@]} == 0)) && return 0 + for id in "${filter_ids[@]}"; do + [[ "$id" == "$channel_id" ]] && return 0 + done + return 1 +} + snapshot_archives() { local output_dir=$1 local snapshot_file=$2 + shift 2 + local -a channel_filter=("$@") local file_path file_name channel_id count : >"$snapshot_file" @@ -65,6 +80,9 @@ snapshot_archives() { file_name=$(basename "$file_path") if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then channel_id=${BASH_REMATCH[1]} + if ! snapshot_channel_allowed "$channel_id" "${channel_filter[@]}"; then + continue + fi if ! jq empty "$file_path" >/dev/null 2>&1; then printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2 continue @@ -72,7 +90,7 @@ snapshot_archives() { count=$(jq -r '(.messages | length) // 0' "$file_path") printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file" fi - done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null) + done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' ! -path '*/.dce-temp/*' -print0 2>/dev/null) } compare_snapshots() { @@ -122,6 +140,7 @@ main() { local compare_before="" local compare_after="" local -a channel_args=() + local -a channel_ids=() trap cleanup EXIT @@ -155,6 +174,7 @@ main() { --channel) [[ $# -ge 2 ]] || die "Missing value for --channel." channel_args+=(--channel "$2") + channel_ids+=("$2") shift 2 ;; --help|-h) @@ -187,8 +207,8 @@ main() { if (( snapshot_only )); then [[ -n "$snapshot_file" ]] || die "--snapshot-file is required with --snapshot-only." - snapshot_archives "$output_dir" "$snapshot_file" - [[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir" + snapshot_archives "$output_dir" "$snapshot_file" "${channel_ids[@]}" + [[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir for channel filter." printf 'Snapshot written: %s\n' "$snapshot_file" exit 0 fi @@ -197,9 +217,12 @@ main() { local before_file="$SNAPSHOT_DIR/before.tsv" local after_file="$SNAPSHOT_DIR/after.tsv" - snapshot_archives "$output_dir" "$before_file" - [[ -s "$before_file" ]] || die "No seeded archives found under $output_dir" + snapshot_archives "$output_dir" "$before_file" "${channel_ids[@]}" + [[ -s "$before_file" ]] || die "No seeded archives found under $output_dir for channel filter." + if ((${#channel_ids[@]} > 0)); then + printf 'Channel-scoped proof for %s channel(s).\n' "${#channel_ids[@]}" + fi printf 'Running incremental scrape for target %s...\n' "$target" local container_config="$CONTAINER_CONFIG" case "$CONFIG_PATH" in @@ -208,7 +231,7 @@ main() { esac "$HOST_RUNNER" scrape --config "$container_config" --target "$target" "${channel_args[@]}" - snapshot_archives "$output_dir" "$after_file" + snapshot_archives "$output_dir" "$after_file" "${channel_ids[@]}" compare_snapshots "$before_file" "$after_file" printf 'Append-safe proof passed for target %s.\n' "$target" } diff --git a/scripts/tests/prove-incremental-append-smoke.sh b/scripts/tests/prove-incremental-append-smoke.sh index 1a12a34d..966525e9 100755 --- a/scripts/tests/prove-incremental-append-smoke.sh +++ b/scripts/tests/prove-incremental-append-smoke.sh @@ -28,6 +28,17 @@ cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON' } JSON +cat >"$ARCHIVE_ROOT/demo/Guild - other [333333333333333333].json" <<'JSON' +{ + "guild": {"id": "1", "name": "Guild"}, + "channel": {"id": "333333333333333333", "name": "other"}, + "messages": [ + {"id": "9", "timestamp": "2020-01-01T00:00:00+00:00", "type": "Default", "content": "other"} + ], + "messageCount": 1 +} +JSON + printf '{"messages":[\n' >"$ARCHIVE_ROOT/demo/truncated [222222222222222222].json" cat >"$CONFIG_PATH" <&2 + exit 1 +} +if grep -q '333333333333333333' "$FILTERED"; then + printf 'ERROR: channel-filtered snapshot should exclude other valid channels\n' >&2 + exit 1 +fi + cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON' { "guild": {"id": "1", "name": "Guild"},