mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
feat(prove): filter incremental snapshots by --channel
Channel-scoped proof runs snapshot and compare only selected archives, so yes_general-focused validation ignores unrelated KotOR channels. Smoke covers filtered snapshot-only mode; exclude .dce-temp from find.
This commit is contained in:
parent
a827e6b9bc
commit
3e96514f3e
|
|
@ -0,0 +1,48 @@
|
|||
---
|
||||
title: "feat: Channel-filtered prove-incremental-append snapshots"
|
||||
type: feat
|
||||
status: complete
|
||||
date: 2026-06-04
|
||||
origin: /lfg — prove-incremental-append accepts --channel for scrape but snapshots all archives; yes_general proof should assert only the target channel
|
||||
---
|
||||
|
||||
# feat: Channel-filtered prove-incremental-append snapshots
|
||||
|
||||
## Summary
|
||||
|
||||
When `prove-incremental-append.sh` is invoked with `--channel`, limit the before/after snapshots and the grow-only comparison to the selected channel IDs.
|
||||
|
||||
## Problem
|
||||
|
||||
KotOR targets have dozens of channel JSON files. A yes_general-only proof run still snapshots and compares every archive, which makes failures harder to interpret and puts unrelated channels on the pass/fail surface.
|
||||
|
||||
## Requirements
|
||||
|
||||
| ID | Requirement |
|
||||
|----|-------------|
|
||||
| R1 | `snapshot_archives` skips archives whose channel ID is not in the `--channel` filter when the filter is non-empty |
|
||||
| R2 | Full prove flow applies the same filter to before and after snapshots |
|
||||
| R3 | `--snapshot-only` honors `--channel` filter |
|
||||
| R4 | Usage documents channel-scoped snapshot behavior |
|
||||
| R5 | Smoke asserts filtered snapshot excludes other valid channels |
|
||||
| R6 | `run-all-smokes.sh` → 21/21 |
|
||||
|
||||
## Implementation Units
|
||||
|
||||
### U1. Filtered snapshots
|
||||
|
||||
**Files:** `scripts/prove-incremental-append.sh`, `scripts/tests/prove-incremental-append-smoke.sh`
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
./scripts/tests/prove-incremental-append-smoke.sh
|
||||
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||
```
|
||||
|
||||
## Scope Boundaries
|
||||
|
||||
### Deferred
|
||||
|
||||
- Live KotOR catch-up on host
|
||||
- Per-target memory in config JSON
|
||||
|
|
@ -158,6 +158,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
|
|||
|
||||
**Plan 065 (2026-06-04):** Scrape summary labels OOM skips as `SKIPPED (OOM/aborted)` with operator hint; `verify-operator-ready` prints configured container memory.
|
||||
|
||||
**Plan 066 (2026-06-04):** `prove-incremental-append --channel` filters snapshots and grow-only comparison to selected channels.
|
||||
|
||||
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
||||
|
||||
## CI note (fork PRs)
|
||||
|
|
|
|||
|
|
@ -16,12 +16,12 @@ Usage:
|
|||
$(basename "$0") --target NAME --snapshot-only --snapshot-file PATH [--config PATH]
|
||||
$(basename "$0") --compare-snapshots BEFORE.tsv AFTER.tsv
|
||||
|
||||
Record message counts for every JSON archive under the target's output_dir,
|
||||
Record message counts for JSON archives under the target's output_dir,
|
||||
run one incremental scrape, then assert:
|
||||
- archive file paths are unchanged (no parallel channels/ fallbacks)
|
||||
- message counts never shrink
|
||||
|
||||
--channel ID Limit incremental scrape to channel ID (repeatable; requires --target)
|
||||
--channel ID Limit scrape and snapshot/compare to channel ID (repeatable)
|
||||
|
||||
Requires valid Discord auth (scrape.env, exported DISCORD_TOKEN, or token file).
|
||||
EOF
|
||||
|
|
@ -52,9 +52,24 @@ target_output_dir() {
|
|||
' "$CONFIG_PATH"
|
||||
}
|
||||
|
||||
# Decide whether a channel ID passes the channel filter.
#
# Arguments:
#   $1   - channel ID to check
#   $2.. - allowed channel IDs; when none are given, every channel is allowed
# Returns:
#   0 if the filter is empty or contains $1, 1 otherwise
snapshot_channel_allowed() {
  local channel_id=$1
  shift
  local -a filter_ids=("$@")
  local id

  # An empty filter means "no restriction".
  ((${#filter_ids[@]} == 0)) && return 0
  for id in "${filter_ids[@]}"; do
    # Quoted right-hand side: compared literally, never as a glob pattern.
    [[ "$id" == "$channel_id" ]] && return 0
  done
  return 1
}
|
||||
|
||||
snapshot_archives() {
|
||||
local output_dir=$1
|
||||
local snapshot_file=$2
|
||||
shift 2
|
||||
local -a channel_filter=("$@")
|
||||
local file_path file_name channel_id count
|
||||
|
||||
: >"$snapshot_file"
|
||||
|
|
@ -65,6 +80,9 @@ snapshot_archives() {
|
|||
file_name=$(basename "$file_path")
|
||||
if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
|
||||
channel_id=${BASH_REMATCH[1]}
|
||||
if ! snapshot_channel_allowed "$channel_id" "${channel_filter[@]}"; then
|
||||
continue
|
||||
fi
|
||||
if ! jq empty "$file_path" >/dev/null 2>&1; then
|
||||
printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2
|
||||
continue
|
||||
|
|
@ -72,7 +90,7 @@ snapshot_archives() {
|
|||
count=$(jq -r '(.messages | length) // 0' "$file_path")
|
||||
printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file"
|
||||
fi
|
||||
done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
|
||||
done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' ! -path '*/.dce-temp/*' -print0 2>/dev/null)
|
||||
}
|
||||
|
||||
compare_snapshots() {
|
||||
|
|
@ -122,6 +140,7 @@ main() {
|
|||
local compare_before=""
|
||||
local compare_after=""
|
||||
local -a channel_args=()
|
||||
local -a channel_ids=()
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
|
|
@ -155,6 +174,7 @@ main() {
|
|||
--channel)
|
||||
[[ $# -ge 2 ]] || die "Missing value for --channel."
|
||||
channel_args+=(--channel "$2")
|
||||
channel_ids+=("$2")
|
||||
shift 2
|
||||
;;
|
||||
--help|-h)
|
||||
|
|
@ -187,8 +207,8 @@ main() {
|
|||
|
||||
if (( snapshot_only )); then
|
||||
[[ -n "$snapshot_file" ]] || die "--snapshot-file is required with --snapshot-only."
|
||||
snapshot_archives "$output_dir" "$snapshot_file"
|
||||
[[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir"
|
||||
snapshot_archives "$output_dir" "$snapshot_file" "${channel_ids[@]}"
|
||||
[[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir for channel filter."
|
||||
printf 'Snapshot written: %s\n' "$snapshot_file"
|
||||
exit 0
|
||||
fi
|
||||
|
|
@ -197,9 +217,12 @@ main() {
|
|||
local before_file="$SNAPSHOT_DIR/before.tsv"
|
||||
local after_file="$SNAPSHOT_DIR/after.tsv"
|
||||
|
||||
snapshot_archives "$output_dir" "$before_file"
|
||||
[[ -s "$before_file" ]] || die "No seeded archives found under $output_dir"
|
||||
snapshot_archives "$output_dir" "$before_file" "${channel_ids[@]}"
|
||||
[[ -s "$before_file" ]] || die "No seeded archives found under $output_dir for channel filter."
|
||||
|
||||
if ((${#channel_ids[@]} > 0)); then
|
||||
printf 'Channel-scoped proof for %s channel(s).\n' "${#channel_ids[@]}"
|
||||
fi
|
||||
printf 'Running incremental scrape for target %s...\n' "$target"
|
||||
local container_config="$CONTAINER_CONFIG"
|
||||
case "$CONFIG_PATH" in
|
||||
|
|
@ -208,7 +231,7 @@ main() {
|
|||
esac
|
||||
"$HOST_RUNNER" scrape --config "$container_config" --target "$target" "${channel_args[@]}"
|
||||
|
||||
snapshot_archives "$output_dir" "$after_file"
|
||||
snapshot_archives "$output_dir" "$after_file" "${channel_ids[@]}"
|
||||
compare_snapshots "$before_file" "$after_file"
|
||||
printf 'Append-safe proof passed for target %s.\n' "$target"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,17 @@ cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
|
|||
}
|
||||
JSON
|
||||
|
||||
cat >"$ARCHIVE_ROOT/demo/Guild - other [333333333333333333].json" <<'JSON'
|
||||
{
|
||||
"guild": {"id": "1", "name": "Guild"},
|
||||
"channel": {"id": "333333333333333333", "name": "other"},
|
||||
"messages": [
|
||||
{"id": "9", "timestamp": "2020-01-01T00:00:00+00:00", "type": "Default", "content": "other"}
|
||||
],
|
||||
"messageCount": 1
|
||||
}
|
||||
JSON
|
||||
|
||||
printf '{"messages":[\n' >"$ARCHIVE_ROOT/demo/truncated [222222222222222222].json"
|
||||
|
||||
cat >"$CONFIG_PATH" <<JSON
|
||||
|
|
@ -55,6 +66,17 @@ if grep -q '222222222222222222' "$BEFORE"; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
FILTERED="$TMP_DIR/filtered.tsv"
|
||||
DCE_PRIMARY_CONFIG="$CONFIG_PATH" "$PROVE" --target demo --snapshot-only --snapshot-file "$FILTERED" --channel 111111111111111111
|
||||
grep -q '111111111111111111' "$FILTERED" || {
|
||||
printf 'ERROR: channel-filtered snapshot missing target channel\n' >&2
|
||||
exit 1
|
||||
}
|
||||
if grep -q '333333333333333333' "$FILTERED"; then
|
||||
printf 'ERROR: channel-filtered snapshot should exclude other valid channels\n' >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
|
||||
{
|
||||
"guild": {"id": "1", "name": "Guild"},
|
||||
|
|
|
|||
Loading…
Reference in a new issue