mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
feat(prove): filter incremental snapshots by --channel
Channel-scoped proof runs snapshot and compare only selected archives, so yes_general-focused validation ignores unrelated KotOR channels. Smoke covers filtered snapshot-only mode; exclude .dce-temp from find.
This commit is contained in:
parent
a827e6b9bc
commit
3e96514f3e
|
|
@ -0,0 +1,48 @@
|
||||||
|
---
|
||||||
|
title: "feat: Channel-filtered prove-incremental-append snapshots"
|
||||||
|
type: feat
|
||||||
|
status: complete
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — prove-incremental-append accepts --channel for scrape but snapshots all archives; yes_general proof should assert only the target channel
|
||||||
|
---
|
||||||
|
|
||||||
|
# feat: Channel-filtered prove-incremental-append snapshots
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
When `prove-incremental-append.sh` is invoked with `--channel`, limit the before/after snapshots and the grow-only comparison to the given channel IDs.
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
KotOR targets have dozens of channel JSON files. A yes_general-only proof run still snapshots and compares every archive, which makes failures harder to interpret and puts unrelated channels on the pass/fail surface.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | `snapshot_archives` skips archives whose channel ID is not in the `--channel` filter when filter is non-empty |
|
||||||
|
| R2 | Full prove flow applies the same filter to before and after snapshots |
|
||||||
|
| R3 | `--snapshot-only` honors `--channel` filter |
|
||||||
|
| R4 | Usage documents channel-scoped snapshot behavior |
|
||||||
|
| R5 | Smoke asserts filtered snapshot excludes other valid channels |
|
||||||
|
| R6 | `run-all-smokes.sh` → 21/21 |
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. Filtered snapshots
|
||||||
|
|
||||||
|
**Files:** `scripts/prove-incremental-append.sh`, `scripts/tests/prove-incremental-append-smoke.sh`
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/tests/prove-incremental-append-smoke.sh
|
||||||
|
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### Deferred
|
||||||
|
|
||||||
|
- Live KotOR catch-up on host
|
||||||
|
- Per-target memory in config JSON
|
||||||
|
|
@ -158,6 +158,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
|
||||||
|
|
||||||
**Plan 065 (2026-06-04):** Scrape summary labels OOM skips as `SKIPPED (OOM/aborted)` with operator hint; `verify-operator-ready` prints configured container memory.
|
**Plan 065 (2026-06-04):** Scrape summary labels OOM skips as `SKIPPED (OOM/aborted)` with operator hint; `verify-operator-ready` prints configured container memory.
|
||||||
|
|
||||||
|
**Plan 066 (2026-06-04):** `prove-incremental-append --channel` filters snapshots and grow-only comparison to selected channels.
|
||||||
|
|
||||||
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
||||||
|
|
||||||
## CI note (fork PRs)
|
## CI note (fork PRs)
|
||||||
|
|
|
||||||
|
|
@ -16,12 +16,12 @@ Usage:
|
||||||
$(basename "$0") --target NAME --snapshot-only --snapshot-file PATH [--config PATH]
|
$(basename "$0") --target NAME --snapshot-only --snapshot-file PATH [--config PATH]
|
||||||
$(basename "$0") --compare-snapshots BEFORE.tsv AFTER.tsv
|
$(basename "$0") --compare-snapshots BEFORE.tsv AFTER.tsv
|
||||||
|
|
||||||
Record message counts for every JSON archive under the target's output_dir,
|
Record message counts for JSON archives under the target's output_dir,
|
||||||
run one incremental scrape, then assert:
|
run one incremental scrape, then assert:
|
||||||
- archive file paths are unchanged (no parallel channels/ fallbacks)
|
- archive file paths are unchanged (no parallel channels/ fallbacks)
|
||||||
- message counts never shrink
|
- message counts never shrink
|
||||||
|
|
||||||
--channel ID Limit incremental scrape to channel ID (repeatable; requires --target)
|
--channel ID Limit scrape and snapshot/compare to channel ID (repeatable)
|
||||||
|
|
||||||
Requires valid Discord auth (scrape.env, exported DISCORD_TOKEN, or token file).
|
Requires valid Discord auth (scrape.env, exported DISCORD_TOKEN, or token file).
|
||||||
EOF
|
EOF
|
||||||
|
|
@ -52,9 +52,24 @@ target_output_dir() {
|
||||||
' "$CONFIG_PATH"
|
' "$CONFIG_PATH"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Decide whether an archive's channel ID passes the channel filter.
#
# Arguments:
#   $1   channel ID to test
#   $2+  channel IDs that make up the filter (may be absent)
# Returns:
#   0 when no filter IDs are given, or when $1 equals one of them exactly;
#   1 otherwise.
snapshot_channel_allowed() {
  local channel_id=$1
  shift
  local -a filter_ids=("$@")
  local id

  # An empty filter means "no restriction": every channel is allowed.
  ((${#filter_ids[@]} == 0)) && return 0

  for id in "${filter_ids[@]}"; do
    # Both sides are quoted, so this is a literal comparison, not a glob match.
    [[ "$id" == "$channel_id" ]] && return 0
  done
  return 1
}
|
||||||
|
|
||||||
snapshot_archives() {
|
snapshot_archives() {
|
||||||
local output_dir=$1
|
local output_dir=$1
|
||||||
local snapshot_file=$2
|
local snapshot_file=$2
|
||||||
|
shift 2
|
||||||
|
local -a channel_filter=("$@")
|
||||||
local file_path file_name channel_id count
|
local file_path file_name channel_id count
|
||||||
|
|
||||||
: >"$snapshot_file"
|
: >"$snapshot_file"
|
||||||
|
|
@ -65,6 +80,9 @@ snapshot_archives() {
|
||||||
file_name=$(basename "$file_path")
|
file_name=$(basename "$file_path")
|
||||||
if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
|
if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
|
||||||
channel_id=${BASH_REMATCH[1]}
|
channel_id=${BASH_REMATCH[1]}
|
||||||
|
if ! snapshot_channel_allowed "$channel_id" "${channel_filter[@]}"; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
if ! jq empty "$file_path" >/dev/null 2>&1; then
|
if ! jq empty "$file_path" >/dev/null 2>&1; then
|
||||||
printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2
|
printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2
|
||||||
continue
|
continue
|
||||||
|
|
@ -72,7 +90,7 @@ snapshot_archives() {
|
||||||
count=$(jq -r '(.messages | length) // 0' "$file_path")
|
count=$(jq -r '(.messages | length) // 0' "$file_path")
|
||||||
printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file"
|
printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file"
|
||||||
fi
|
fi
|
||||||
done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
|
done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' ! -path '*/.dce-temp/*' -print0 2>/dev/null)
|
||||||
}
|
}
|
||||||
|
|
||||||
compare_snapshots() {
|
compare_snapshots() {
|
||||||
|
|
@ -122,6 +140,7 @@ main() {
|
||||||
local compare_before=""
|
local compare_before=""
|
||||||
local compare_after=""
|
local compare_after=""
|
||||||
local -a channel_args=()
|
local -a channel_args=()
|
||||||
|
local -a channel_ids=()
|
||||||
|
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
|
@ -155,6 +174,7 @@ main() {
|
||||||
--channel)
|
--channel)
|
||||||
[[ $# -ge 2 ]] || die "Missing value for --channel."
|
[[ $# -ge 2 ]] || die "Missing value for --channel."
|
||||||
channel_args+=(--channel "$2")
|
channel_args+=(--channel "$2")
|
||||||
|
channel_ids+=("$2")
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--help|-h)
|
--help|-h)
|
||||||
|
|
@ -187,8 +207,8 @@ main() {
|
||||||
|
|
||||||
if (( snapshot_only )); then
|
if (( snapshot_only )); then
|
||||||
[[ -n "$snapshot_file" ]] || die "--snapshot-file is required with --snapshot-only."
|
[[ -n "$snapshot_file" ]] || die "--snapshot-file is required with --snapshot-only."
|
||||||
snapshot_archives "$output_dir" "$snapshot_file"
|
snapshot_archives "$output_dir" "$snapshot_file" "${channel_ids[@]}"
|
||||||
[[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir"
|
[[ -s "$snapshot_file" ]] || die "No seeded archives found under $output_dir for channel filter."
|
||||||
printf 'Snapshot written: %s\n' "$snapshot_file"
|
printf 'Snapshot written: %s\n' "$snapshot_file"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
@ -197,9 +217,12 @@ main() {
|
||||||
local before_file="$SNAPSHOT_DIR/before.tsv"
|
local before_file="$SNAPSHOT_DIR/before.tsv"
|
||||||
local after_file="$SNAPSHOT_DIR/after.tsv"
|
local after_file="$SNAPSHOT_DIR/after.tsv"
|
||||||
|
|
||||||
snapshot_archives "$output_dir" "$before_file"
|
snapshot_archives "$output_dir" "$before_file" "${channel_ids[@]}"
|
||||||
[[ -s "$before_file" ]] || die "No seeded archives found under $output_dir"
|
[[ -s "$before_file" ]] || die "No seeded archives found under $output_dir for channel filter."
|
||||||
|
|
||||||
|
if ((${#channel_ids[@]} > 0)); then
|
||||||
|
printf 'Channel-scoped proof for %s channel(s).\n' "${#channel_ids[@]}"
|
||||||
|
fi
|
||||||
printf 'Running incremental scrape for target %s...\n' "$target"
|
printf 'Running incremental scrape for target %s...\n' "$target"
|
||||||
local container_config="$CONTAINER_CONFIG"
|
local container_config="$CONTAINER_CONFIG"
|
||||||
case "$CONFIG_PATH" in
|
case "$CONFIG_PATH" in
|
||||||
|
|
@ -208,7 +231,7 @@ main() {
|
||||||
esac
|
esac
|
||||||
"$HOST_RUNNER" scrape --config "$container_config" --target "$target" "${channel_args[@]}"
|
"$HOST_RUNNER" scrape --config "$container_config" --target "$target" "${channel_args[@]}"
|
||||||
|
|
||||||
snapshot_archives "$output_dir" "$after_file"
|
snapshot_archives "$output_dir" "$after_file" "${channel_ids[@]}"
|
||||||
compare_snapshots "$before_file" "$after_file"
|
compare_snapshots "$before_file" "$after_file"
|
||||||
printf 'Append-safe proof passed for target %s.\n' "$target"
|
printf 'Append-safe proof passed for target %s.\n' "$target"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,17 @@ cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
|
||||||
}
|
}
|
||||||
JSON
|
JSON
|
||||||
|
|
||||||
|
cat >"$ARCHIVE_ROOT/demo/Guild - other [333333333333333333].json" <<'JSON'
|
||||||
|
{
|
||||||
|
"guild": {"id": "1", "name": "Guild"},
|
||||||
|
"channel": {"id": "333333333333333333", "name": "other"},
|
||||||
|
"messages": [
|
||||||
|
{"id": "9", "timestamp": "2020-01-01T00:00:00+00:00", "type": "Default", "content": "other"}
|
||||||
|
],
|
||||||
|
"messageCount": 1
|
||||||
|
}
|
||||||
|
JSON
|
||||||
|
|
||||||
printf '{"messages":[\n' >"$ARCHIVE_ROOT/demo/truncated [222222222222222222].json"
|
printf '{"messages":[\n' >"$ARCHIVE_ROOT/demo/truncated [222222222222222222].json"
|
||||||
|
|
||||||
cat >"$CONFIG_PATH" <<JSON
|
cat >"$CONFIG_PATH" <<JSON
|
||||||
|
|
@ -55,6 +66,17 @@ if grep -q '222222222222222222' "$BEFORE"; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
FILTERED="$TMP_DIR/filtered.tsv"
|
||||||
|
DCE_PRIMARY_CONFIG="$CONFIG_PATH" "$PROVE" --target demo --snapshot-only --snapshot-file "$FILTERED" --channel 111111111111111111
|
||||||
|
grep -q '111111111111111111' "$FILTERED" || {
|
||||||
|
printf 'ERROR: channel-filtered snapshot missing target channel\n' >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
if grep -q '333333333333333333' "$FILTERED"; then
|
||||||
|
printf 'ERROR: channel-filtered snapshot should exclude other valid channels\n' >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
|
cat >"$ARCHIVE_ROOT/demo/Guild - general [111111111111111111].json" <<'JSON'
|
||||||
{
|
{
|
||||||
"guild": {"id": "1", "name": "Guild"},
|
"guild": {"id": "1", "name": "Guild"},
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue