fix(scrape): audit and salvage corrupt archive JSON

Add audit-archive-json and salvage-truncated-export helpers, skip invalid
JSON during prove snapshots, and wire an audit smoke test into CI.
This commit is contained in:
Boden 2026-05-29 14:09:46 -05:00
parent a0db7aec52
commit a2aeaaab9c
8 changed files with 306 additions and 3 deletions

View file

@ -263,14 +263,24 @@ Not this:
**Solutions:**
1. **Validate the file:**
1. **Audit all archives for a target:**
```bash
./scripts/audit-archive-json.sh --target target-name
```
2. **Validate one file:**
```bash
jq empty archive-file.json
```
2. **If corrupted, restore from backup** (if available)
3. **Truncated export (parse error mid-message):** salvage drops the incomplete tail and keeps earlier messages. A timestamped `.bak.*` backup is created first:
```bash
./scripts/salvage-truncated-export.sh path/to/export.json
```
3. **If no backup, move the archive aside and re-export:**
4. **If corrupted beyond salvage, restore from backup** (if available)
5. **If no backup, move the archive aside and re-export:**
```bash
mv archive-file.json archive-file.json.bak
./scripts/run-discord-scrape.sh scrape --target target-name

View file

@ -78,6 +78,7 @@ jobs:
./scripts/tests/verify-documents-auth-smoke.sh
./scripts/tests/scrape-here-smoke.sh
./scripts/tests/bootstrap-recurring-scrape-smoke.sh
./scripts/tests/audit-archive-json-smoke.sh
test:
# Tests need access to secrets, so we can't run them against PRs because of limited trust

View file

@ -0,0 +1,42 @@
---
title: "fix: Salvage corrupt archive JSON and harden scrape loop"
type: fix
status: complete
date: 2026-05-29
origin: LFG — KotOR yes_general export truncated; prove/cron fail on jq parse
---
# fix: Salvage corrupt archive JSON and harden scrape loop
## Summary
One KotOR archive JSON is truncated mid-message. Add audit/salvage tooling and make prove/scrape skip or repair invalid files without aborting entire targets.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | `audit-archive-json.sh` lists invalid JSON per target/output_dir |
| R2 | `salvage-truncated-export.sh` backs up and repairs truncated DCE exports |
| R3 | `prove-incremental-append.sh` skips invalid JSON with warning (not fatal) |
| R4 | Salvaged KotOR file passes `jq empty` and prove for that target |
| R5 | Smoke test for audit script |
## Implementation Units
### U1. Audit + salvage scripts
**Files:** `scripts/audit-archive-json.sh`, `scripts/salvage-truncated-export.sh`
### U2. Prove hardening
**Files:** `scripts/prove-incremental-append.sh`
### U3. Repair KotOR file (runtime)
**File:** `~/Documents/KotOR_discord_msgs/...yes_general [221726893064454144].json`
## Verification
- `jq empty` on salvaged file
- `prove-incremental-append.sh --target KotOR_discord_msgs`

View file

@ -9,6 +9,7 @@ Use this after cloning or opening the **source** repo (`DiscordChatExporter`, no
3. `./scripts/bootstrap-recurring-scrape.sh` — verify archives, build image, preflight Discord.
4. `./scripts/run-documents-scrape.sh` — first incremental append-only scrape.
5. `./scripts/prove-incremental-append.sh --target <name>` — optional grow-only proof.
6. `./scripts/audit-archive-json.sh` — optional; lists invalid JSON before cron runs.
## Monthly automation

87
scripts/audit-archive-json.sh Executable file
View file

@ -0,0 +1,87 @@
#!/usr/bin/env bash
# Audit archive exports: report every channel export JSON file under the
# configured scrape targets that fails to parse.
#
# Environment:
#   DCE_REPO_ROOT    overrides the repository root (default: parent of this
#                    script's directory)
#   DCE_CONFIG_FILE  overrides the targets config path
#                    (default: <repo root>/config/scrape-targets.json)

# Strict mode: abort on unhandled errors (-e), unset variables (-u) and failed
# pipeline stages (pipefail); -E makes ERR traps apply inside functions too.
set -Eeuo pipefail
# Physical (symlink-resolved) directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
# Repository root; defaults to the parent of SCRIPT_DIR.
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
# Targets config read by main; may be replaced again by --config.
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
# Target name selected with --target; empty means "every enabled target".
TARGET=""
# Running count of invalid files, incremented by audit_dir.
FAILURES=0
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage summary on stdout
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    "$(basename "$0") [--config PATH] [--target NAME]" \
    'Validate JSON syntax for every channel export under configured targets.' \
    'Exits 1 when any invalid file is found.'
}
#######################################
# Report a fatal error and terminate the script.
# Arguments: $@ - message words, joined with single spaces
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  local message="$*"
  printf 'ERROR: %s\n' "$message" >&2
  exit 1
}
#######################################
# Check every *.json export below one output directory and report the ones
# that are not usable JSON documents.
# Globals:   FAILURES (incremented once per invalid file)
# Arguments: $1 - output directory of a scrape target
# Outputs:   one "INVALID<TAB><path>" line per bad file on stdout
# Returns:   0 always; a directory that does not exist is treated as having
#            nothing to audit
#######################################
audit_dir() {
  local output_dir=$1
  local file_path

  [[ -d "$output_dir" ]] || return 0

  # NUL-delimited so paths with spaces, brackets or newlines survive intact.
  # Files under .dce-meta/ are excluded from the audit.
  while IFS= read -r -d '' file_path; do
    # `jq empty` exits 0 when the input holds no JSON value at all, so a
    # zero-byte export would otherwise pass; require a non-empty file as well.
    if [[ -s "$file_path" ]] && jq empty "$file_path" >/dev/null 2>&1; then
      continue
    fi
    printf 'INVALID\t%s\n' "$file_path"
    FAILURES=$((FAILURES + 1))
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
}
#######################################
# Parse options, resolve the output directories to audit from the targets
# config, audit each one and summarise the result.
# Globals:   CONFIG_PATH (read; replaced by --config)
#            TARGET      (read; replaced by --target)
#            FAILURES    (read; updated by audit_dir)
# Arguments: [--config PATH] [--target NAME] [--help|-h]
# Outputs:   audit_dir's INVALID lines and a success message on stdout;
#            errors and the failure summary on stderr
# Returns:   exits 1 on bad usage, missing jq/config, unreadable config,
#            unknown target, or when any invalid file was found
#######################################
main() {
  local output_dir output_dirs

  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        TARGET=$2
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  command -v jq >/dev/null 2>&1 || die "jq is required."
  [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"

  # Capture the directory list before looping. Feeding the loop straight from
  # a process substitution would discard jq's exit status, so an unreadable or
  # malformed config would end in a false "all valid" result.
  # `// empty` drops targets that have no output_dir instead of yielding the
  # literal string "null".
  if [[ -n "$TARGET" ]]; then
    output_dirs=$(jq -r --arg name "$TARGET" \
      '.targets[] | select(.name == $name) | .output_dir // empty' "$CONFIG_PATH") \
      || die "Could not read targets from config: $CONFIG_PATH"
    [[ -n "$output_dirs" ]] || die "Unknown target: $TARGET"
  else
    output_dirs=$(jq -r \
      '.targets[] | select(.enabled != false) | .output_dir // empty' "$CONFIG_PATH") \
      || die "Could not read targets from config: $CONFIG_PATH"
  fi

  # One directory per line; a here-string keeps the loop in this shell so the
  # FAILURES updates made by audit_dir are visible below.
  while IFS= read -r output_dir; do
    [[ -n "$output_dir" ]] || continue
    audit_dir "$output_dir"
  done <<<"$output_dirs"

  if ((FAILURES > 0)); then
    printf '\n%d invalid JSON archive file(s). Run scripts/salvage-truncated-export.sh on each path.\n' "$FAILURES" >&2
    exit 1
  fi
  printf 'All checked archive JSON files are valid.\n'
}
# Script entry point: hand all command-line arguments to main unchanged.
main "$@"

View file

@ -58,6 +58,10 @@ snapshot_archives() {
file_name=$(basename "$file_path")
if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
channel_id=${BASH_REMATCH[1]}
if ! jq empty "$file_path" >/dev/null 2>&1; then
printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2
continue
fi
count=$(jq -r '(.messages | length) // 0' "$file_path")
printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file"
fi

View file

@ -0,0 +1,99 @@
#!/usr/bin/env bash
# Repair a truncated DiscordChatExporter JSON export in place: the incomplete
# tail is dropped, the messages array is closed, and a timestamped backup of
# the original file is written first. Uses python3 and jq.

# Strict mode: abort on unhandled errors (-e), unset variables (-u) and failed
# pipeline stages (pipefail); -E makes ERR traps apply inside functions too.
set -Eeuo pipefail
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage summary on stdout
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    "$(basename "$0") PATH [--dry-run]" \
    'Repair a truncated DiscordChatExporter JSON export by dropping the incomplete' \
    'final message and closing the messages array.' \
    'Creates PATH.bak.<timestamp> before modifying PATH.'
}
#######################################
# Abort the script with an error message.
# Arguments: $@ - message words, joined with single spaces
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  {
    printf 'ERROR: %s\n' "$*"
  } >&2
  exit 1
}
#######################################
# Salvage one truncated export: cut the file back to the last complete
# message, close the messages array and root object, then recompute
# messageCount with jq.
# Arguments: PATH [--dry-run] [--help|-h]  (flags may precede or follow PATH)
# Outputs:   progress on stdout, errors on stderr
# Returns:   exits 0 when the file is already valid, after --help, after a
#            successful dry run or a successful salvage; exits non-zero on
#            bad usage, missing tools, or when no safe cut point exists
#######################################
main() {
  local export_path=""
  local dry_run=0
  local temp_file count

  while (($#)); do
    case "$1" in
      --help | -h)
        # Recognised in any position, including as the first argument.
        usage
        exit 0
        ;;
      --dry-run)
        dry_run=1
        ;;
      *)
        # The first remaining argument is the export path; extras are errors.
        [[ -z "$export_path" ]] || die "Unknown option: $1"
        export_path=$1
        ;;
    esac
    shift
  done

  [[ -n "$export_path" ]] || {
    usage
    exit 1
  }

  [[ -f "$export_path" ]] || die "File not found: $export_path"
  # `jq empty` accepts a zero-byte file, so reject that case explicitly rather
  # than calling it "already valid".
  [[ -s "$export_path" ]] || die "Nothing to salvage, file is empty: $export_path"
  command -v python3 >/dev/null 2>&1 || die "python3 is required."
  # Without jq every file would look invalid below and a healthy export would
  # be truncated, so check for it before touching anything.
  command -v jq >/dev/null 2>&1 || die "jq is required."

  if jq empty "$export_path" >/dev/null 2>&1; then
    printf 'Already valid JSON: %s\n' "$export_path"
    exit 0
  fi

  python3 - "$export_path" "$dry_run" <<'PY' || exit
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

path = Path(sys.argv[1])
dry_run = sys.argv[2] == "1"
data = path.read_bytes()

# Closes the messages array and the root object. messageCount is only a
# placeholder here; the calling shell recomputes it with jq.
suffix = b'\n ],\n "messageCount": 0\n}'
# What must follow a "}," for it to separate two array elements: a line break,
# optional indentation, then the next opening brace. The indentation width is
# deliberately not assumed.
after_comma = re.compile(rb"\r?\n[ \t]*\{")
# Upper bound on full-document parses; boundaries nested inside the broken
# final message are rejected by the parse and the scan moves further back.
max_attempts = 32

out = None
search_end = len(data)
attempts = 0
while attempts < max_attempts:
    idx = data.rfind(b"},", 0, search_end)
    if idx < 0:
        break
    search_end = idx
    if after_comma.match(data, idx + 2) is None:
        continue
    attempts += 1
    # data[: idx + 1] ends on the closing brace of the element before the cut.
    candidate = data[: idx + 1] + suffix
    try:
        json.loads(candidate)
    except ValueError:
        continue
    out = candidate
    break

if out is None:
    print("ERROR: could not find a safe message boundary to truncate", file=sys.stderr)
    sys.exit(1)

if dry_run:
    print(f"Would salvage {path} ({len(data)} -> {len(out)} bytes)")
    sys.exit(0)

stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
backup = path.with_suffix(path.suffix + f".bak.{stamp}")
# Back up the original bytes before the export is overwritten.
backup.write_bytes(data)
path.write_bytes(out)
print(f"Backup: {backup}")
print(f"Salvaged: {path} ({len(data)} -> {len(out)} bytes)")
PY

  # A dry run leaves the export untouched (and therefore still invalid), so
  # stop here instead of failing the post-salvage validation below.
  if ((dry_run)); then
    return 0
  fi

  jq empty "$export_path" >/dev/null 2>&1 || die "Salvage did not produce valid JSON."

  # Temp file sits next to the export (same filesystem, so the final mv is a
  # rename) and uses a trailing-X template, which every mktemp accepts.
  temp_file=$(mktemp "${export_path}.tmp.XXXXXX") \
    || die "Could not create a temporary file next to: $export_path"
  if ! jq '.messageCount = (.messages | length)' "$export_path" >"$temp_file"; then
    rm -f -- "$temp_file"
    die "Could not recompute messageCount for: $export_path"
  fi
  mv -f -- "$temp_file" "$export_path"

  count=$(jq -r '(.messages | length) // 0' "$export_path")
  printf 'Valid JSON with %s messages.\n' "$count"
}
# Script entry point: hand all command-line arguments to main unchanged.
main "$@"

View file

@ -0,0 +1,59 @@
#!/usr/bin/env bash
# Smoke test for scripts/audit-archive-json.sh.
# Builds a throwaway archive with one valid and one truncated export, then
# checks that the audit passes for the healthy target and fails, with an
# INVALID marker in its output, for the broken one.
set -Eeuo pipefail

REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-audit-smoke.XXXXXX")
ARCHIVE_ROOT="$TMP_DIR/archive"
CONFIG_PATH="$TMP_DIR/config.json"
AUDIT="$REPO_ROOT/scripts/audit-archive-json.sh"

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT

# Print a test failure on stderr and stop with status 1.
fail() {
  printf 'ERROR: %s\n' "$1" >&2
  exit 1
}

# Fixtures: a well-formed export and one cut off inside the messages array.
mkdir -p "$ARCHIVE_ROOT/good" "$ARCHIVE_ROOT/bad"
printf '%s\n' \
  '{"guild":{"id":"1","name":"g"},"channel":{"id":"111","name":"c"},"messages":[],"messageCount":0}' \
  >"$ARCHIVE_ROOT/good/valid [111].json"
printf '{"messages":[\n' >"$ARCHIVE_ROOT/bad/truncated [222].json"

# Config with one target per fixture directory (unquoted delimiter so the
# archive paths are expanded into the JSON).
cat >"$CONFIG_PATH" <<JSON
{
"archive_root": "$ARCHIVE_ROOT",
"targets": [
{
"name": "demo",
"kind": "guild",
"output_dir": "$ARCHIVE_ROOT/good",
"enabled": true
},
{
"name": "broken",
"kind": "guild",
"output_dir": "$ARCHIVE_ROOT/bad",
"enabled": true
}
]
}
JSON

# Healthy target: the audit must succeed (strict mode aborts the test if not).
DCE_CONFIG_FILE="$CONFIG_PATH" "$AUDIT" --target demo

# Broken target: running the audit as an `if` condition keeps its expected
# non-zero status from tripping errexit while still capturing the output.
if broken_output=$(DCE_CONFIG_FILE="$CONFIG_PATH" "$AUDIT" --target broken 2>&1); then
  fail 'audit should fail for target with invalid JSON'
fi

if [[ "$broken_output" != *INVALID* ]]; then
  fail 'audit output missing INVALID marker'
fi

printf 'audit-archive-json-smoke: OK\n'