diff --git a/.docs/Recurring-Scrape-Troubleshooting.md b/.docs/Recurring-Scrape-Troubleshooting.md index 91ddc351..22957018 100644 --- a/.docs/Recurring-Scrape-Troubleshooting.md +++ b/.docs/Recurring-Scrape-Troubleshooting.md @@ -263,14 +263,24 @@ Not this: **Solutions:** -1. **Validate the file:** +1. **Audit all archives for a target:** + ```bash + ./scripts/audit-archive-json.sh --target target-name + ``` + +2. **Validate one file:** ```bash jq empty archive-file.json ``` -2. **If corrupted, restore from backup** (if available) +3. **Truncated export (parse error mid-message):** salvage drops the incomplete tail and keeps earlier messages. A timestamped `.bak.*` backup is created first: + ```bash + ./scripts/salvage-truncated-export.sh path/to/export.json + ``` -3. **If no backup, move the archive aside and re-export:** +4. **If corrupted beyond salvage, restore from backup** (if available) + +5. **If no backup, move the archive aside and re-export:** ```bash mv archive-file.json archive-file.json.bak ./scripts/run-discord-scrape.sh scrape --target target-name diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f76ac9a7..e2368f94 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -78,6 +78,7 @@ jobs: ./scripts/tests/verify-documents-auth-smoke.sh ./scripts/tests/scrape-here-smoke.sh ./scripts/tests/bootstrap-recurring-scrape-smoke.sh + ./scripts/tests/audit-archive-json-smoke.sh test: # Tests need access to secrets, so we can't run them against PRs because of limited trust diff --git a/docs/plans/2026-05-29-018-fix-corrupt-archive-json-plan.md b/docs/plans/2026-05-29-018-fix-corrupt-archive-json-plan.md new file mode 100644 index 00000000..027885d3 --- /dev/null +++ b/docs/plans/2026-05-29-018-fix-corrupt-archive-json-plan.md @@ -0,0 +1,42 @@ +--- +title: "fix: Salvage corrupt archive JSON and harden scrape loop" +type: fix +status: complete +date: 2026-05-29 +origin: LFG — KotOR yes_general export truncated; 
prove/cron fail on jq parse +--- + +# fix: Salvage corrupt archive JSON and harden scrape loop + +## Summary + +One KotOR archive JSON is truncated mid-message. Add audit/salvage tooling and make prove/scrape skip or repair invalid files without aborting entire targets. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `audit-archive-json.sh` lists invalid JSON per target/output_dir | +| R2 | `salvage-truncated-export.sh` backs up and repairs truncated DCE exports | +| R3 | `prove-incremental-append.sh` skips invalid JSON with warning (not fatal) | +| R4 | Salvaged KotOR file passes `jq empty` and prove for that target | +| R5 | Smoke test for audit script | + +## Implementation Units + +### U1. Audit + salvage scripts + +**Files:** `scripts/audit-archive-json.sh`, `scripts/salvage-truncated-export.sh` + +### U2. Prove hardening + +**Files:** `scripts/prove-incremental-append.sh` + +### U3. Repair KotOR file (runtime) + +**File:** `~/Documents/KotOR_discord_msgs/...yes_general [221726893064454144].json` + +## Verification + +- `jq empty` on salvaged file +- `prove-incremental-append.sh --target KotOR_discord_msgs` diff --git a/docs/recurring-scrape-operator-checklist.md b/docs/recurring-scrape-operator-checklist.md index 7ee2002c..5edccd80 100644 --- a/docs/recurring-scrape-operator-checklist.md +++ b/docs/recurring-scrape-operator-checklist.md @@ -9,6 +9,7 @@ Use this after cloning or opening the **source** repo (`DiscordChatExporter`, no 3. `./scripts/bootstrap-recurring-scrape.sh` — verify archives, build image, preflight Discord. 4. `./scripts/run-documents-scrape.sh` — first incremental append-only scrape. 5. `./scripts/prove-incremental-append.sh --target ` — optional grow-only proof. +6. `./scripts/audit-archive-json.sh` — optional; lists invalid JSON before cron runs. 
# Audit every archive JSON file below one target output directory.
#
# Globals:
#   FAILURES (read/written) - incremented once for each invalid file found.
# Arguments:
#   $1 - directory to scan recursively. A path that is not a directory is
#        skipped silently (nothing has been exported there yet).
# Outputs:
#   One "INVALID<TAB><path>" line on stdout for every *.json file that is
#   zero bytes long or that jq cannot parse. Anything under a .dce-meta/
#   directory is ignored.
audit_dir() {
  local output_dir=$1
  local file_path

  [[ -d "$output_dir" ]] || return 0

  while IFS= read -r -d '' file_path; do
    # jq exits 0 when it is given no input at all, so a zero-byte archive
    # (e.g. an export interrupted before the first write) has to be rejected
    # explicitly; otherwise it would be reported as valid.
    if [[ -s "$file_path" ]] && jq empty "$file_path" >/dev/null 2>&1; then
      continue
    fi
    printf 'INVALID\t%s\n' "$file_path"
    FAILURES=$((FAILURES + 1))
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
}
# Entry point: salvage one truncated JSON export in place.
#
# The incomplete tail after the last complete top-level message is dropped,
# the array and object are closed, and messageCount is recomputed with jq.
#
# Arguments (options and PATH may appear in any order):
#   PATH        export file to repair (required)
#   --dry-run   print the byte counts a salvage would produce; change nothing
#   --help|-h   print usage and exit 0
# Outputs:
#   Progress on stdout, errors on stderr.
# Exit status:
#   0 when the file is already valid, was salvaged, or a dry run succeeded;
#   non-zero on bad arguments, missing tools, or when no message boundary is
#   found.
main() {
  local export_path=""
  local dry_run=0

  # Parse everything before trusting any argument as the path, so that
  # `--help` or `--dry-run` given first is not mistaken for a file name.
  while (($#)); do
    case "$1" in
      --dry-run)
        dry_run=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      -*)
        die "Unknown option: $1"
        ;;
      *)
        [[ -z "$export_path" ]] || die "Unknown option: $1"
        export_path=$1
        shift
        ;;
    esac
  done

  [[ -n "$export_path" ]] || {
    usage
    exit 1
  }

  [[ -f "$export_path" ]] || die "File not found: $export_path"
  command -v python3 >/dev/null 2>&1 || die "python3 is required."
  command -v jq >/dev/null 2>&1 || die "jq is required."

  # A zero-byte file makes `jq empty` succeed (no input, no error), so it
  # must not be reported as "already valid".
  if [[ -s "$export_path" ]] && jq empty "$export_path" >/dev/null 2>&1; then
    printf 'Already valid JSON: %s\n' "$export_path"
    exit 0
  fi

  python3 - "$export_path" "$dry_run" <<'PY' || exit
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

path = Path(sys.argv[1])
dry_run = sys.argv[2] == "1"
data = path.read_bytes()

# Learn the newline style and indentation of top-level message objects from
# the file itself instead of hard-coding them. Nested objects (embeds,
# attachments, ...) sit at a deeper indent, so "}," + this exact
# newline/indent + "{" only matches the boundary between two messages.
first = re.search(rb'"messages"[ \t]*:[ \t]*\[[ \t]*(\r?\n[ \t]*)\{', data)
idx = data.rfind(b"}," + first.group(1) + b"{") if first else -1
if idx < 0:
    print("ERROR: could not find a safe message boundary to truncate", file=sys.stderr)
    sys.exit(1)

# idx + 1 keeps the closing brace of the last complete message and drops the
# rest. messageCount is a placeholder here; the caller recomputes it with jq.
out = data[: idx + 1] + b'\n  ],\n  "messageCount": 0\n}'

if dry_run:
    print(f"Would salvage {path} ({len(data)} -> {len(out)} bytes)")
    sys.exit(0)

stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
backup = path.with_name(f"{path.name}.bak.{stamp}")
backup.write_bytes(data)
path.write_bytes(out)
print(f"Backup: {backup}")
print(f"Salvaged: {path} ({len(data)} -> {len(out)} bytes)")
PY

  # A dry run leaves the file untouched (and therefore still invalid), so the
  # post-salvage validation below must not run.
  if ((dry_run)); then
    return 0
  fi

  jq empty "$export_path" >/dev/null 2>&1 || die "Salvage did not produce valid JSON."

  # Keep the temp file beside the export so the final mv is a same-filesystem
  # rename, and keep the X's at the end of the template, which every mktemp
  # implementation accepts.
  local temp_file
  temp_file=$(mktemp "${export_path}.salvage.XXXXXX") \
    || die "Could not create a temporary file next to: $export_path"
  if ! jq '.messageCount = (.messages | length)' "$export_path" >"$temp_file"; then
    rm -f -- "$temp_file"
    die "Could not recompute messageCount for: $export_path"
  fi
  mv -f -- "$temp_file" "$export_path"

  local count
  count=$(jq -r '(.messages | length) // 0' "$export_path")
  printf 'Valid JSON with %s messages.\n' "$count"
}
&& pwd -P) +TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-audit-smoke.XXXXXX") +ARCHIVE_ROOT="$TMP_DIR/archive" +CONFIG_PATH="$TMP_DIR/config.json" +AUDIT="$REPO_ROOT/scripts/audit-archive-json.sh" + +cleanup() { + rm -rf "$TMP_DIR" +} +trap cleanup EXIT + +mkdir -p "$ARCHIVE_ROOT/good" "$ARCHIVE_ROOT/bad" + +cat >"$ARCHIVE_ROOT/good/valid [111].json" <<'JSON' +{"guild":{"id":"1","name":"g"},"channel":{"id":"111","name":"c"},"messages":[],"messageCount":0} +JSON + +printf '{"messages":[\n' >"$ARCHIVE_ROOT/bad/truncated [222].json" + +cat >"$CONFIG_PATH" <&1) +broken_status=$? +set -e +if [[ "$broken_status" -eq 0 ]]; then + printf 'ERROR: audit should fail for target with invalid JSON\n' >&2 + exit 1 +fi +if ! grep -q 'INVALID' <<<"$broken_output"; then + printf 'ERROR: audit output missing INVALID marker\n' >&2 + exit 1 +fi + +printf 'audit-archive-json-smoke: OK\n'