mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
fix(scrape): audit and salvage corrupt archive JSON
Add audit-archive-json and salvage-truncated-export helpers, skip invalid JSON during prove snapshots, and wire an audit smoke test into CI.
This commit is contained in:
parent
a0db7aec52
commit
a2aeaaab9c
|
|
@ -263,14 +263,24 @@ Not this:
|
|||
|
||||
**Solutions:**
|
||||
|
||||
1. **Validate the file:**
|
||||
1. **Audit all archives for a target:**
|
||||
```bash
|
||||
./scripts/audit-archive-json.sh --target target-name
|
||||
```
|
||||
|
||||
2. **Validate one file:**
|
||||
```bash
|
||||
jq empty archive-file.json
|
||||
```
|
||||
|
||||
2. **If corrupted, restore from backup** (if available)
|
||||
3. **Truncated export (parse error mid-message):** salvage drops the incomplete tail and keeps earlier messages. A timestamped `.bak.*` backup is created first:
|
||||
```bash
|
||||
./scripts/salvage-truncated-export.sh path/to/export.json
|
||||
```
|
||||
|
||||
3. **If no backup, move the archive aside and re-export:**
|
||||
4. **If corrupted beyond salvage, restore from backup** (if available)
|
||||
|
||||
5. **If no backup, move the archive aside and re-export:**
|
||||
```bash
|
||||
mv archive-file.json archive-file.json.bak
|
||||
./scripts/run-discord-scrape.sh scrape --target target-name
|
||||
|
|
|
|||
1
.github/workflows/main.yml
vendored
1
.github/workflows/main.yml
vendored
|
|
@ -78,6 +78,7 @@ jobs:
|
|||
./scripts/tests/verify-documents-auth-smoke.sh
|
||||
./scripts/tests/scrape-here-smoke.sh
|
||||
./scripts/tests/bootstrap-recurring-scrape-smoke.sh
|
||||
./scripts/tests/audit-archive-json-smoke.sh
|
||||
|
||||
test:
|
||||
# Tests need access to secrets, so we can't run them against PRs because of limited trust
|
||||
|
|
|
|||
42
docs/plans/2026-05-29-018-fix-corrupt-archive-json-plan.md
Normal file
42
docs/plans/2026-05-29-018-fix-corrupt-archive-json-plan.md
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
---
|
||||
title: "fix: Salvage corrupt archive JSON and harden scrape loop"
|
||||
type: fix
|
||||
status: complete
|
||||
date: 2026-05-29
|
||||
origin: LFG — KotOR yes_general export truncated; prove/cron fail on jq parse
|
||||
---
|
||||
|
||||
# fix: Salvage corrupt archive JSON and harden scrape loop
|
||||
|
||||
## Summary
|
||||
|
||||
One KotOR archive JSON is truncated mid-message. Add audit/salvage tooling and make prove/scrape skip or repair invalid files without aborting entire targets.
|
||||
|
||||
## Requirements
|
||||
|
||||
| ID | Requirement |
|
||||
|----|-------------|
|
||||
| R1 | `audit-archive-json.sh` lists invalid JSON per target/output_dir |
|
||||
| R2 | `salvage-truncated-export.sh` backs up and repairs truncated DCE exports |
|
||||
| R3 | `prove-incremental-append.sh` skips invalid JSON with warning (not fatal) |
|
||||
| R4 | Salvaged KotOR file passes `jq empty` and prove for that target |
|
||||
| R5 | Smoke test for audit script |
|
||||
|
||||
## Implementation Units
|
||||
|
||||
### U1. Audit + salvage scripts
|
||||
|
||||
**Files:** `scripts/audit-archive-json.sh`, `scripts/salvage-truncated-export.sh`
|
||||
|
||||
### U2. Prove hardening
|
||||
|
||||
**Files:** `scripts/prove-incremental-append.sh`
|
||||
|
||||
### U3. Repair KotOR file (runtime)
|
||||
|
||||
**File:** `~/Documents/KotOR_discord_msgs/...yes_general [221726893064454144].json`
|
||||
|
||||
## Verification
|
||||
|
||||
- `jq empty` on salvaged file
|
||||
- `prove-incremental-append.sh --target KotOR_discord_msgs`
|
||||
|
|
@ -9,6 +9,7 @@ Use this after cloning or opening the **source** repo (`DiscordChatExporter`, no
|
|||
3. `./scripts/bootstrap-recurring-scrape.sh` — verify archives, build image, preflight Discord.
|
||||
4. `./scripts/run-documents-scrape.sh` — first incremental append-only scrape.
|
||||
5. `./scripts/prove-incremental-append.sh --target <name>` — optional grow-only proof.
|
||||
6. `./scripts/audit-archive-json.sh` — optional; lists invalid JSON before cron runs.
|
||||
|
||||
## Monthly automation
|
||||
|
||||
|
|
|
|||
87
scripts/audit-archive-json.sh
Executable file
87
scripts/audit-archive-json.sh
Executable file
|
|
@ -0,0 +1,87 @@
|
|||
#!/usr/bin/env bash
#
# Validate the JSON syntax of every channel export under the configured scrape
# targets and report the files jq cannot parse.
#
# Environment:
#   DCE_REPO_ROOT    overrides the repository root (default: the parent of the
#                    directory containing this script).
#   DCE_CONFIG_FILE  overrides the targets config (default:
#                    $REPO_ROOT/config/scrape-targets.json).

set -Eeuo pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
TARGET=""   # set by --target; empty means "every enabled target"
FAILURES=0  # count of invalid files, incremented by audit_dir
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<EOF
Usage:
$(basename "$0") [--config PATH] [--target NAME]

Validate JSON syntax for every channel export under configured targets.
Exits 1 when any invalid file is found.
EOF
}
|
||||
|
||||
# Print "ERROR: <message>" to stderr and terminate the script with status 1.
# Arguments: the message words (joined with spaces).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||
|
||||
# Audit every *.json export below one directory.
# Globals:   FAILURES (incremented once per invalid file)
# Arguments: $1 - directory to scan recursively
# Outputs:   one "INVALID<TAB>path" line on stdout per invalid file
# Returns:   0 (also when the directory does not exist)
audit_dir() {
  local output_dir=$1
  local file_path

  # A missing directory is skipped silently: there is nothing to audit.
  [[ -d "$output_dir" ]] || return 0

  while IFS= read -r -d '' file_path; do
    # jq exits 0 on empty input, so a zero-byte export (fully truncated) has to
    # be rejected explicitly before asking jq to parse the file.
    if [[ -s "$file_path" ]] && jq empty "$file_path" >/dev/null 2>&1; then
      continue
    fi
    printf 'INVALID\t%s\n' "$file_path"
    FAILURES=$((FAILURES + 1))
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)
}
|
||||
|
||||
# Entry point: parse options, resolve the directories to audit from the config,
# audit them and report the overall result.
# Globals:   CONFIG_PATH, TARGET (set from options), FAILURES (read)
# Arguments: [--config PATH] [--target NAME] [--help|-h]
# Outputs:   a success line on stdout, or a summary of failures on stderr
# Exits:     1 when any file is invalid or the config cannot be used
main() {
  local output_dir
  local listed_dirs

  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        TARGET=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  command -v jq >/dev/null 2>&1 || die "jq is required."
  [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"

  if [[ -n "$TARGET" ]]; then
    output_dir=$(jq -r --arg name "$TARGET" '.targets[] | select(.name == $name) | .output_dir' "$CONFIG_PATH") \
      || die "Could not read targets from config: $CONFIG_PATH"
    [[ -n "$output_dir" && "$output_dir" != null ]] || die "Unknown target: $TARGET"
    audit_dir "$output_dir"
  else
    # Capture the list first: a jq failure inside a process substitution would
    # go unnoticed and an unreadable config would be reported as "all valid".
    listed_dirs=$(jq -r '.targets[] | select(.enabled != false) | .output_dir' "$CONFIG_PATH") \
      || die "Could not read targets from config: $CONFIG_PATH"
    while IFS= read -r output_dir; do
      # jq -r prints the string "null" for a target without output_dir.
      [[ -n "$output_dir" && "$output_dir" != null ]] || continue
      audit_dir "$output_dir"
    done <<<"$listed_dirs"
  fi

  if (( FAILURES > 0 )); then
    printf '\n%d invalid JSON archive file(s). Run scripts/salvage-truncated-export.sh on each path.\n' "$FAILURES" >&2
    exit 1
  fi

  printf 'All checked archive JSON files are valid.\n'
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -58,6 +58,10 @@ snapshot_archives() {
|
|||
file_name=$(basename "$file_path")
|
||||
if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
|
||||
channel_id=${BASH_REMATCH[1]}
|
||||
if ! jq empty "$file_path" >/dev/null 2>&1; then
|
||||
printf 'WARN: skipping invalid JSON during snapshot: %s\n' "$file_path" >&2
|
||||
continue
|
||||
fi
|
||||
count=$(jq -r '(.messages | length) // 0' "$file_path")
|
||||
printf '%s\t%s\t%s\n' "$file_path" "$channel_id" "$count" >>"$snapshot_file"
|
||||
fi
|
||||
|
|
|
|||
99
scripts/salvage-truncated-export.sh
Executable file
99
scripts/salvage-truncated-export.sh
Executable file
|
|
@ -0,0 +1,99 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -Eeuo pipefail
|
||||
|
||||
# Print the command-line help text to stdout.
usage() {
  cat <<EOF
Usage:
$(basename "$0") PATH [--dry-run]

Repair a truncated DiscordChatExporter JSON export by dropping the incomplete
final message and closing the messages array.

Creates PATH.bak.<timestamp> before modifying PATH.
EOF
}
|
||||
|
||||
# Print "ERROR: <message>" to stderr and terminate the script with status 1.
# Arguments: the message words (joined with spaces).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||
|
||||
# Entry point: salvage a truncated DiscordChatExporter JSON export in place.
#
# The embedded Python walks the "messages" array with an incremental JSON
# parser, keeps every message that decodes completely, drops the incomplete
# tail and re-closes the array and the document. A timestamped
# PATH.bak.<UTC timestamp> copy of the original bytes is written before PATH is
# modified. jq then recomputes messageCount and re-serialises the file.
#
# Arguments: PATH [--dry-run] [--help|-h]  (options may precede PATH)
# Outputs:   progress lines on stdout, errors on stderr
# Exits:     0 when PATH is (or already was) valid JSON, or after a dry run;
#            1 on usage errors, missing tools, or when nothing can be salvaged
main() {
  local export_path=""
  local dry_run=0
  local temp_file
  local count

  while (($#)); do
    case "$1" in
      --dry-run)
        dry_run=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      -?*)
        die "Unknown option: $1"
        ;;
      *)
        # Only one positional argument (PATH) is accepted.
        [[ -z "$export_path" ]] || die "Unknown option: $1"
        export_path=$1
        shift
        ;;
    esac
  done

  [[ -n "$export_path" ]] || {
    usage
    exit 1
  }

  [[ -f "$export_path" ]] || die "File not found: $export_path"
  # Without jq the validity probe below would fail for every file and a
  # perfectly valid export would be treated as corrupt.
  command -v jq >/dev/null 2>&1 || die "jq is required."
  command -v python3 >/dev/null 2>&1 || die "python3 is required."
  # jq exits 0 on empty input; a zero-byte file has nothing to salvage.
  [[ -s "$export_path" ]] || die "File is empty, nothing to salvage: $export_path"

  if jq empty "$export_path" >/dev/null 2>&1; then
    printf 'Already valid JSON: %s\n' "$export_path"
    exit 0
  fi

  python3 - "$export_path" "$dry_run" <<'PY' || die "Could not salvage: $export_path"
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

path = Path(sys.argv[1])
dry_run = sys.argv[2] == "1"
data = path.read_bytes()

# surrogateescape round-trips undecodable bytes (for example a multi-byte
# character cut in half by the truncation), so the kept prefix stays byte-exact.
text = data.decode("utf-8", errors="surrogateescape")

array_start = re.search(r'"messages"\s*:\s*\[', text)
if array_start is None:
    print('ERROR: could not find the "messages" array', file=sys.stderr)
    sys.exit(1)

decoder = json.JSONDecoder()
blanks = re.compile(r"[ \t\r\n]*")
pos = array_start.end()
keep_end = -1  # offset just past the last message that decoded completely
kept = 0
while True:
    pos = blanks.match(text, pos).end()
    if text[pos:pos + 1] != "{":
        break
    try:
        _, end = decoder.raw_decode(text, pos)
    except ValueError:
        # The message starting at pos is incomplete or malformed: stop here.
        break
    keep_end = end
    kept += 1
    pos = blanks.match(text, end).end()
    if text[pos:pos + 1] != ",":
        break
    pos += 1

if keep_end < 0:
    print("ERROR: could not find a safe message boundary to truncate", file=sys.stderr)
    sys.exit(1)

suffix = f'\n  ],\n  "messageCount": {kept}\n}}\n'
out = text[:keep_end].encode("utf-8", errors="surrogateescape") + suffix.encode("ascii")

if dry_run:
    print(f"Would salvage {path} ({len(data)} -> {len(out)} bytes)")
    sys.exit(0)

stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
backup = path.with_name(f"{path.name}.bak.{stamp}")
backup.write_bytes(data)
path.write_bytes(out)
print(f"Backup: {backup}")
print(f"Salvaged: {path} ({len(data)} -> {len(out)} bytes)")
PY

  # A dry run leaves PATH untouched, so there is nothing to validate or rewrite.
  if ((dry_run)); then
    return 0
  fi

  jq empty "$export_path" >/dev/null 2>&1 || die "Salvage did not produce valid JSON."

  # The X's must be the last characters of the template for BSD/macOS mktemp.
  temp_file=$(mktemp "${TMPDIR:-/tmp}/dce-salvage.XXXXXX") || die "mktemp failed."
  if ! jq '.messageCount = (.messages | length)' "$export_path" >"$temp_file"; then
    rm -f -- "$temp_file"
    die "Could not update messageCount in: $export_path"
  fi
  # Copy the content back instead of moving the temp file over PATH, so the
  # export keeps its own permissions rather than mktemp's 0600.
  if ! cat -- "$temp_file" >"$export_path"; then
    rm -f -- "$temp_file"
    die "Could not write: $export_path"
  fi
  rm -f -- "$temp_file"

  count=$(jq -r '(.messages | length) // 0' "$export_path")
  printf 'Valid JSON with %s messages.\n' "$count"
}
|
||||
|
||||
main "$@"
|
||||
59
scripts/tests/audit-archive-json-smoke.sh
Executable file
59
scripts/tests/audit-archive-json-smoke.sh
Executable file
|
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env bash
#
# Smoke test for scripts/audit-archive-json.sh.
# Builds a throwaway config with one valid and one truncated export, then
# checks the audit's exit status and output for a clean target, a broken
# target, and the all-targets mode.

set -Eeuo pipefail

REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-audit-smoke.XXXXXX")
ARCHIVE_ROOT="$TMP_DIR/archive"
CONFIG_PATH="$TMP_DIR/config.json"
AUDIT="$REPO_ROOT/scripts/audit-archive-json.sh"

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT

# Report a failed expectation and abort the smoke test.
fail() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

mkdir -p "$ARCHIVE_ROOT/good" "$ARCHIVE_ROOT/bad"

cat >"$ARCHIVE_ROOT/good/valid [111].json" <<'JSON'
{"guild":{"id":"1","name":"g"},"channel":{"id":"111","name":"c"},"messages":[],"messageCount":0}
JSON

# An export cut off right after the opening of the messages array.
printf '{"messages":[\n' >"$ARCHIVE_ROOT/bad/truncated [222].json"

cat >"$CONFIG_PATH" <<JSON
{
  "archive_root": "$ARCHIVE_ROOT",
  "targets": [
    {
      "name": "demo",
      "kind": "guild",
      "output_dir": "$ARCHIVE_ROOT/good",
      "enabled": true
    },
    {
      "name": "broken",
      "kind": "guild",
      "output_dir": "$ARCHIVE_ROOT/bad",
      "enabled": true
    }
  ]
}
JSON

# A target holding only valid JSON must pass (set -e aborts otherwise).
DCE_CONFIG_FILE="$CONFIG_PATH" "$AUDIT" --target demo

# A target holding a truncated export must fail and name the file.
if broken_output=$(DCE_CONFIG_FILE="$CONFIG_PATH" "$AUDIT" --target broken 2>&1); then
  fail 'audit should fail for target with invalid JSON'
fi
if ! grep -q 'INVALID' <<<"$broken_output"; then
  fail 'audit output missing INVALID marker'
fi
if ! grep -qF 'truncated [222].json' <<<"$broken_output"; then
  fail 'audit output missing the invalid file path'
fi

# Without --target every enabled target is audited: the run must fail because
# of the broken target, without flagging the valid export.
if all_output=$(DCE_CONFIG_FILE="$CONFIG_PATH" "$AUDIT" 2>&1); then
  fail 'audit should fail when any enabled target has invalid JSON'
fi
if ! grep -qF 'truncated [222].json' <<<"$all_output"; then
  fail 'all-targets audit output missing the invalid file path'
fi
if grep -qF "$ARCHIVE_ROOT/good/" <<<"$all_output"; then
  fail 'all-targets audit flagged a valid export'
fi

printf 'audit-archive-json-smoke: OK\n'
|
||||
Loading…
Reference in a new issue