diff --git a/docs/plans/2026-06-03-042-fix-salvage-stale-temp-exports-plan.md b/docs/plans/2026-06-03-042-fix-salvage-stale-temp-exports-plan.md
new file mode 100644
index 00000000..fbc6cbd7
--- /dev/null
+++ b/docs/plans/2026-06-03-042-fix-salvage-stale-temp-exports-plan.md
@@ -0,0 +1,91 @@
+---
+title: "fix: Salvage stale temp exports before re-downloading"
+type: fix
+status: active
+date: 2026-06-03
+origin: /lfg — yes_general re-downloads 514 MB because prior aborted run's temp data is never recovered
+---
+
+# fix: Salvage stale temp exports before re-downloading
+
+## Problem
+
+`scrape_target()` always creates a fresh temp directory and starts `export_channel_incremental` from `last_message_id(archive)`. When a previous run crashes (OOM, abort, kill), the partially downloaded temp export is orphaned under `.dce-temp/export.<channel_id>.*` and is never cleaned up or reused.
+
+For `yes_general` (channel `221726893064454144`):
+- Archive last message: **2021-01-17** (`800354246440648745`), 264K messages, 312 MB
+- Stale temp export from May 29: **514 MB** of truncated JSON (messages 2021→mid-2026)
+- Every re-run downloads all of those messages again from scratch
+
+The `salvage-truncated-export.sh` script already exists but is never called automatically.
+
+## Requirements
+
+| ID | Requirement |
+|----|-------------|
+| R1 | Before exporting a channel, `scrape_target` checks for orphaned temp dirs matching `.dce-temp/export.<channel_id>.*` |
+| R2 | If an orphaned temp export contains truncated JSON, salvage it to valid JSON using the same logic as `salvage-truncated-export.sh` |
+| R3 | If salvage succeeds, merge the recovered messages into the archive (same merge_exports + commit_merged_export path) |
+| R4 | Clean up stale temp dirs after salvage (success or failure) |
+| R5 | After salvage-merge, `last_message_id` returns the advanced ID so the incremental only fetches truly new messages |
+| R6 | If salvage fails (can't find a safe truncation point), delete the stale temp and proceed normally with a full incremental |
+| R7 | Existing 19 smokes + new salvage smoke pass |
+
+## Files
+
+- `scripts/run-discord-scrape.sh` — add `salvage_stale_temp_exports()` called at top of per-channel loop in `scrape_target()`
+- `scripts/tests/run-discord-scrape-smoke.sh` — add `salvage-stale` smoke: seed a truncated temp export, run scrape, verify messages are merged and `--after` advances
+
+## Implementation
+
+### `salvage_stale_temp_exports()`
+
+```
+salvage_stale_temp_exports(output_dir, channel_id):
+  glob = output_dir/.dce-temp/export.<channel_id>.*/export.json
+  for each stale_export matching glob:
+    if jq empty succeeds → already valid JSON
+    else → run inline python salvage (same as salvage-truncated-export.sh)
+      if salvage fails → rm -rf stale_dir, continue
+
+    validate channel identity
+    if archive exists:
+      merge_exports(archive, stale_export, temp_merged)
+      commit_merged_export(archive, temp_merged)
+      log "SALVAGED" with message counts
+    else:
+      mv stale_export → archive destination
+    rm -rf stale_dir
+```
+
+Called in `scrape_target()` before line 963 (`after_id=$(last_message_id ...)`), so the salvaged data is already in the archive when `--after` is computed.
+
+### Smoke test
+
+1. 
Seed archive with 2 messages (existing fixture)
+2. Create a fake stale `.dce-temp/export.<channel_id>.STALE/export.json` with truncated JSON containing message id "3"
+3. Run scrape in append mode
+4. Verify archive has 3+ messages (salvaged + incremental)
+5. Verify stale temp dir is cleaned up
+
+## Test scenarios
+
+| Scenario | Expected |
+|----------|----------|
+| Stale temp with truncated JSON → salvageable | Messages merged, temp cleaned, `--after` advances |
+| Stale temp with unsalvageable data (too short) | Temp deleted, normal incremental proceeds |
+| Stale temp with valid JSON (complete export) | Merged directly, temp cleaned |
+| No stale temps | Normal behavior, no change |
+| Multiple stale temps for same channel | All salvaged in order, then normal incremental |
+
+## Verification
+
+```bash
+./scripts/tests/run-discord-scrape-smoke.sh
+DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
+```
+
+## Out of scope
+
+- Configurable skip-vs-retry for OOM channels (separate concern)
+- Increasing container memory limits
diff --git a/scripts/run-discord-scrape.sh b/scripts/run-discord-scrape.sh
index 7d4bbf44..0c36ebd1 100755
--- a/scripts/run-discord-scrape.sh
+++ b/scripts/run-discord-scrape.sh
@@ -549,6 +549,124 @@ message_count() {
   jq -r '(.messages | length) // 0' "$export_path"
 }
 
+# Repair a possibly truncated export JSON file in place.
+#   $1  path to the export; rewritten only when it does not already parse
+# Returns 0 when the file is, or has been made, valid JSON; 1 otherwise.
+salvage_truncated_json() {
+  local export_path=$1
+  # Already parseable: nothing to repair.
+  if jq empty "$export_path" >/dev/null 2>&1; then
+    return 0
+  fi
+  command -v python3 >/dev/null 2>&1 || return 1
+  # Cut the file back to the last "}," that is followed by a new object, then
+  # append a closing array bracket, a zero messageCount and the root brace.
+  # NOTE(review): marker and suffix whitespace must match the exporter's
+  # pretty-printed layout; confirm against a real export.
+  python3 - "$export_path" <<'PY' || return 1
+import sys
+from pathlib import Path
+
+path = Path(sys.argv[1])
+data = path.read_bytes()
+marker = b"},\n {"
+idx = data.rfind(marker)
+if idx < 0:
+    sys.exit(1)
+
+truncated = data[: idx + 1]
+suffix = b'\n ],\n "messageCount": 0\n}'
+path.write_bytes(truncated + suffix)
+PY
+  jq empty "$export_path" >/dev/null 2>&1 || return 1
+  # Best effort: make messageCount match the recovered messages. The scratch
+  # file sits beside the export and its template ends in X's, so BSD mktemp
+  # randomises it and the rename stays on one filesystem instead of staging a
+  # large export in $TMPDIR.
+  local temp_file
+  temp_file=$(mktemp "${export_path}.fix.XXXXXX") || return 0
+  if jq '.messageCount = (.messages | length)' "$export_path" >"$temp_file" 2>/dev/null; then
+    mv -f "$temp_file" "$export_path" || rm -f "$temp_file"
+  else
+    rm -f "$temp_file"
+  fi
+  # The file is valid JSON here even when the count fix-up did not apply.
+  return 0
+}
+
+# Recover orphaned temp exports left under .dce-temp by earlier aborted runs.
+#   $1  output_dir        directory that holds .dce-temp
+#   $2  channel_id        only export.<channel_id>.* directories are examined
+#   $3  destination_path  archive to merge into, or to create when absent
+# Every examined directory is removed afterwards, recovered or not.
+salvage_stale_temp_exports() {
+  local output_dir=$1
+  local channel_id=$2
+  local destination_path=$3
+
+  local stale_dirs stale_dir stale_export salvage_merged
+  local stale_channel_id salvage_count before_count after_count
+  # Sorted so several leftovers are handled in a repeatable order.
+  mapfile -t stale_dirs < <(
+    find "$output_dir/.dce-temp" -maxdepth 1 -type d -name "export.${channel_id}.*" 2>/dev/null | sort || true
+  )
+
+  (( ${#stale_dirs[@]} > 0 )) || return 0
+
+  for stale_dir in "${stale_dirs[@]}"; do
+    stale_export="$stale_dir/export.json"
+    # Nothing to recover from a missing or empty export.
+    [[ -f "$stale_export" && -s "$stale_export" ]] || { rm -rf "$stale_dir"; continue; }
+
+    if ! salvage_truncated_json "$stale_export"; then
+      log " Stale temp export unsalvageable, discarding: $stale_dir"
+      rm -rf "$stale_dir"
+      continue
+    fi
+
+    # Only a positive mismatch is rejected; an export whose channel id cannot
+    # be read is still accepted.
+    stale_channel_id=$(channel_id_from_export "$stale_export" 2>/dev/null) || true
+    if [[ -n "$stale_channel_id" && "$stale_channel_id" != "$channel_id" ]]; then
+      log " Stale temp export wrong channel ($stale_channel_id != $channel_id), discarding: $stale_dir"
+      rm -rf "$stale_dir"
+      continue
+    fi
+
+    # An unreadable count is treated as nothing to recover.
+    salvage_count=$(message_count "$stale_export" 2>/dev/null) || salvage_count=0
+    if (( salvage_count == 0 )); then
+      rm -rf "$stale_dir"
+      continue
+    fi
+
+    if [[ -n "$destination_path" && -f "$destination_path" ]]; then
+      salvage_merged="$stale_dir/merged.json"
+      before_count=$(message_count "$destination_path")
+      # Report SALVAGED only once the merge has been validated and committed.
+      if merge_exports "$destination_path" "$stale_export" "$salvage_merged" \
+        && [[ -s "$salvage_merged" ]] \
+        && jq empty "$salvage_merged" >/dev/null 2>&1 \
+        && commit_merged_export "$destination_path" "$salvage_merged"; then
+        after_count=$(message_count "$destination_path")
+        log " SALVAGED $destination_path (+$((after_count - before_count)) messages from stale temp, $before_count → $after_count)"
+      else
+        log " Stale temp export could not be merged, discarding: $stale_dir"
+      fi
+    elif [[ -n "$destination_path" ]]; then
+      if mkdir -p "$(dirname "$destination_path")" && cp "$stale_export" "$destination_path"; then
+        log " SALVAGED $destination_path (${salvage_count} messages from stale temp, new archive)"
+      else
+        # Do not leave a partial copy behind to be mistaken for an archive.
+        rm -f "$destination_path"
+        log " Stale temp export could not be copied, discarding: $stale_dir"
+      fi
+    fi
+
+    rm -rf "$stale_dir"
+  done
+}
+
 is_skippable_channel_export_failure() {
   local log_file=$1
   grep -qiE \
@@ -960,8 +1078,13 @@ scrape_target() {
       guild_label=$(guild_label_from_export "$destination_path")
     fi
 
-    after_id=$(last_message_id "$destination_path")
     mkdir -p "$output_dir/.dce-temp"
+    salvage_stale_temp_exports "$output_dir" "$channel_id" "$destination_path"
+
+    if [[ -n "$destination_path" && -f "$destination_path" ]]; then
+      before_count=$(message_count "$destination_path")
+    fi
+    after_id=$(last_message_id "$destination_path")
     temp_dir=$(mktemp -d "$output_dir/.dce-temp/export.${channel_id}.XXXXXX")
     temp_export="$temp_dir/export.json"
     temp_merged="$temp_dir/merged.json"
diff --git a/scripts/tests/run-discord-scrape-smoke.sh b/scripts/tests/run-discord-scrape-smoke.sh
index 9aa37202..5a026404 100755
--- a/scripts/tests/run-discord-scrape-smoke.sh
+++ b/scripts/tests/run-discord-scrape-smoke.sh
@@ -134,6 +134,14 @@ cat >"$CONFIG_PATH" <&2; exit 1; }
 grep -q 'SKIPPED.*134' "$SKIP_ABORT_LOG" || { echo "expected SKIPPED line for abort channel 134" >&2; exit 1; }
+# Salvage stale temp export smoke
+mkdir -p "$ARCHIVE_ROOT/salvage-stale"
+cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
+mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-meta"
+printf '{\"111\":\"%s\"}\n' "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME" >"$ARCHIVE_ROOT/salvage-stale/.dce-meta/channel-map.json"
+mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE"
+cp "$FIXTURE_DIR/salvage-truncated.json" "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE/export.json"
+SALVAGE_LOG="$TMP_DIR/salvage-stale.log"
+run_wrapper salvage-stale append 2>"$SALVAGE_LOG"
+SALVAGE_DEST="$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
+SALVAGE_COUNT=$(jq -r '.messages | length' "$SALVAGE_DEST")
+(( SALVAGE_COUNT >= 3 )) || { echo 
"expected salvage-stale archive to have at least 3 messages (got $SALVAGE_COUNT)" >&2; exit 1; } +jq -e '.messages[] | select(.id == "3")' "$SALVAGE_DEST" >/dev/null || { echo "expected salvaged message id 3 in archive" >&2; exit 1; } +[[ ! -d "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE" ]] || { echo "expected stale temp dir cleaned up after salvage" >&2; exit 1; } +grep -q 'SALVAGED' "$SALVAGE_LOG" || { echo "expected SALVAGED line in salvage log" >&2; exit 1; } + # shellcheck disable=SC1091 source "$REPO_ROOT/scripts/run-discord-scrape.sh" SHRINK_EXISTING="$TMP_DIR/shrink-existing.json" diff --git a/scripts/tests/test-fixtures/salvage-truncated.json b/scripts/tests/test-fixtures/salvage-truncated.json new file mode 100644 index 00000000..798dc49f --- /dev/null +++ b/scripts/tests/test-fixtures/salvage-truncated.json @@ -0,0 +1,20 @@ +{ + "guild": { + "id": "222", + "name": "Fixture Guild" + }, + "channel": { + "id": "111", + "name": "fixture-room", + "category": "Testing Grounds" + }, + "messages": [ + { + "id": "3", + "timestamp": "2026-01-03T00:00:00Z", + "content": "third" + }, + { + "id": "4", + "timestamp": "2026-01-04T00:00:00Z", + "content": "fourth - this message is trun \ No newline at end of file