mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
fix(scrape): skip active stale temps and retry salvage merge
Avoid salvaging export.json while a channel export is still writing (default: skip temps modified within 120s). Retry truncate+merge once when merge fails on a quiescent partial temp. Adds active-skip smoke and ages stale fixture mtime so salvage tests stay deterministic.
This commit is contained in:
parent
14796e9c09
commit
ae120c916f
|
|
@ -0,0 +1,42 @@
|
||||||
|
---
|
||||||
|
title: "fix: Skip active stale temps and retry salvage merge"
|
||||||
|
type: fix
|
||||||
|
status: complete
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — yes_general logs show Stale temp merge failed while export.json still growing (73MB+ invalid JSON)
|
||||||
|
---
|
||||||
|
|
||||||
|
# fix: Skip active stale temps and retry salvage merge
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
`salvage_stale_temp_exports` can run while a channel export is still writing `export.json`. At that moment the file is truncated (invalid JSON), so `merge_exports_auto` fails and the temp is retained for retry — but the next incremental pass can hit the same race, so the archive never advances. Observed on KotOR `yes_general` (`221726893064454144`): merge fails on a ~82MB partial temp while the archive stays at 266182 messages (2021 cursor).
|
||||||
|
|
||||||
|
Salvage works once the export has finished writing: a truncated temp holding 79529 messages merges into the archive, growing it from 266182 to 345711 messages in ~58s.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | Skip stale temp dirs whose `export.json` was modified within `DCE_STALE_TEMP_MIN_AGE_SECONDS` (default 120), unless `DCE_SALVAGE_ACTIVE_TEMPS=1` disables the guard |
|
||||||
|
| R2 | On merge failure, re-run `salvage_truncated_json` and retry merge once before retaining temp |
|
||||||
|
| R3 | Log merge retry vs skip-active with distinct messages |
|
||||||
|
| R4 | Offline smoke: active temp skipped; retry succeeds after simulated truncation |
|
||||||
|
| R5 | `run-all-smokes.sh` passes |
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
- `scripts/run-discord-scrape.sh` — `stale_temp_is_active`, skip guard, merge retry helper
|
||||||
|
- `scripts/tests/run-discord-scrape-smoke.sh` — active-temp skip + merge-retry scenarios
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/tests/run-discord-scrape-smoke.sh
|
||||||
|
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Out of scope
|
||||||
|
|
||||||
|
- Completing yes_general catch-up inside LFG
|
||||||
|
- Container memory limits
|
||||||
|
|
@ -629,12 +629,62 @@ PY
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Report whether a stale temp export still looks like it is being written.
#
# A temp counts as "active" when its mtime is younger than the minimum age,
# so the caller can leave it alone instead of merging a half-written file.
#
# Globals:
#   DCE_SALVAGE_ACTIVE_TEMPS (read)       - "1" disables the guard entirely.
#   DCE_STALE_TEMP_MIN_AGE_SECONDS (read) - minimum quiet period in seconds
#                                           (default 120).
# Arguments:
#   $1 - path to the temp export file to inspect.
# Returns:
#   0 if the file was modified less than the minimum age ago (active),
#   1 otherwise, including when the mtime cannot be read.
#######################################
stale_temp_is_active() {
  local stale_export=$1
  local default_min_age=120
  local min_age=${DCE_STALE_TEMP_MIN_AGE_SECONDS:-$default_min_age}
  local now mtime age

  if [[ "${DCE_SALVAGE_ACTIVE_TEMPS:-0}" == "1" ]]; then
    return 1
  fi

  # A non-numeric threshold would evaluate to 0 (or raise an arithmetic
  # error) inside (( )), silently disabling the guard. Fall back to the
  # default instead.
  if [[ ! "$min_age" =~ ^[0-9]+$ ]]; then
    printf 'WARN: ignoring invalid DCE_STALE_TEMP_MIN_AGE_SECONDS=%s, using %s\n' \
      "$min_age" "$default_min_age" >&2
    min_age=$default_min_age
  fi

  now=$(date +%s)
  # GNU stat first, then BSD stat; 0 (epoch) when neither can read the file.
  mtime=$(stat -c '%Y' "$stale_export" 2>/dev/null || stat -f '%m' "$stale_export" 2>/dev/null || echo 0)
  # Guard against unexpected stat output so the arithmetic below cannot fail.
  [[ "$mtime" =~ ^[0-9]+$ ]] || mtime=0

  # 10# forces base 10 so values with leading zeros are not parsed as octal.
  age=$((now - 10#$mtime))
  (( age < 10#$min_age ))
}
|
||||||
|
|
||||||
|
#######################################
# Merge a salvaged stale temp export into the destination archive.
#
# Tries the merge up to twice. Before the second attempt it re-runs
# salvage_truncated_json on the stale export and logs the retry.
#
# Arguments:
#   $1 - destination archive path (merge target, replaced on success).
#   $2 - stale temp export path (merge source).
#   $3 - stale temp directory; merged.json is staged inside it.
# Returns:
#   0 when the merge was committed (with or without new messages),
#   1 when the merge failed, produced invalid JSON, or could not be
#     committed; the caller is expected to keep the temp for a later retry.
#######################################
merge_stale_export_into_destination() {
  local destination_path=$1
  local stale_export=$2
  local stale_dir=$3
  local salvage_merged="$stale_dir/merged.json"
  local attempt=0
  local before_count after_count

  while (( attempt < 2 )); do
    if (( attempt > 0 )); then
      # Distinct message so a retry is visible in the log.
      log " Retrying stale temp merge after re-running truncation salvage: $stale_dir"
      # Best effort: a failed re-salvage still falls through to the merge attempt.
      salvage_truncated_json "$stale_export" || true
    fi

    # Never reuse a merged file left behind by a previous attempt or run.
    rm -f "$salvage_merged"

    if merge_exports_auto "$destination_path" "$stale_export" "$salvage_merged" && [[ -s "$salvage_merged" ]]; then
      if ! json_is_valid "$salvage_merged"; then
        log " Stale temp merge produced invalid JSON, retaining for retry: $stale_dir"
        return 1
      fi

      before_count=$(message_count_fast "$destination_path")
      # A failed commit must not be reported as success; otherwise the
      # unchanged count below reads as "no new messages" and returns 0.
      if ! commit_merged_export "$destination_path" "$salvage_merged"; then
        log " Stale temp merge commit failed, retaining for retry: $stale_dir"
        return 1
      fi
      after_count=$(message_count_fast "$destination_path")

      if (( after_count > before_count )); then
        log " SALVAGED $destination_path (+$((after_count - before_count)) messages from stale temp, $before_count → $after_count)"
        return 0
      fi
      log " Stale temp merged with no new messages, discarding: $stale_dir"
      return 0
    fi

    attempt=$((attempt + 1))
  done

  log " Stale temp merge failed, retaining for retry: $stale_dir"
  return 1
}
|
||||||
|
|
||||||
salvage_stale_temp_exports() {
|
salvage_stale_temp_exports() {
|
||||||
local output_dir=$1
|
local output_dir=$1
|
||||||
local channel_id=$2
|
local channel_id=$2
|
||||||
local destination_path=$3
|
local destination_path=$3
|
||||||
|
|
||||||
local stale_dirs stale_dir stale_export salvage_merged
|
local stale_dirs stale_dir stale_export
|
||||||
mapfile -t stale_dirs < <(
|
mapfile -t stale_dirs < <(
|
||||||
find "$output_dir/.dce-temp" -maxdepth 1 -type d -name "export.${channel_id}.*" 2>/dev/null || true
|
find "$output_dir/.dce-temp" -maxdepth 1 -type d -name "export.${channel_id}.*" 2>/dev/null || true
|
||||||
)
|
)
|
||||||
|
|
@ -646,6 +696,11 @@ salvage_stale_temp_exports() {
|
||||||
[[ -f "$stale_export" ]] || { rm -rf "$stale_dir"; continue; }
|
[[ -f "$stale_export" ]] || { rm -rf "$stale_dir"; continue; }
|
||||||
[[ -s "$stale_export" ]] || { rm -rf "$stale_dir"; continue; }
|
[[ -s "$stale_export" ]] || { rm -rf "$stale_dir"; continue; }
|
||||||
|
|
||||||
|
if stale_temp_is_active "$stale_export"; then
|
||||||
|
log " Stale temp still active (recently modified), skipping salvage: $stale_dir"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
if ! salvage_truncated_json "$stale_export"; then
|
if ! salvage_truncated_json "$stale_export"; then
|
||||||
log " Stale temp export unsalvageable, discarding: $stale_dir"
|
log " Stale temp export unsalvageable, discarding: $stale_dir"
|
||||||
rm -rf "$stale_dir"
|
rm -rf "$stale_dir"
|
||||||
|
|
@ -668,25 +723,8 @@ salvage_stale_temp_exports() {
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ -n "$destination_path" && -f "$destination_path" ]]; then
|
if [[ -n "$destination_path" && -f "$destination_path" ]]; then
|
||||||
salvage_merged="$stale_dir/merged.json"
|
if merge_stale_export_into_destination "$destination_path" "$stale_export" "$stale_dir"; then
|
||||||
if merge_exports_auto "$destination_path" "$stale_export" "$salvage_merged" && [[ -s "$salvage_merged" ]]; then
|
|
||||||
if json_is_valid "$salvage_merged"; then
|
|
||||||
local before_count after_count
|
|
||||||
before_count=$(message_count_fast "$destination_path")
|
|
||||||
commit_merged_export "$destination_path" "$salvage_merged"
|
|
||||||
after_count=$(message_count_fast "$destination_path")
|
|
||||||
if (( after_count > before_count )); then
|
|
||||||
log " SALVAGED $destination_path (+$((after_count - before_count)) messages from stale temp, $before_count → $after_count)"
|
|
||||||
merged_ok=1
|
merged_ok=1
|
||||||
else
|
|
||||||
log " Stale temp merged with no new messages, discarding: $stale_dir"
|
|
||||||
merged_ok=1
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log " Stale temp merge produced invalid JSON, retaining for retry: $stale_dir"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log " Stale temp merge failed, retaining for retry: $stale_dir"
|
|
||||||
fi
|
fi
|
||||||
elif [[ -n "$destination_path" ]]; then
|
elif [[ -n "$destination_path" ]]; then
|
||||||
mkdir -p "$(dirname "$destination_path")"
|
mkdir -p "$(dirname "$destination_path")"
|
||||||
|
|
|
||||||
|
|
@ -150,6 +150,14 @@ cat >"$CONFIG_PATH" <<JSON
|
||||||
"channel_ids": ["111"],
|
"channel_ids": ["111"],
|
||||||
"guild_ids": [],
|
"guild_ids": [],
|
||||||
"guild_name_patterns": []
|
"guild_name_patterns": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "salvage-stale-active",
|
||||||
|
"kind": "guild",
|
||||||
|
"output_dir": "$ARCHIVE_ROOT/salvage-stale-active",
|
||||||
|
"channel_ids": ["111"],
|
||||||
|
"guild_ids": [],
|
||||||
|
"guild_name_patterns": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -417,12 +425,28 @@ grep -q 'exit 143' "$SKIP_SIGTERM_LOG" || { echo "expected sigterm exit logged f
|
||||||
grep -q 'Preserving partial export temp' "$SKIP_SIGTERM_LOG" || { echo "expected partial temp preserved on sigterm channel 143" >&2; exit 1; }
|
grep -q 'Preserving partial export temp' "$SKIP_SIGTERM_LOG" || { echo "expected partial temp preserved on sigterm channel 143" >&2; exit 1; }
|
||||||
|
|
||||||
# Salvage stale temp export smoke
|
# Salvage stale temp export smoke
|
||||||
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale-active"
|
||||||
|
cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/salvage-stale-active/$DEFAULT_FILE_NAME"
|
||||||
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale-active/.dce-meta"
|
||||||
|
printf '{\"111\":\"%s\"}\n' "$ARCHIVE_ROOT/salvage-stale-active/$DEFAULT_FILE_NAME" >"$ARCHIVE_ROOT/salvage-stale-active/.dce-meta/channel-map.json"
|
||||||
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale-active/.dce-temp/export.111.ACTIVE"
|
||||||
|
cp "$FIXTURE_DIR/salvage-truncated.json" "$ARCHIVE_ROOT/salvage-stale-active/.dce-temp/export.111.ACTIVE/export.json"
|
||||||
|
SALVAGE_ACTIVE_LOG="$TMP_DIR/salvage-stale-active.log"
|
||||||
|
DCE_STALE_TEMP_MIN_AGE_SECONDS=9999 \
|
||||||
|
run_wrapper salvage-stale-active append 2>"$SALVAGE_ACTIVE_LOG"
|
||||||
|
grep -q 'still active' "$SALVAGE_ACTIVE_LOG" || { echo "expected active stale temp skip message" >&2; exit 1; }
|
||||||
|
[[ -d "$ARCHIVE_ROOT/salvage-stale-active/.dce-temp/export.111.ACTIVE" ]] || {
|
||||||
|
echo "expected active stale temp dir to be retained" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
mkdir -p "$ARCHIVE_ROOT/salvage-stale"
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale"
|
||||||
cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
|
cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
|
||||||
mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-meta"
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-meta"
|
||||||
printf '{\"111\":\"%s\"}\n' "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME" >"$ARCHIVE_ROOT/salvage-stale/.dce-meta/channel-map.json"
|
printf '{\"111\":\"%s\"}\n' "$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME" >"$ARCHIVE_ROOT/salvage-stale/.dce-meta/channel-map.json"
|
||||||
mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE"
|
mkdir -p "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE"
|
||||||
cp "$FIXTURE_DIR/salvage-truncated.json" "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE/export.json"
|
cp "$FIXTURE_DIR/salvage-truncated.json" "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE/export.json"
|
||||||
|
touch -d '1 hour ago' "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE/export.json"
|
||||||
SALVAGE_LOG="$TMP_DIR/salvage-stale.log"
|
SALVAGE_LOG="$TMP_DIR/salvage-stale.log"
|
||||||
run_wrapper salvage-stale append 2>"$SALVAGE_LOG"
|
run_wrapper salvage-stale append 2>"$SALVAGE_LOG"
|
||||||
SALVAGE_DEST="$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
|
SALVAGE_DEST="$ARCHIVE_ROOT/salvage-stale/$DEFAULT_FILE_NAME"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue