diff --git a/docs/plans/2026-06-04-054-feat-salvage-only-subcommand-plan.md b/docs/plans/2026-06-04-054-feat-salvage-only-subcommand-plan.md new file mode 100644 index 00000000..3c7ad98c --- /dev/null +++ b/docs/plans/2026-06-04-054-feat-salvage-only-subcommand-plan.md @@ -0,0 +1,65 @@ +--- +title: "feat: Salvage-only mode for stale temp exports" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — yes_general has 154MB+ active partial temp while full-target validation runs; operator needs to merge stale temps without re-downloading years of history after stopping a run +--- + +# feat: Salvage-only mode for stale temp exports + +## Summary + +Add a `salvage` subcommand that merges quiescent `.dce-temp` exports into archives without calling Discord, wired through the host runner and `run-documents-scrape.sh --salvage-only`. + +## Problem Frame + +After stopping a long-running or OOM-aborted export, operators must advance the archive cursor from preserved partial temps. Today salvage only runs at the start of a full `scrape`, which re-fetches from the archive `--after` cursor and can repeat multi-year catch-up on `yes_general`. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `run-discord-scrape.sh salvage` merges stale temps per resolved channel; no Discord CLI export calls | +| R2 | Salvage mode does not require `DISCORD_TOKEN` | +| R3 | Existing `DCE_SALVAGE_ACTIVE_TEMPS=1` and `DCE_STALE_TEMP_MIN_AGE_SECONDS` env behavior applies | +| R4 | `run-discord-scrape-host.sh salvage` acquires archive-root lock and runs salvage locally (no compose/token) | +| R5 | `run-documents-scrape.sh --salvage-only` skips preflight/scrape and invokes host salvage | +| R6 | Smoke covers salvage subcommand; `run-all-smokes.sh` passes | + +## Key Technical Decisions + +- **Local host execution for salvage**: Avoids compose/token requirements; salvage is filesystem-only. 
+- **Reuse `salvage_stale_temp_exports`**: Same merge path as scrape preamble; no duplicate logic. + +## Implementation Units + +### U1. Core salvage subcommand + +**Goal:** `salvage_only_target` + `run_target_mode salvage` without token gate. + +**Requirements:** R1–R3 + +**Files:** `scripts/run-discord-scrape.sh`, `scripts/tests/run-discord-scrape-smoke.sh` + +### U2. Host and documents wiring + +**Goal:** Operator entry points for salvage-only. + +**Requirements:** R4–R5 + +**Files:** `scripts/run-discord-scrape-host.sh`, `scripts/run-documents-scrape.sh`, `scripts/tests/documents-scrape-smoke.sh` + +### U3. Smoke gate + +**Requirements:** R6 + +**Verification:** `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` + +## Scope Boundaries + +### Deferred + +- Killing stale validation on host +- Live yes_general catch-up inside LFG +- Container memory tuning diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index cee8c0da..7cc7be98 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -27,6 +27,7 @@ usage() { Usage: $(basename "$0") preflight [run-discord-scrape options...] $(basename "$0") scrape [run-discord-scrape options...] + $(basename "$0") salvage [run-discord-scrape options...] Options: --env-file PATH Env file to load and pass to compose. 
#######################################
# Run the `salvage` subcommand of run-discord-scrape.sh directly on the host,
# pinned to the host-side config file.
#
# Every `--config VALUE` pair found in the passthrough arguments is dropped so
# that the only config the child receives is the one given as $1. All other
# arguments are forwarded unchanged and in their original order. A trailing
# `--config` with no value is dropped as well.
#
# Globals:
#   SCRIPT_DIR (read) - directory that contains run-discord-scrape.sh
# Arguments:
#   $1   - config path passed to the child via --config and exported to it as
#          DCE_PRIMARY_CONFIG and DCE_FALLBACK_CONFIG
#   $2.. - passthrough arguments
# Returns:
#   The exit status of run-discord-scrape.sh.
#######################################
run_local_salvage() {
  local host_config=$1
  shift
  # `local -a` applies to every name on its line, so the scalar flag and the
  # loop variable are declared separately from the argument array.
  local -a local_args=()
  local skip_next=0 arg

  for arg in "$@"; do
    if (( skip_next )); then
      # This is the value that followed --config; discard it too.
      skip_next=0
      continue
    fi
    if [[ "$arg" == "--config" ]]; then
      skip_next=1
      continue
    fi
    local_args+=("$arg")
  done

  # The ${arr[@]+...} form expands to nothing for an empty array, which keeps
  # this call working when nounset is active on bash releases before 4.4.
  DCE_PRIMARY_CONFIG="$host_config" \
    DCE_FALLBACK_CONFIG="$host_config" \
    "$SCRIPT_DIR/run-discord-scrape.sh" salvage --config "$host_config" \
    ${local_args[@]+"${local_args[@]}"}
}
fi passthrough_args+=("$1") shift @@ -575,7 +598,9 @@ main() { fi [[ -f "$COMPOSE_FILE" ]] || die "Missing compose file: $COMPOSE_FILE" - prepare_compose_env + if [[ "$subcommand" != "salvage" ]]; then + prepare_compose_env + fi REAUTH_COMMAND="${DCE_REAUTH_COMMAND:-}" run_disk_preflight_if_enabled "${passthrough_args[@]}" @@ -599,6 +624,10 @@ main() { acquire_scrape_lock "$host_config" run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" ;; + salvage) + acquire_scrape_lock "$host_config" + run_local_salvage "$host_config" "${passthrough_args[@]}" + ;; esac } diff --git a/scripts/run-discord-scrape.sh b/scripts/run-discord-scrape.sh index 8824127b..d861afdf 100755 --- a/scripts/run-discord-scrape.sh +++ b/scripts/run-discord-scrape.sh @@ -15,6 +15,7 @@ usage() { Usage: run-discord-scrape.sh scrape [options] run-discord-scrape.sh preflight [options] + run-discord-scrape.sh salvage [options] run-discord-scrape.sh list-targets [--config PATH] run-discord-scrape.sh help run-discord-scrape.sh @@ -22,6 +23,7 @@ Usage: Subcommands: scrape Incrementally export channels into append-only JSON files. preflight Validate token/config/target resolution without writing archives. + salvage Merge quiescent stale .dce-temp exports into archives (no Discord export). list-targets Print configured targets from the scrape config. help Show this help text. @@ -1156,6 +1158,54 @@ preflight_target() { die "Target '$target_name' failed preflight: every resolved channel is inaccessible and no seeded archives exist under $output_dir." 
#######################################
# Salvage stale temp exports for every channel of one configured target.
#
# Reads the target's name and output directory from its JSON description,
# resolves the target's channels, and for each channel hands the output
# directory, channel id and resolved archive path to
# salvage_stale_temp_exports. The archive's message count is sampled before
# and after that call so the log can report how many messages were appended.
#
# Arguments:
#   $1 - target definition as JSON (the .name and .output_dir keys are read)
#   $2 - defaults JSON, forwarded to resolve_target_channels
# Outputs:
#   Progress lines via log.
# Returns:
#   Terminates through die when no channels resolve or when an existing
#   archive is not valid JSON.
#######################################
salvage_only_target() {
  local target_json=$1 defaults_json=$2
  local target_name output_dir destination_path channel_id
  local before_count after_count
  local -a channel_ids=()

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  mkdir -p "$output_dir"
  bootstrap_channel_map_from_archives "$output_dir"

  mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json")
  (( ${#channel_ids[@]} > 0 )) || die "Target '$target_name' resolved no channels."

  log "Target '$target_name': salvaging stale temp exports for ${#channel_ids[@]} channel(s) under $output_dir (no Discord export)."
  log "  Server scope: $(describe_target_resolution "$target_json")"

  for channel_id in "${channel_ids[@]}"; do
    # Baseline is zero unless an archive file already exists for the channel.
    before_count=0
    destination_path=$(resolve_destination_path "$output_dir" "$channel_id")
    if [[ -n "$destination_path" && -f "$destination_path" ]]; then
      if ! jq empty "$destination_path" >/dev/null 2>&1; then
        die "Existing export is not valid JSON: $destination_path"
      fi
      assert_export_channel_identity "$destination_path" "$channel_id"
      before_count=$(message_count "$destination_path")
    fi

    mkdir -p "$output_dir/.dce-temp"
    salvage_stale_temp_exports "$output_dir" "$channel_id" "$destination_path"

    # Without an archive file at the resolved path there is nothing to count.
    if [[ -z "$destination_path" || ! -f "$destination_path" ]]; then
      log "  No archive path for channel $channel_id; salvage skipped or created nothing mergeable."
      continue
    fi

    after_count=$(message_count "$destination_path")
    if (( after_count <= before_count )); then
      log "  No salvage merge for channel $channel_id."
    else
      log "  Salvage appended $((after_count - before_count)) messages for channel $channel_id ($before_count → $after_count)."
    fi
  done

  log "Target '$target_name': salvage completed."
}
Incremental scrape (append-only merges into existing JSON files) Options: - --dry-run Verify archives only; do not call Discord + --dry-run Verify archives only; do not call Discord + --salvage-only Merge quiescent stale .dce-temp exports only (no Discord export) --target NAME Limit preflight/scrape to one configured target --channel ID With exactly one --target, limit scrape to channel ID (repeatable) --config PATH Scrape target config (default: config/scrape-targets.json) @@ -40,6 +41,7 @@ die() { main() { local dry_run=0 + local salvage_only=0 local target="" local -a passthrough=() @@ -49,6 +51,10 @@ main() { dry_run=1 shift ;; + --salvage-only) + salvage_only=1 + shift + ;; --target) [[ $# -ge 2 ]] || die "Missing value for --target." target=$2 @@ -91,10 +97,22 @@ main() { "$VERIFY_READY" --disk-only --config "$CONFIG_PATH" - if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then - "$SETUP_AUTH" 2>/dev/null || true - elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then - "$SETUP_AUTH" 2>/dev/null || true + if (( salvage_only == 1 )); then + local -a salvage_args=(--config "$CONFIG_PATH") + local skip_next=0 arg + for arg in "${passthrough[@]}"; do + if (( skip_next )); then + skip_next=0 + continue + fi + if [[ "$arg" == "--config" ]]; then + skip_next=1 + continue + fi + salvage_args+=("$arg") + done + "$HOST_RUNNER" salvage "${salvage_args[@]}" + exit 0 fi local -a container_args=("${passthrough[@]}") @@ -117,6 +135,12 @@ main() { container_args=(--config "$CONTAINER_CONFIG" "${container_args[@]}") fi + if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then + "$SETUP_AUTH" 2>/dev/null || true + elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then + "$SETUP_AUTH" 2>/dev/null || true + fi + "$HOST_RUNNER" preflight "${container_args[@]}" "$HOST_RUNNER" scrape "${container_args[@]}" } diff --git a/scripts/tests/documents-scrape-smoke.sh b/scripts/tests/documents-scrape-smoke.sh index 
c55f1912..c7070266 100755 --- a/scripts/tests/documents-scrape-smoke.sh +++ b/scripts/tests/documents-scrape-smoke.sh @@ -46,7 +46,7 @@ DCE_REPO_ROOT="$FAKE_REPO" \ ARCHIVE="$TMP_DIR/server" mkdir -p "$ARCHIVE" -printf '{"messages":[{"id":"1","timestamp":"2020-01-01T00:00:00"}]}\n' >"$ARCHIVE/Guild - general [111111111111111111].json" +printf '{"guild":{"id":"1","name":"Guild"},"channel":{"id":"111111111111111111","name":"general"},"messages":[{"id":"1","timestamp":"2020-01-01T00:00:00"}]}\n' >"$ARCHIVE/Guild - general [111111111111111111].json" cat >"$TMP_DIR/config.json" <"$SALVAGE_DOC_LOG" 2>&1 || { + echo "salvage-only documents scrape failed" >&2 + cat "$SALVAGE_DOC_LOG" >&2 + exit 1 +} +grep -q 'salvage completed' "$SALVAGE_DOC_LOG" || { + echo "expected --salvage-only to run local salvage" >&2 + cat "$SALVAGE_DOC_LOG" >&2 + exit 1 +} + DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \ "$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \ | grep -q 'disk-only: ok' diff --git a/scripts/tests/run-discord-scrape-smoke.sh b/scripts/tests/run-discord-scrape-smoke.sh index 5f97d86c..35211848 100755 --- a/scripts/tests/run-discord-scrape-smoke.sh +++ b/scripts/tests/run-discord-scrape-smoke.sh @@ -158,6 +158,14 @@ cat >"$CONFIG_PATH" </dev/null || { echo "e [[ ! 
-d "$ARCHIVE_ROOT/salvage-stale/.dce-temp/export.111.STALE" ]] || { echo "expected stale temp dir cleaned up after salvage" >&2; exit 1; } grep -q 'SALVAGED' "$SALVAGE_LOG" || { echo "expected SALVAGED line in salvage log" >&2; exit 1; } +mkdir -p "$ARCHIVE_ROOT/salvage-only" +cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/salvage-only/$DEFAULT_FILE_NAME" +mkdir -p "$ARCHIVE_ROOT/salvage-only/.dce-meta" +printf '{\"111\":\"%s\"}\n' "$ARCHIVE_ROOT/salvage-only/$DEFAULT_FILE_NAME" >"$ARCHIVE_ROOT/salvage-only/.dce-meta/channel-map.json" +mkdir -p "$ARCHIVE_ROOT/salvage-only/.dce-temp/export.111.ONLYSTALE" +cp "$FIXTURE_DIR/salvage-truncated.json" "$ARCHIVE_ROOT/salvage-only/.dce-temp/export.111.ONLYSTALE/export.json" +touch -d '1 hour ago' "$ARCHIVE_ROOT/salvage-only/.dce-temp/export.111.ONLYSTALE/export.json" +SALVAGE_ONLY_LOG="$TMP_DIR/salvage-only.log" +DISCORD_TOKEN=dummy \ + DCE_CLI_BIN="$FAKE_CLI" \ + DCE_PRIMARY_CONFIG="$CONFIG_PATH" \ + DCE_FALLBACK_CONFIG="$CONFIG_PATH" \ + FAKE_DCE_FIXTURE_DIR="$FIXTURE_DIR" \ + FAKE_DCE_MODE=append \ + "$REPO_ROOT/scripts/run-discord-scrape.sh" salvage --config "$CONFIG_PATH" --target salvage-only >"$SALVAGE_ONLY_LOG" 2>&1 +grep -q 'Exporting channel' "$SALVAGE_ONLY_LOG" && { + echo "salvage-only should not invoke Discord export" >&2 + cat "$SALVAGE_ONLY_LOG" >&2 + exit 1 +} +grep -q 'salvage completed' "$SALVAGE_ONLY_LOG" || { echo "expected salvage completed in salvage-only log" >&2; exit 1; } +SALVAGE_ONLY_DEST="$ARCHIVE_ROOT/salvage-only/$DEFAULT_FILE_NAME" +SALVAGE_ONLY_COUNT=$(jq -r '.messages | length' "$SALVAGE_ONLY_DEST") +(( SALVAGE_ONLY_COUNT >= 3 )) || { echo "expected salvage-only archive to have at least 3 messages (got $SALVAGE_ONLY_COUNT)" >&2; exit 1; } + # shellcheck disable=SC1091 source "$REPO_ROOT/scripts/run-discord-scrape.sh" SHRINK_EXISTING="$TMP_DIR/shrink-existing.json"