diff --git a/docs/plans/2026-06-04-059-feat-reclaim-stale-lock-proof-smoke-plan.md b/docs/plans/2026-06-04-059-feat-reclaim-stale-lock-proof-smoke-plan.md new file mode 100644 index 00000000..f3eebb0c --- /dev/null +++ b/docs/plans/2026-06-04-059-feat-reclaim-stale-lock-proof-smoke-plan.md @@ -0,0 +1,44 @@ +--- +title: "feat: Reclaim stale scrape lock and proof salvage-before smoke" +type: feat +status: active +date: 2026-06-04 +origin: /lfg — stale MyBook validation leaves lock/meta; proof lacks salvage-before smoke +--- + +# feat: Reclaim stale scrape lock and proof salvage-before smoke + +## Summary + +Add `--reclaim-stale` to `scrape-lock-status.sh` for operators to clear dead-holder lock artifacts, and extend `run-operator-proof-smoke.sh` for `--salvage-before-scrape`. + +## Problem Frame + +After a crashed scrape, `{archive_root}/.dce-scrape.lock.meta` may reference a dead pid. Operators need a safe reclaim path before restarting KotOR catch-up. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `scrape-lock-status.sh --reclaim-stale` removes stale `.meta` when holder pid is not running | +| R2 | Reclaim refuses when flock is actively held or holder pid is running | +| R3 | Reclaim removes unheld orphan lock file when safe | +| R4 | `run-operator-proof-smoke.sh` covers `--salvage-before-scrape` | +| R5 | `run-all-smokes.sh` passes | + +## Implementation Units + +### U1. Lock reclaim flag + +**Files:** `scripts/scrape-lock-status.sh`, `scripts/tests/scrape-lock-status-smoke.sh` + +### U2. Proof salvage-before smoke + +**Files:** `scripts/tests/run-operator-proof-smoke.sh` + +## Scope Boundaries + +### Deferred + +- GUI bridge doc refresh +- Live KotOR catch-up on host diff --git a/scripts/scrape-lock-status.sh b/scripts/scrape-lock-status.sh index 991a3e1d..1eab5c31 100755 --- a/scripts/scrape-lock-status.sh +++ b/scripts/scrape-lock-status.sh @@ -9,11 +9,13 @@ CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}" usage() { cat </dev/null; then + die "Cannot reclaim: holder pid $pid is still running." + fi + rm -f "$meta_file" + printf 'removed stale lock meta: %s\n' "$meta_file" + fi + + if [[ -e "$lock_file" ]] && ! lock_is_held "$lock_file"; then + rm -f "$lock_file" + printf 'removed unheld lock file: %s\n' "$lock_file" + fi +} + main() { + local reclaim=0 while (($#)); do case "$1" in --config) @@ -91,6 +117,10 @@ main() { CONFIG_PATH=$2 shift 2 ;; + --reclaim-stale) + reclaim=1 + shift + ;; --help|-h) usage exit 0 @@ -136,10 +166,24 @@ main() { if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then printf 'state: stale (reclaimable; holder pid %s is not running)\n' "$pid" format_holder_line "$meta_file" + if (( reclaim )); then + reclaim_stale_lock "$lock_file" "$meta_file" + printf 'state: free (stale lock reclaimed)\n' + fi exit 0 fi fi + if (( reclaim )); then + if [[ -e "$lock_file" ]] && ! lock_is_held "$lock_file"; then + reclaim_stale_lock "$lock_file" "$meta_file" + printf 'state: free (orphan lock reclaimed)\n' + exit 0 + fi + printf 'state: free (nothing to reclaim)\n' + exit 0 + fi + printf 'state: free (lock file present but not held)\n' format_holder_line "$meta_file" exit 0 diff --git a/scripts/tests/run-operator-proof-smoke.sh b/scripts/tests/run-operator-proof-smoke.sh index f31b363c..4adb5c83 100755 --- a/scripts/tests/run-operator-proof-smoke.sh +++ b/scripts/tests/run-operator-proof-smoke.sh @@ -35,6 +35,19 @@ JSON printf 'DISCORD_TOKEN=dummy\n' >"$ENV_PATH" +COMPOSE_FILE="$TMP_DIR/docker-compose.yml" +FAKE_DOCKER="$TMP_DIR/docker" +cat >"$COMPOSE_FILE" <<'EOF' +services: + discord-scraper: + image: fake +EOF +cat >"$FAKE_DOCKER" <<'EOF' +#!/usr/bin/env bash +printf 'run succeeded\n' +EOF +chmod +x "$FAKE_DOCKER" + set +e output=$( DCE_MIN_FREE_MB=0 \ @@ -72,6 +85,30 @@ if [[ "$salvage_status" -ne 0 ]] || ! grep -q 'Salvage-only proof complete' <<<" exit 1 fi +set +e +salvage_before_output=$( + DCE_MIN_FREE_MB=0 \ + DCE_CONFIG_FILE="$CONFIG_PATH" \ + DCE_ENV_FILE="$ENV_PATH" \ + DCE_SKIP_SCRAPE_LOCK=1 \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + "$PROOF" --config "$CONFIG_PATH" --target demo --salvage-before-scrape 2>&1 +) +salvage_before_status=$? +set -e + +if [[ "$salvage_before_status" -ne 0 ]] || ! grep -q 'salvage completed' <<<"$salvage_before_output"; then + printf 'run-operator-proof --salvage-before-scrape failed (status=%s)\n' "$salvage_before_status" >&2 + printf '%s\n' "$salvage_before_output" >&2 + exit 1 +fi +grep -q 'Operator proof passed for demo' <<<"$salvage_before_output" || { + printf 'expected operator proof to pass after salvage-before scrape\n' >&2 + printf '%s\n' "$salvage_before_output" >&2 + exit 1 +} + command -v flock >/dev/null 2>&1 && { LOCK_FILE="$TMP_DIR/archive/.dce-scrape.lock" HOLDER_PID="" diff --git a/scripts/tests/scrape-lock-status-smoke.sh b/scripts/tests/scrape-lock-status-smoke.sh index d21ef2ff..1cfb6148 100755 --- a/scripts/tests/scrape-lock-status-smoke.sh +++ b/scripts/tests/scrape-lock-status-smoke.sh @@ -91,4 +91,19 @@ if [[ "$stale_status" -ne 0 ]] || ! grep -q 'state: stale (reclaimable' <<<"$sta exit 1 fi +set +e +reclaim_output=$("$STATUS" --config "$CONFIG_PATH" --reclaim-stale 2>&1) +reclaim_status=$? +set -e + +if [[ "$reclaim_status" -ne 0 ]] || ! grep -q 'removed stale lock meta' <<<"$reclaim_output"; then + echo "expected --reclaim-stale to remove stale meta" >&2 + printf '%s\n' "$reclaim_output" >&2 + exit 1 +fi +[[ ! -f "${LOCK_FILE}.meta" ]] || { + echo "expected stale meta removed after reclaim" >&2 + exit 1 +} + printf 'scrape-lock-status-smoke: ok\n'