From 2c01b3a7b9c004d1da45847ea6dbae0ea211214e Mon Sep 17 00:00:00 2001 From: Copilot Date: Wed, 3 Jun 2026 06:39:46 -0500 Subject: [PATCH] fix(scrape): flock on archive_root with lock holder diagnostics Serialize scrapes across repo checkouts that share the same Documents archive_root, write lock meta with pid/cmd, and reclaim when the holder process is dead. --- ...4-053-fix-archive-root-scrape-lock-plan.md | 82 ++++++++++++++++++ scripts/run-discord-scrape-host.sh | 86 ++++++++++++++++++- .../run-discord-scrape-host-lock-smoke.sh | 57 ++++++++++++ 3 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 docs/plans/2026-06-04-053-fix-archive-root-scrape-lock-plan.md diff --git a/docs/plans/2026-06-04-053-fix-archive-root-scrape-lock-plan.md b/docs/plans/2026-06-04-053-fix-archive-root-scrape-lock-plan.md new file mode 100644 index 00000000..f8b1127a --- /dev/null +++ b/docs/plans/2026-06-04-053-fix-archive-root-scrape-lock-plan.md @@ -0,0 +1,82 @@ +--- +title: "fix: Archive-root scrape lock with holder diagnostics" +type: fix +status: complete +date: 2026-06-04 +origin: /lfg — stale KotOR validation runs from MyBook checkout while Downloads checkout has a separate `.dce-scrape.lock`; both can touch the same `~/Documents` archives +--- + +# fix: Archive-root scrape lock with holder diagnostics + +## Summary + +Move the host scrape flock from per-repo `.dce-scrape.lock` to `{archive_root}/.dce-scrape.lock` (from scrape config), write a sidecar `.meta` file with holder PID/command, and improve the lock-held error so operators know what is blocking a run. + +## Problem Frame + +Plan 046 added scrape serialization via `flock`, but the lock path defaults to `$REPO_ROOT/.dce-scrape.lock`. The operator has two checkouts (`~/Downloads/DiscordChatExporter` and `/run/media/.../MyBook/...`) sharing the same `archive_root` in `config/scrape-targets.json`. A long-running validation from the MyBook path does not block a new scrape from the Downloads path, risking twin exports and OOM loops on `yes_general`. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | Default lock file is `{archive_root}/.dce-scrape.lock` resolved from the host config used for the scrape | +| R2 | `DCE_SCRAPE_LOCK_FILE` override continues to work unchanged | +| R3 | On acquire, write `{lock}.meta` with pid, UTC started timestamp, and command summary | +| R4 | On lock failure, error cites meta (pid, started, cmd) when present | +| R5 | If meta pid is not running, reclaim lock automatically with a warning log | +| R6 | Release removes `.meta` alongside releasing flock | +| R7 | Lock smoke covers archive-root path; `run-all-smokes.sh` passes | + +## Key Technical Decisions + +- **Lock at archive_root**: Matches the shared resource being mutated (Documents archives), not the git checkout path. +- **Fallback**: If config lacks `archive_root`, keep `$REPO_ROOT/.dce-scrape.lock` fallback for tests/minimal configs. +- **Reclaim only when pid dead**: Do not force-break a live holder; kernel releases flock when the holder exits. + +## Implementation Units + +### U1. Resolve lock path and meta lifecycle + +**Goal:** Host runner acquires archive-root lock with meta sidecar. + +**Requirements:** R1–R6 + +**Files:** +- `scripts/run-discord-scrape-host.sh` + +**Approach:** Add `resolve_scrape_lock_file`, `write_scrape_lock_meta`, `format_scrape_lock_holder`, `try_reclaim_stale_scrape_lock`; pass `host_config` into `acquire_scrape_lock`; call from scrape branch after config resolution. + +**Test scenarios:** +- Config with `archive_root=/tmp/x` uses `/tmp/x/.dce-scrape.lock` when override unset. +- `DCE_SCRAPE_LOCK_FILE` still wins over archive_root. +- Dead pid in meta allows second acquire after reclaim. + +**Verification:** Lock smoke passes. + +### U2. Extend lock smoke + +**Goal:** Regression for archive-root default and informative lock-held message. + +**Requirements:** R7 + +**Files:** +- `scripts/tests/run-discord-scrape-host-lock-smoke.sh` + +**Test scenarios:** +- Two processes: first holds flock on `{archive_root}/.dce-scrape.lock`, second fails with holder hint. +- After killing holder, second scrape succeeds. + +**Verification:** `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` + +## Scope Boundaries + +### In scope + +- Lock path, meta, reclaim, smoke + +### Deferred to Follow-Up Work + +- Killing the stale MyBook validation process on the host +- Live yes_general channel catch-up inside LFG +- Container memory limits diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 0050d3ab..cee8c0da 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -60,17 +60,98 @@ cleanup_compose_env() { fi } +resolve_scrape_lock_file() { + local config_path=$1 + + if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then + printf '%s\n' "$DCE_SCRAPE_LOCK_FILE" + return 0 + fi + + local archive_root="" + if [[ -f "$config_path" ]]; then + archive_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true + fi + if [[ -n "$archive_root" && "$archive_root" != null ]]; then + printf '%s/.dce-scrape.lock\n' "$archive_root" + else + printf '%s/.dce-scrape.lock\n' "$REPO_ROOT" + fi +} + +scrape_lock_meta_path() { + printf '%s.meta\n' "$SCRAPE_LOCK_FILE" +} + +write_scrape_lock_meta() { + local meta_file + meta_file=$(scrape_lock_meta_path) + printf 'pid=%s\nstarted=%s\ncmd=%s\n' \ + "$$" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$(ps -o args= -p $$ 2>/dev/null | head -c 500 || echo unknown)" >"$meta_file" +} + +remove_scrape_lock_meta() { + rm -f "$(scrape_lock_meta_path)" +} + +format_scrape_lock_holder() { + local meta_file=$1 + local pid="" started="" cmd="" holder_state="" + + [[ -f "$meta_file" ]] || return 0 + pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true) + started=$(grep -E '^started=' "$meta_file" | head -1 | cut -d= -f2- || true) + cmd=$(grep -E '^cmd=' "$meta_file" | head -1 | cut -d= -f2- || true) + [[ -n "$pid" ]] || return 0 + + if kill -0 "$pid" 2>/dev/null; then + holder_state="running" + else + holder_state="not running" + fi + printf 'Holder pid %s (%s, started %s): %s' "$pid" "$holder_state" "${started:-unknown}" "${cmd:-unknown}" +} + +try_reclaim_stale_scrape_lock() { + local meta_file pid + meta_file=$(scrape_lock_meta_path) + [[ -f "$meta_file" ]] || return 1 + pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true) + [[ -n "$pid" ]] || return 1 + if kill -0 "$pid" 2>/dev/null; then + return 1 + fi + printf 'WARN: reclaiming scrape lock; previous holder pid %s is not running.\n' "$pid" >&2 + remove_scrape_lock_meta + return 0 +} + acquire_scrape_lock() { + local config_path=${1:-} + if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then return 0 fi command -v flock >/dev/null 2>&1 || return 0 - SCRAPE_LOCK_FILE="${DCE_SCRAPE_LOCK_FILE:-$REPO_ROOT/.dce-scrape.lock}" + [[ -n "$config_path" ]] || config_path="$REPO_ROOT/config/scrape-targets.json" + SCRAPE_LOCK_FILE=$(resolve_scrape_lock_file "$config_path") + mkdir -p "$(dirname "$SCRAPE_LOCK_FILE")" + exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE" if ! flock -n "$SCRAPE_LOCK_FD"; then + if try_reclaim_stale_scrape_lock && flock -n "$SCRAPE_LOCK_FD"; then + write_scrape_lock_meta + return 0 + fi + local holder_msg="" + holder_msg=$(format_scrape_lock_holder "$(scrape_lock_meta_path)") || true + if [[ -n "$holder_msg" ]]; then + die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). $holder_msg" + fi die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). Wait for it to finish or confirm no scrape is active before removing the lock." fi + write_scrape_lock_meta } release_scrape_lock() { @@ -80,6 +161,7 @@ release_scrape_lock() { flock -u "$SCRAPE_LOCK_FD" 2>/dev/null || true exec {SCRAPE_LOCK_FD}>&- SCRAPE_LOCK_FD="" + remove_scrape_lock_meta } cleanup_on_exit() { @@ -514,7 +596,7 @@ main() { run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" ;; scrape) - acquire_scrape_lock + acquire_scrape_lock "$host_config" run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" ;; esac diff --git a/scripts/tests/run-discord-scrape-host-lock-smoke.sh b/scripts/tests/run-discord-scrape-host-lock-smoke.sh index 4025d459..76a97799 100755 --- a/scripts/tests/run-discord-scrape-host-lock-smoke.sh +++ b/scripts/tests/run-discord-scrape-host-lock-smoke.sh @@ -43,6 +43,7 @@ EOF ( exec {lock_fd}>>"$LOCK_FILE" flock -n "$lock_fd" || exit 1 + printf 'pid=%s\nstarted=2020-01-01T00:00:00Z\ncmd=mock-lock-holder\n' "$$" >"${LOCK_FILE}.meta" sleep 120 ) & HOLDER_PID=$! @@ -69,6 +70,11 @@ if ! grep -q 'Another scrape is already running' <<<"$output"; then printf '%s\n' "$output" >&2 exit 1 fi +if ! grep -q 'Holder pid' <<<"$output"; then + echo "expected holder pid details in lock-held error" >&2 + printf '%s\n' "$output" >&2 + exit 1 +fi kill "$HOLDER_PID" 2>/dev/null || true wait "$HOLDER_PID" 2>/dev/null || true @@ -84,4 +90,55 @@ if ! DCE_REPO_ROOT="$REPO_ROOT" \ exit 1 fi +ARCHIVE_ROOT="$TMP_DIR/archive_shared" +mkdir -p "$ARCHIVE_ROOT/demo" +CONFIG_PATH="$TMP_DIR/archive-config.json" +ARCHIVE_LOCK="$ARCHIVE_ROOT/.dce-scrape.lock" +cat >"$CONFIG_PATH" <>"$ARCHIVE_LOCK" + flock -n "$archive_lock_fd" || exit 1 + sleep 120 +) & +HOLDER_PID=$! +sleep 0.2 + +set +e +archive_output=$( + DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_ENV_FILE="$ENV_FILE" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --config "$CONFIG_PATH" --target demo 2>&1 +) +archive_status=$? +set -e + +kill "$HOLDER_PID" 2>/dev/null || true +wait "$HOLDER_PID" 2>/dev/null || true +HOLDER_PID="" + +if [[ "$archive_status" -eq 0 ]]; then + echo "expected archive-root lock to block scrape" >&2 + exit 1 +fi +if ! grep -Fq "$ARCHIVE_LOCK" <<<"$archive_output"; then + echo "expected archive-root lock path in error message" >&2 + printf '%s\n' "$archive_output" >&2 + exit 1 +fi + echo "run-discord-scrape-host-lock-smoke: OK"