fix(scrape): flock on archive_root with lock holder diagnostics

Serialize scrapes across repo checkouts that share the same Documents
archive_root, write lock meta with pid/cmd, and reclaim when the holder
process is dead.
This commit is contained in:
Copilot 2026-06-03 06:39:46 -05:00
parent 8057a4443a
commit 2c01b3a7b9
3 changed files with 223 additions and 2 deletions

View file

@ -0,0 +1,82 @@
---
title: "fix: Archive-root scrape lock with holder diagnostics"
type: fix
status: complete
date: 2026-06-04
origin: /lfg — stale KotOR validation runs from MyBook checkout while Downloads checkout has a separate `.dce-scrape.lock`; both can touch the same `~/Documents` archives
---
# fix: Archive-root scrape lock with holder diagnostics
## Summary
Move the host scrape flock from per-repo `.dce-scrape.lock` to `{archive_root}/.dce-scrape.lock` (from scrape config), write a sidecar `.meta` file with holder PID/command, and improve the lock-held error so operators know what is blocking a run.
## Problem Frame
Plan 046 added scrape serialization via `flock`, but the lock path defaults to `$REPO_ROOT/.dce-scrape.lock`. The operator has two checkouts (`~/Downloads/DiscordChatExporter` and `/run/media/.../MyBook/...`) sharing the same `archive_root` in `config/scrape-targets.json`. A long-running validation from the MyBook path does not block a new scrape from the Downloads path, risking twin exports and OOM loops on `yes_general`.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | Default lock file is `{archive_root}/.dce-scrape.lock` resolved from the host config used for the scrape |
| R2 | `DCE_SCRAPE_LOCK_FILE` override continues to work unchanged |
| R3 | On acquire, write `{lock}.meta` with pid, UTC started timestamp, and command summary |
| R4 | On lock failure, error cites meta (pid, started, cmd) when present |
| R5 | If meta pid is not running, reclaim lock automatically with a warning log |
| R6 | Release removes `.meta` alongside releasing flock |
| R7 | Lock smoke covers archive-root path; `run-all-smokes.sh` passes |
## Key Technical Decisions
- **Lock at archive_root**: Matches the shared resource being mutated (Documents archives), not the git checkout path.
- **Fallback**: If config lacks `archive_root`, keep `$REPO_ROOT/.dce-scrape.lock` fallback for tests/minimal configs.
- **Reclaim only when pid dead**: Do not force-break a live holder; kernel releases flock when the holder exits.
## Implementation Units
### U1. Resolve lock path and meta lifecycle
**Goal:** Host runner acquires archive-root lock with meta sidecar.
**Requirements:** R1R6
**Files:**
- `scripts/run-discord-scrape-host.sh`
**Approach:** Add `resolve_scrape_lock_file`, `write_scrape_lock_meta`, `format_scrape_lock_holder`, `try_reclaim_stale_scrape_lock`; pass `host_config` into `acquire_scrape_lock`; call from scrape branch after config resolution.
**Test scenarios:**
- Config with `archive_root=/tmp/x` uses `/tmp/x/.dce-scrape.lock` when override unset.
- `DCE_SCRAPE_LOCK_FILE` still wins over archive_root.
- Dead pid in meta allows second acquire after reclaim.
**Verification:** Lock smoke passes.
### U2. Extend lock smoke
**Goal:** Regression for archive-root default and informative lock-held message.
**Requirements:** R7
**Files:**
- `scripts/tests/run-discord-scrape-host-lock-smoke.sh`
**Test scenarios:**
- Two processes: first holds flock on `{archive_root}/.dce-scrape.lock`, second fails with holder hint.
- After killing holder, second scrape succeeds.
**Verification:** `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh`
## Scope Boundaries
### In scope
- Lock path, meta, reclaim, smoke
### Deferred to Follow-Up Work
- Killing the stale MyBook validation process on the host
- Live yes_general channel catch-up inside LFG
- Container memory limits

View file

@ -60,17 +60,98 @@ cleanup_compose_env() {
fi fi
} }
resolve_scrape_lock_file() {
local config_path=$1
if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
return 0
fi
local archive_root=""
if [[ -f "$config_path" ]]; then
archive_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true
fi
if [[ -n "$archive_root" && "$archive_root" != null ]]; then
printf '%s/.dce-scrape.lock\n' "$archive_root"
else
printf '%s/.dce-scrape.lock\n' "$REPO_ROOT"
fi
}
scrape_lock_meta_path() {
printf '%s.meta\n' "$SCRAPE_LOCK_FILE"
}
write_scrape_lock_meta() {
local meta_file
meta_file=$(scrape_lock_meta_path)
printf 'pid=%s\nstarted=%s\ncmd=%s\n' \
"$$" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$(ps -o args= -p $$ 2>/dev/null | head -c 500 || echo unknown)" >"$meta_file"
}
remove_scrape_lock_meta() {
rm -f "$(scrape_lock_meta_path)"
}
format_scrape_lock_holder() {
local meta_file=$1
local pid="" started="" cmd="" holder_state=""
[[ -f "$meta_file" ]] || return 0
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
started=$(grep -E '^started=' "$meta_file" | head -1 | cut -d= -f2- || true)
cmd=$(grep -E '^cmd=' "$meta_file" | head -1 | cut -d= -f2- || true)
[[ -n "$pid" ]] || return 0
if kill -0 "$pid" 2>/dev/null; then
holder_state="running"
else
holder_state="not running"
fi
printf 'Holder pid %s (%s, started %s): %s' "$pid" "$holder_state" "${started:-unknown}" "${cmd:-unknown}"
}
try_reclaim_stale_scrape_lock() {
local meta_file pid
meta_file=$(scrape_lock_meta_path)
[[ -f "$meta_file" ]] || return 1
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
[[ -n "$pid" ]] || return 1
if kill -0 "$pid" 2>/dev/null; then
return 1
fi
printf 'WARN: reclaiming scrape lock; previous holder pid %s is not running.\n' "$pid" >&2
remove_scrape_lock_meta
return 0
}
acquire_scrape_lock() { acquire_scrape_lock() {
local config_path=${1:-}
if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then
return 0 return 0
fi fi
command -v flock >/dev/null 2>&1 || return 0 command -v flock >/dev/null 2>&1 || return 0
SCRAPE_LOCK_FILE="${DCE_SCRAPE_LOCK_FILE:-$REPO_ROOT/.dce-scrape.lock}" [[ -n "$config_path" ]] || config_path="$REPO_ROOT/config/scrape-targets.json"
SCRAPE_LOCK_FILE=$(resolve_scrape_lock_file "$config_path")
mkdir -p "$(dirname "$SCRAPE_LOCK_FILE")"
exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE" exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE"
if ! flock -n "$SCRAPE_LOCK_FD"; then if ! flock -n "$SCRAPE_LOCK_FD"; then
if try_reclaim_stale_scrape_lock && flock -n "$SCRAPE_LOCK_FD"; then
write_scrape_lock_meta
return 0
fi
local holder_msg=""
holder_msg=$(format_scrape_lock_holder "$(scrape_lock_meta_path)") || true
if [[ -n "$holder_msg" ]]; then
die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). $holder_msg"
fi
die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). Wait for it to finish or confirm no scrape is active before removing the lock." die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). Wait for it to finish or confirm no scrape is active before removing the lock."
fi fi
write_scrape_lock_meta
} }
release_scrape_lock() { release_scrape_lock() {
@ -80,6 +161,7 @@ release_scrape_lock() {
flock -u "$SCRAPE_LOCK_FD" 2>/dev/null || true flock -u "$SCRAPE_LOCK_FD" 2>/dev/null || true
exec {SCRAPE_LOCK_FD}>&- exec {SCRAPE_LOCK_FD}>&-
SCRAPE_LOCK_FD="" SCRAPE_LOCK_FD=""
remove_scrape_lock_meta
} }
cleanup_on_exit() { cleanup_on_exit() {
@ -514,7 +596,7 @@ main() {
run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}"
;; ;;
scrape) scrape)
acquire_scrape_lock acquire_scrape_lock "$host_config"
run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}"
;; ;;
esac esac

View file

@ -43,6 +43,7 @@ EOF
( (
exec {lock_fd}>>"$LOCK_FILE" exec {lock_fd}>>"$LOCK_FILE"
flock -n "$lock_fd" || exit 1 flock -n "$lock_fd" || exit 1
printf 'pid=%s\nstarted=2020-01-01T00:00:00Z\ncmd=mock-lock-holder\n' "$$" >"${LOCK_FILE}.meta"
sleep 120 sleep 120
) & ) &
HOLDER_PID=$! HOLDER_PID=$!
@ -69,6 +70,11 @@ if ! grep -q 'Another scrape is already running' <<<"$output"; then
printf '%s\n' "$output" >&2 printf '%s\n' "$output" >&2
exit 1 exit 1
fi fi
if ! grep -q 'Holder pid' <<<"$output"; then
echo "expected holder pid details in lock-held error" >&2
printf '%s\n' "$output" >&2
exit 1
fi
kill "$HOLDER_PID" 2>/dev/null || true kill "$HOLDER_PID" 2>/dev/null || true
wait "$HOLDER_PID" 2>/dev/null || true wait "$HOLDER_PID" 2>/dev/null || true
@ -84,4 +90,55 @@ if ! DCE_REPO_ROOT="$REPO_ROOT" \
exit 1 exit 1
fi fi
ARCHIVE_ROOT="$TMP_DIR/archive_shared"
mkdir -p "$ARCHIVE_ROOT/demo"
CONFIG_PATH="$TMP_DIR/archive-config.json"
ARCHIVE_LOCK="$ARCHIVE_ROOT/.dce-scrape.lock"
cat >"$CONFIG_PATH" <<JSON
{
"archive_root": "$ARCHIVE_ROOT",
"targets": [
{
"name": "demo",
"kind": "guild",
"output_dir": "$ARCHIVE_ROOT/demo",
"enabled": true
}
]
}
JSON
(
exec {archive_lock_fd}>>"$ARCHIVE_LOCK"
flock -n "$archive_lock_fd" || exit 1
sleep 120
) &
HOLDER_PID=$!
sleep 0.2
set +e
archive_output=$(
DCE_REPO_ROOT="$REPO_ROOT" \
DCE_DOCKER_BIN="$FAKE_DOCKER" \
DCE_ENV_FILE="$ENV_FILE" \
DCE_COMPOSE_FILE="$COMPOSE_FILE" \
"$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --config "$CONFIG_PATH" --target demo 2>&1
)
archive_status=$?
set -e
kill "$HOLDER_PID" 2>/dev/null || true
wait "$HOLDER_PID" 2>/dev/null || true
HOLDER_PID=""
if [[ "$archive_status" -eq 0 ]]; then
echo "expected archive-root lock to block scrape" >&2
exit 1
fi
if ! grep -Fq "$ARCHIVE_LOCK" <<<"$archive_output"; then
echo "expected archive-root lock path in error message" >&2
printf '%s\n' "$archive_output" >&2
exit 1
fi
echo "run-discord-scrape-host-lock-smoke: OK" echo "run-discord-scrape-host-lock-smoke: OK"