mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
fix(scrape): flock on archive_root with lock holder diagnostics
Serialize scrapes across repo checkouts that share the same Documents archive_root, write lock meta with pid/cmd, and reclaim when the holder process is dead.
This commit is contained in:
parent
8057a4443a
commit
2c01b3a7b9
|
|
@ -0,0 +1,82 @@
|
|||
---
|
||||
title: "fix: Archive-root scrape lock with holder diagnostics"
|
||||
type: fix
|
||||
status: complete
|
||||
date: 2026-06-04
|
||||
origin: /lfg — stale KotOR validation runs from MyBook checkout while Downloads checkout has a separate `.dce-scrape.lock`; both can touch the same `~/Documents` archives
|
||||
---
|
||||
|
||||
# fix: Archive-root scrape lock with holder diagnostics
|
||||
|
||||
## Summary
|
||||
|
||||
Move the host scrape flock from per-repo `.dce-scrape.lock` to `{archive_root}/.dce-scrape.lock` (from scrape config), write a sidecar `.meta` file with holder PID/command, and improve the lock-held error so operators know what is blocking a run.
|
||||
|
||||
## Problem Frame
|
||||
|
||||
Plan 046 added scrape serialization via `flock`, but the lock path defaults to `$REPO_ROOT/.dce-scrape.lock`. The operator has two checkouts (`~/Downloads/DiscordChatExporter` and `/run/media/.../MyBook/...`) sharing the same `archive_root` in `config/scrape-targets.json`. A long-running validation from the MyBook path does not block a new scrape from the Downloads path, risking twin exports and OOM loops on `yes_general`.
|
||||
|
||||
## Requirements
|
||||
|
||||
| ID | Requirement |
|
||||
|----|-------------|
|
||||
| R1 | Default lock file is `{archive_root}/.dce-scrape.lock` resolved from the host config used for the scrape |
|
||||
| R2 | `DCE_SCRAPE_LOCK_FILE` override continues to work unchanged |
|
||||
| R3 | On acquire, write `{lock}.meta` with pid, UTC started timestamp, and command summary |
|
||||
| R4 | On lock failure, error cites meta (pid, started, cmd) when present |
|
||||
| R5 | If meta pid is not running, reclaim lock automatically with a warning log |
|
||||
| R6 | Release removes `.meta` alongside releasing flock |
|
||||
| R7 | Lock smoke covers archive-root path; `run-all-smokes.sh` passes |
|
||||
|
||||
## Key Technical Decisions
|
||||
|
||||
- **Lock at archive_root**: Matches the shared resource being mutated (Documents archives), not the git checkout path.
|
||||
- **Fallback**: If config lacks `archive_root`, keep `$REPO_ROOT/.dce-scrape.lock` fallback for tests/minimal configs.
|
||||
- **Reclaim only when pid dead**: Do not force-break a live holder; kernel releases flock when the holder exits.
|
||||
|
||||
## Implementation Units
|
||||
|
||||
### U1. Resolve lock path and meta lifecycle
|
||||
|
||||
**Goal:** Host runner acquires archive-root lock with meta sidecar.
|
||||
|
||||
**Requirements:** R1–R6
|
||||
|
||||
**Files:**
|
||||
- `scripts/run-discord-scrape-host.sh`
|
||||
|
||||
**Approach:** Add `resolve_scrape_lock_file`, `write_scrape_lock_meta`, `format_scrape_lock_holder`, `try_reclaim_stale_scrape_lock`; pass `host_config` into `acquire_scrape_lock`; call from scrape branch after config resolution.
|
||||
|
||||
**Test scenarios:**
|
||||
- Config with `archive_root=/tmp/x` uses `/tmp/x/.dce-scrape.lock` when override unset.
|
||||
- `DCE_SCRAPE_LOCK_FILE` still wins over archive_root.
|
||||
- Dead pid in meta allows second acquire after reclaim.
|
||||
|
||||
**Verification:** Lock smoke passes.
|
||||
|
||||
### U2. Extend lock smoke
|
||||
|
||||
**Goal:** Regression for archive-root default and informative lock-held message.
|
||||
|
||||
**Requirements:** R7
|
||||
|
||||
**Files:**
|
||||
- `scripts/tests/run-discord-scrape-host-lock-smoke.sh`
|
||||
|
||||
**Test scenarios:**
|
||||
- Two processes: first holds flock on `{archive_root}/.dce-scrape.lock`, second fails with holder hint.
|
||||
- After killing holder, second scrape succeeds.
|
||||
|
||||
**Verification:** `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh`
|
||||
|
||||
## Scope Boundaries
|
||||
|
||||
### In scope
|
||||
|
||||
- Lock path, meta, reclaim, smoke
|
||||
|
||||
### Deferred to Follow-Up Work
|
||||
|
||||
- Killing the stale MyBook validation process on the host
|
||||
- Live yes_general channel catch-up inside LFG
|
||||
- Container memory limits
|
||||
|
|
@ -60,17 +60,98 @@ cleanup_compose_env() {
|
|||
fi
|
||||
}
|
||||
|
||||
resolve_scrape_lock_file() {
|
||||
local config_path=$1
|
||||
|
||||
if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
|
||||
printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
|
||||
return 0
|
||||
fi
|
||||
|
||||
local archive_root=""
|
||||
if [[ -f "$config_path" ]]; then
|
||||
archive_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true
|
||||
fi
|
||||
if [[ -n "$archive_root" && "$archive_root" != null ]]; then
|
||||
printf '%s/.dce-scrape.lock\n' "$archive_root"
|
||||
else
|
||||
printf '%s/.dce-scrape.lock\n' "$REPO_ROOT"
|
||||
fi
|
||||
}
|
||||
|
||||
scrape_lock_meta_path() {
|
||||
printf '%s.meta\n' "$SCRAPE_LOCK_FILE"
|
||||
}
|
||||
|
||||
write_scrape_lock_meta() {
|
||||
local meta_file
|
||||
meta_file=$(scrape_lock_meta_path)
|
||||
printf 'pid=%s\nstarted=%s\ncmd=%s\n' \
|
||||
"$$" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$(ps -o args= -p $$ 2>/dev/null | head -c 500 || echo unknown)" >"$meta_file"
|
||||
}
|
||||
|
||||
remove_scrape_lock_meta() {
|
||||
rm -f "$(scrape_lock_meta_path)"
|
||||
}
|
||||
|
||||
format_scrape_lock_holder() {
|
||||
local meta_file=$1
|
||||
local pid="" started="" cmd="" holder_state=""
|
||||
|
||||
[[ -f "$meta_file" ]] || return 0
|
||||
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
|
||||
started=$(grep -E '^started=' "$meta_file" | head -1 | cut -d= -f2- || true)
|
||||
cmd=$(grep -E '^cmd=' "$meta_file" | head -1 | cut -d= -f2- || true)
|
||||
[[ -n "$pid" ]] || return 0
|
||||
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
holder_state="running"
|
||||
else
|
||||
holder_state="not running"
|
||||
fi
|
||||
printf 'Holder pid %s (%s, started %s): %s' "$pid" "$holder_state" "${started:-unknown}" "${cmd:-unknown}"
|
||||
}
|
||||
|
||||
try_reclaim_stale_scrape_lock() {
|
||||
local meta_file pid
|
||||
meta_file=$(scrape_lock_meta_path)
|
||||
[[ -f "$meta_file" ]] || return 1
|
||||
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
|
||||
[[ -n "$pid" ]] || return 1
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
return 1
|
||||
fi
|
||||
printf 'WARN: reclaiming scrape lock; previous holder pid %s is not running.\n' "$pid" >&2
|
||||
remove_scrape_lock_meta
|
||||
return 0
|
||||
}
|
||||
|
||||
acquire_scrape_lock() {
|
||||
local config_path=${1:-}
|
||||
|
||||
if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then
|
||||
return 0
|
||||
fi
|
||||
command -v flock >/dev/null 2>&1 || return 0
|
||||
|
||||
SCRAPE_LOCK_FILE="${DCE_SCRAPE_LOCK_FILE:-$REPO_ROOT/.dce-scrape.lock}"
|
||||
[[ -n "$config_path" ]] || config_path="$REPO_ROOT/config/scrape-targets.json"
|
||||
SCRAPE_LOCK_FILE=$(resolve_scrape_lock_file "$config_path")
|
||||
mkdir -p "$(dirname "$SCRAPE_LOCK_FILE")"
|
||||
|
||||
exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE"
|
||||
if ! flock -n "$SCRAPE_LOCK_FD"; then
|
||||
if try_reclaim_stale_scrape_lock && flock -n "$SCRAPE_LOCK_FD"; then
|
||||
write_scrape_lock_meta
|
||||
return 0
|
||||
fi
|
||||
local holder_msg=""
|
||||
holder_msg=$(format_scrape_lock_holder "$(scrape_lock_meta_path)") || true
|
||||
if [[ -n "$holder_msg" ]]; then
|
||||
die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). $holder_msg"
|
||||
fi
|
||||
die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). Wait for it to finish or confirm no scrape is active before removing the lock."
|
||||
fi
|
||||
write_scrape_lock_meta
|
||||
}
|
||||
|
||||
release_scrape_lock() {
|
||||
|
|
@ -80,6 +161,7 @@ release_scrape_lock() {
|
|||
flock -u "$SCRAPE_LOCK_FD" 2>/dev/null || true
|
||||
exec {SCRAPE_LOCK_FD}>&-
|
||||
SCRAPE_LOCK_FD=""
|
||||
remove_scrape_lock_meta
|
||||
}
|
||||
|
||||
cleanup_on_exit() {
|
||||
|
|
@ -514,7 +596,7 @@ main() {
|
|||
run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}"
|
||||
;;
|
||||
scrape)
|
||||
acquire_scrape_lock
|
||||
acquire_scrape_lock "$host_config"
|
||||
run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}"
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ EOF
|
|||
(
|
||||
exec {lock_fd}>>"$LOCK_FILE"
|
||||
flock -n "$lock_fd" || exit 1
|
||||
printf 'pid=%s\nstarted=2020-01-01T00:00:00Z\ncmd=mock-lock-holder\n' "$$" >"${LOCK_FILE}.meta"
|
||||
sleep 120
|
||||
) &
|
||||
HOLDER_PID=$!
|
||||
|
|
@ -69,6 +70,11 @@ if ! grep -q 'Another scrape is already running' <<<"$output"; then
|
|||
printf '%s\n' "$output" >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! grep -q 'Holder pid' <<<"$output"; then
|
||||
echo "expected holder pid details in lock-held error" >&2
|
||||
printf '%s\n' "$output" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
kill "$HOLDER_PID" 2>/dev/null || true
|
||||
wait "$HOLDER_PID" 2>/dev/null || true
|
||||
|
|
@ -84,4 +90,55 @@ if ! DCE_REPO_ROOT="$REPO_ROOT" \
|
|||
exit 1
|
||||
fi
|
||||
|
||||
ARCHIVE_ROOT="$TMP_DIR/archive_shared"
|
||||
mkdir -p "$ARCHIVE_ROOT/demo"
|
||||
CONFIG_PATH="$TMP_DIR/archive-config.json"
|
||||
ARCHIVE_LOCK="$ARCHIVE_ROOT/.dce-scrape.lock"
|
||||
cat >"$CONFIG_PATH" <<JSON
|
||||
{
|
||||
"archive_root": "$ARCHIVE_ROOT",
|
||||
"targets": [
|
||||
{
|
||||
"name": "demo",
|
||||
"kind": "guild",
|
||||
"output_dir": "$ARCHIVE_ROOT/demo",
|
||||
"enabled": true
|
||||
}
|
||||
]
|
||||
}
|
||||
JSON
|
||||
|
||||
(
|
||||
exec {archive_lock_fd}>>"$ARCHIVE_LOCK"
|
||||
flock -n "$archive_lock_fd" || exit 1
|
||||
sleep 120
|
||||
) &
|
||||
HOLDER_PID=$!
|
||||
sleep 0.2
|
||||
|
||||
set +e
|
||||
archive_output=$(
|
||||
DCE_REPO_ROOT="$REPO_ROOT" \
|
||||
DCE_DOCKER_BIN="$FAKE_DOCKER" \
|
||||
DCE_ENV_FILE="$ENV_FILE" \
|
||||
DCE_COMPOSE_FILE="$COMPOSE_FILE" \
|
||||
"$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --config "$CONFIG_PATH" --target demo 2>&1
|
||||
)
|
||||
archive_status=$?
|
||||
set -e
|
||||
|
||||
kill "$HOLDER_PID" 2>/dev/null || true
|
||||
wait "$HOLDER_PID" 2>/dev/null || true
|
||||
HOLDER_PID=""
|
||||
|
||||
if [[ "$archive_status" -eq 0 ]]; then
|
||||
echo "expected archive-root lock to block scrape" >&2
|
||||
exit 1
|
||||
fi
|
||||
if ! grep -Fq "$ARCHIVE_LOCK" <<<"$archive_output"; then
|
||||
echo "expected archive-root lock path in error message" >&2
|
||||
printf '%s\n' "$archive_output" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "run-discord-scrape-host-lock-smoke: OK"
|
||||
|
|
|
|||
Loading…
Reference in a new issue