refactor(scrape): extract shared scrape-lock library

Centralize archive-root lock path, held checks, holder formatting, and
reclaim helpers in scripts/lib/scrape-lock.sh. Source it from the host
runner, lock status script, and operator wrappers to remove duplicated
logic. Update documents-scrape smoke fake repo to include the new lib.
This commit is contained in:
Copilot 2026-06-03 07:14:22 -05:00
parent ad5384ecc1
commit a88cd815f4
7 changed files with 213 additions and 160 deletions

View file

@ -0,0 +1,38 @@
---
title: "refactor: Shared scrape-lock library"
type: refactor
status: active
date: 2026-06-04
origin: /lfg — lock path and gate logic duplicated across host runner, status script, validation, documents scrape
---
# refactor: Shared scrape-lock library
## Summary
Extract `scripts/lib/scrape-lock.sh` and source it from lock-related scripts to keep archive-root lock behavior consistent.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | `scripts/lib/scrape-lock.sh` provides lock-path resolution, held check, holder formatting, and reclaim helpers |
| R2 | `scrape-lock-status.sh` and `run-discord-scrape-host.sh` source the library |
| R3 | `run-documents-scrape.sh` and `run-operator-validation.sh` use shared `ensure_scrape_lock_available` |
| R4 | `run-all-smokes.sh` passes (21 smokes) |
## Implementation Units
### U1. Library extraction
**Files:** `scripts/lib/scrape-lock.sh`, consumers listed above
### U2. Smoke gate
**Verification:** `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh`
## Scope Boundaries
### Deferred
- Live KotOR catch-up on host

132
scripts/lib/scrape-lock.sh Normal file
View file

@ -0,0 +1,132 @@
# Shared scrape lock path and inspection helpers.
# Sourced by run-discord-scrape-host.sh, scrape-lock-status.sh, and operator wrappers.
# Print the path of the scrape lock file on stdout.
# Precedence:
#   1. $DCE_SCRAPE_LOCK_FILE, when set and non-empty.
#   2. <archive_root>/.dce-scrape.lock, with archive_root read from the JSON
#      config via jq.
#   3. <repo_root>/.dce-scrape.lock.
# Arguments:
#   $1 - path to the JSON config file (may be missing)
#   $2 - repository root used as the fallback lock directory
resolve_scrape_lock_file() {
  local config_path=$1 repo_root=$2
  local configured_root=""
  local lock_dir

  # An explicit override wins and skips the config lookup entirely.
  if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
    printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
    return 0
  fi

  # Best effort: a jq failure (missing binary, bad JSON) is swallowed and
  # simply leaves the repo-root fallback in effect.
  if [[ -f "$config_path" ]]; then
    configured_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true
  fi

  lock_dir=$repo_root
  if [[ -n "$configured_root" && "$configured_root" != null ]]; then
    lock_dir=$configured_root
  fi
  printf '%s/.dce-scrape.lock\n' "$lock_dir"
}
# Print the metadata sidecar path for a lock file: the lock path with
# ".meta" appended.
# Arguments:
#   $1 - lock file path
scrape_lock_meta_path() {
  local lock_file=$1
  printf '%s\n' "${lock_file}.meta"
}
# Print the value of the first "<field>=<value>" line in a lock meta file.
# Prints nothing when the file is unreadable or has no such line; always
# returns 0 so callers can use it in a plain assignment.
# Arguments:
#   $1 - meta file path
#   $2 - field name (interpolated into an extended regex)
read_scrape_lock_meta_field() {
  local meta_file=$1 field=$2
  local matched first_line

  # grep failure (no match, unreadable file) is expected and ignored.
  matched=$(grep -E "^${field}=" "$meta_file" 2>/dev/null) || true
  [[ -n "$matched" ]] || return 0

  # Keep only the first matching line, then drop everything up to and
  # including the first '=' so values may themselves contain '='.
  first_line=${matched%%$'\n'*}
  printf '%s\n' "${first_line#*=}"
}
# Report whether another process currently holds the flock on a lock file.
# The probe takes the lock non-blockingly on a private descriptor and
# releases it again straight away.
# Arguments:
#   $1 - lock file path
# Returns:
#   0 when the lock is held elsewhere.
#   1 when the lock is free, the file does not exist, the file cannot be
#     opened, or flock(1) is not installed.
scrape_lock_is_held() {
  local lock_file=$1
  # Keep the descriptor number local so the probe does not leak a global
  # variable into every script that sources this library.
  local lock_probe_fd

  command -v flock >/dev/null 2>&1 || return 1

  # A lock file that does not exist cannot be held. Returning here also
  # stops the append-open below from creating the file as a side effect.
  [[ -e "$lock_file" ]] || return 1

  # Bash allocates a free descriptor and stores its number in lock_probe_fd.
  # If the open fails there is nothing to probe; report "not held" instead of
  # running flock on an unset descriptor.
  exec {lock_probe_fd}>>"$lock_file" || return 1

  if flock -n "$lock_probe_fd"; then
    # We obtained the lock, so nobody else had it: release and report free.
    flock -u "$lock_probe_fd" 2>/dev/null || true
    exec {lock_probe_fd}>&-
    return 1
  fi

  exec {lock_probe_fd}>&-
  return 0
}
# Print a one-line description of the lock holder recorded in a meta file,
# without a trailing newline, for embedding in an error message.
# Prints nothing when the meta file is missing or records no pid.
# Arguments:
#   $1 - meta file path
# Returns: 0
scrape_lock_format_holder_summary() {
  local meta_file=$1
  local holder_pid="" holder_started="" holder_cmd="" liveness=""

  if [[ ! -f "$meta_file" ]]; then
    return 0
  fi

  holder_pid=$(read_scrape_lock_meta_field "$meta_file" pid)
  if [[ -z "$holder_pid" ]]; then
    return 0
  fi
  holder_started=$(read_scrape_lock_meta_field "$meta_file" started)
  holder_cmd=$(read_scrape_lock_meta_field "$meta_file" cmd)

  # Signal 0 only tests whether the pid can be signalled; nothing is sent.
  liveness="not running"
  if kill -0 "$holder_pid" 2>/dev/null; then
    liveness="running"
  fi

  printf 'Holder pid %s (%s, started %s): %s' \
    "$holder_pid" "$liveness" "${holder_started:-unknown}" "${holder_cmd:-unknown}"
}
# Print the lock holder recorded in a meta file as status lines:
#   holder: pid <pid> (<running|not running>, started <time|unknown>)
#   cmd: <command>            (only when a command was recorded)
# Prints nothing when the meta file is missing or records no pid.
# Arguments:
#   $1 - meta file path
# Returns: 0 in every case, so a bare call is safe under `set -e`.
scrape_lock_format_holder_lines() {
  local meta_file=$1
  local pid="" started="" cmd="" holder_state=""

  [[ -f "$meta_file" ]] || return 0
  pid=$(read_scrape_lock_meta_field "$meta_file" pid)
  started=$(read_scrape_lock_meta_field "$meta_file" started)
  cmd=$(read_scrape_lock_meta_field "$meta_file" cmd)
  [[ -n "$pid" ]] || return 0

  # Signal 0 only tests whether the pid can be signalled; nothing is sent.
  if kill -0 "$pid" 2>/dev/null; then
    holder_state="running"
  else
    holder_state="not running"
  fi
  printf 'holder: pid %s (%s, started %s)\n' "$pid" "$holder_state" "${started:-unknown}"

  # Use `if`, not `[[ ... ]] && printf`: as the final statement, the latter
  # would make the function return 1 whenever no command was recorded.
  if [[ -n "$cmd" ]]; then
    printf 'cmd: %s\n' "$cmd"
  fi
  return 0
}
# Delete a lock meta file when the pid it records is no longer running.
# Arguments:
#   $1 - meta file path
# Returns:
#   0 when the meta file was stale and has been removed.
#   1 when there is no meta file, no recorded pid, or the pid is still alive.
scrape_lock_try_reclaim_meta() {
  local meta_file=$1
  local holder_pid=""

  if [[ -f "$meta_file" ]]; then
    holder_pid=$(read_scrape_lock_meta_field "$meta_file" pid)
  fi

  # Without a recorded pid there is nothing that can be shown to be stale.
  [[ -n "$holder_pid" ]] || return 1

  # A holder that still accepts signal 0 is alive: leave its metadata alone.
  ! kill -0 "$holder_pid" 2>/dev/null || return 1

  rm -f "$meta_file"
  return 0
}
# Remove a stale lock meta file and an unheld lock file, reporting each
# removal on stdout.
# Arguments:
#   $1 - lock file path
#   $2 - meta file path
# Returns:
#   0 when nothing blocked the reclaim (files removed or already absent).
#   2 when the lock is actively held; nothing is removed.
#   3 when the meta file records a pid that is still running; nothing is
#     removed.
scrape_lock_reclaim_stale_files() {
  local lock_file=$1 meta_file=$2
  local holder_pid=""

  # Never touch anything while a live process holds the flock.
  ! scrape_lock_is_held "$lock_file" || return 2

  if [[ -f "$meta_file" ]]; then
    holder_pid=$(read_scrape_lock_meta_field "$meta_file" pid)
    if [[ -z "$holder_pid" ]] || ! kill -0 "$holder_pid" 2>/dev/null; then
      # No recorded pid, or the recorded pid is gone: the metadata is stale.
      rm -f "$meta_file"
      printf 'removed stale lock meta: %s\n' "$meta_file"
    else
      return 3
    fi
  fi

  [[ -e "$lock_file" ]] || return 0

  # Probe once more right before deleting, in case the lock was taken while
  # the metadata was being handled.
  if ! scrape_lock_is_held "$lock_file"; then
    rm -f "$lock_file"
    printf 'removed unheld lock file: %s\n' "$lock_file"
  fi
  return 0
}
# Gate for operator wrappers: succeed only when the lock status script
# reports success for the given config.
# Arguments:
#   $1 - config path, passed to the status script as `--config <path>`
#   $2 - path to the lock status script
# Environment:
#   DCE_SKIP_SCRAPE_LOCK - set to "1" to bypass the check entirely
# Returns:
#   0 when the check is bypassed, the status script is not executable, or
#     the status script exits 0.
#   1 when the status script exits non-zero.
ensure_scrape_lock_available() {
  local config_path=$1 status_script=$2

  # Explicit opt-out.
  [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" != "1" ]] || return 0

  # No executable status script to consult: the lock is treated as available.
  [[ -x "$status_script" ]] || return 0

  "$status_script" --config "$config_path" || return 1
  return 0
}

View file

@ -6,6 +6,8 @@ SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
# shellcheck source=lib/scrape-run-plan.sh
source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
# shellcheck source=lib/scrape-lock.sh
source "$SCRIPT_DIR/lib/scrape-lock.sh"
COMPOSE_FILE="${DCE_COMPOSE_FILE:-$REPO_ROOT/docker-compose.yml}"
ENV_FILE="${DCE_ENV_FILE:-$REPO_ROOT/scrape.env}"
DOCKER_BIN="${DCE_DOCKER_BIN:-docker}"
@ -61,70 +63,26 @@ cleanup_compose_env() {
fi
}
resolve_scrape_lock_file() {
local config_path=$1
if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
return 0
fi
local archive_root=""
if [[ -f "$config_path" ]]; then
archive_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true
fi
if [[ -n "$archive_root" && "$archive_root" != null ]]; then
printf '%s/.dce-scrape.lock\n' "$archive_root"
else
printf '%s/.dce-scrape.lock\n' "$REPO_ROOT"
fi
}
scrape_lock_meta_path() {
printf '%s.meta\n' "$SCRAPE_LOCK_FILE"
}
write_scrape_lock_meta() {
local meta_file
meta_file=$(scrape_lock_meta_path)
meta_file=$(scrape_lock_meta_path "$SCRAPE_LOCK_FILE")
printf 'pid=%s\nstarted=%s\ncmd=%s\n' \
"$$" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$(ps -o args= -p $$ 2>/dev/null | head -c 500 || echo unknown)" >"$meta_file"
}
remove_scrape_lock_meta() {
rm -f "$(scrape_lock_meta_path)"
}
format_scrape_lock_holder() {
local meta_file=$1
local pid="" started="" cmd="" holder_state=""
[[ -f "$meta_file" ]] || return 0
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
started=$(grep -E '^started=' "$meta_file" | head -1 | cut -d= -f2- || true)
cmd=$(grep -E '^cmd=' "$meta_file" | head -1 | cut -d= -f2- || true)
[[ -n "$pid" ]] || return 0
if kill -0 "$pid" 2>/dev/null; then
holder_state="running"
else
holder_state="not running"
fi
printf 'Holder pid %s (%s, started %s): %s' "$pid" "$holder_state" "${started:-unknown}" "${cmd:-unknown}"
rm -f "$(scrape_lock_meta_path "$SCRAPE_LOCK_FILE")"
}
try_reclaim_stale_scrape_lock() {
local meta_file pid
meta_file=$(scrape_lock_meta_path)
[[ -f "$meta_file" ]] || return 1
pid=$(grep -E '^pid=' "$meta_file" | head -1 | cut -d= -f2- || true)
[[ -n "$pid" ]] || return 1
if kill -0 "$pid" 2>/dev/null; then
return 1
fi
meta_file=$(scrape_lock_meta_path "$SCRAPE_LOCK_FILE")
pid=$(read_scrape_lock_meta_field "$meta_file" pid)
if scrape_lock_try_reclaim_meta "$meta_file"; then
printf 'WARN: reclaiming scrape lock; previous holder pid %s is not running.\n' "$pid" >&2
remove_scrape_lock_meta
return 0
fi
return 1
}
acquire_scrape_lock() {
@ -136,7 +94,7 @@ acquire_scrape_lock() {
command -v flock >/dev/null 2>&1 || return 0
[[ -n "$config_path" ]] || config_path="$REPO_ROOT/config/scrape-targets.json"
SCRAPE_LOCK_FILE=$(resolve_scrape_lock_file "$config_path")
SCRAPE_LOCK_FILE=$(resolve_scrape_lock_file "$config_path" "$REPO_ROOT")
mkdir -p "$(dirname "$SCRAPE_LOCK_FILE")"
exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE"
@ -146,7 +104,7 @@ acquire_scrape_lock() {
return 0
fi
local holder_msg=""
holder_msg=$(format_scrape_lock_holder "$(scrape_lock_meta_path)") || true
holder_msg=$(scrape_lock_format_holder_summary "$(scrape_lock_meta_path "$SCRAPE_LOCK_FILE")") || true
if [[ -n "$holder_msg" ]]; then
die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). $holder_msg"
fi

View file

@ -12,6 +12,8 @@ VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
# shellcheck source=lib/scrape-lock.sh
source "$SCRIPT_DIR/lib/scrape-lock.sh"
# shellcheck source=lib/scrape-run-plan.sh
source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
@ -41,12 +43,8 @@ die() {
exit 1
}
ensure_scrape_lock_available() {
if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then
return 0
fi
[[ -x "$LOCK_STATUS" ]] || return 0
if ! "$LOCK_STATUS" --config "$CONFIG_PATH"; then
require_scrape_lock_free() {
if ! ensure_scrape_lock_available "$CONFIG_PATH" "$LOCK_STATUS"; then
die "Scrape lock is held; another scrape may be running. Inspect: $LOCK_STATUS --config $CONFIG_PATH"
fi
}
@ -139,7 +137,7 @@ main() {
"$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
ensure_scrape_lock_available
require_scrape_lock_free
if (( salvage_only == 1 )); then
run_local_salvage "${passthrough[@]}"

View file

@ -11,6 +11,8 @@ VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
DOCUMENTS_SCRAPE="$REPO_ROOT/scripts/run-documents-scrape.sh"
AUDIT_JSON="$REPO_ROOT/scripts/audit-archive-json.sh"
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
# shellcheck source=lib/scrape-lock.sh
source "$SCRIPT_DIR/lib/scrape-lock.sh"
DRY_RUN=0
SKIP_SCRAPE=0
@ -91,12 +93,8 @@ audit_targets() {
(( failures == 0 ))
}
ensure_scrape_lock_available() {
if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then
return 0
fi
[[ -x "$LOCK_STATUS" ]] || return 0
if ! "$LOCK_STATUS" --config "$CONFIG_PATH"; then
require_scrape_lock_free() {
if ! ensure_scrape_lock_available "$CONFIG_PATH" "$LOCK_STATUS"; then
die "Scrape lock is held; another scrape may be running. Inspect: $LOCK_STATUS --config $CONFIG_PATH"
fi
}
@ -292,7 +290,7 @@ main() {
if (( SKIP_SCRAPE )); then
log_step "Skip scrape requested."
else
ensure_scrape_lock_available || failures=$((failures + 1))
require_scrape_lock_free || failures=$((failures + 1))
if (( failures == 0 )); then
if (( PER_TARGET )) && [[ -z "$TARGET" ]]; then
scrape_per_target || failures=$((failures + 1))

View file

@ -5,6 +5,8 @@ set -Eeuo pipefail
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
# shellcheck source=lib/scrape-lock.sh
source "$SCRIPT_DIR/lib/scrape-lock.sh"
usage() {
cat <<EOF
@ -28,86 +30,6 @@ die() {
exit 2
}
resolve_scrape_lock_file() {
local config_path=$1
if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
return 0
fi
local archive_root=""
if [[ -f "$config_path" ]]; then
archive_root=$(jq -r '.archive_root // empty' "$config_path" 2>/dev/null) || true
fi
if [[ -n "$archive_root" && "$archive_root" != null ]]; then
printf '%s/.dce-scrape.lock\n' "$archive_root"
else
printf '%s/.dce-scrape.lock\n' "$REPO_ROOT"
fi
}
read_meta_field() {
local meta_file=$1 field=$2
grep -E "^${field}=" "$meta_file" 2>/dev/null | head -1 | cut -d= -f2- || true
}
format_holder_line() {
local meta_file=$1
local pid="" started="" cmd="" holder_state=""
[[ -f "$meta_file" ]] || return 0
pid=$(read_meta_field "$meta_file" pid)
started=$(read_meta_field "$meta_file" started)
cmd=$(read_meta_field "$meta_file" cmd)
[[ -n "$pid" ]] || return 0
if kill -0 "$pid" 2>/dev/null; then
holder_state="running"
else
holder_state="not running"
fi
printf 'holder: pid %s (%s, started %s)\n' "$pid" "$holder_state" "${started:-unknown}"
[[ -n "$cmd" ]] && printf 'cmd: %s\n' "$cmd"
}
lock_is_held() {
local lock_file=$1
command -v flock >/dev/null 2>&1 || return 1
exec {lock_probe_fd}>>"$lock_file"
if flock -n "$lock_probe_fd"; then
flock -u "$lock_probe_fd" 2>/dev/null || true
exec {lock_probe_fd}>&-
return 1
fi
exec {lock_probe_fd}>&-
return 0
}
reclaim_stale_lock() {
local lock_file=$1 meta_file=$2
if lock_is_held "$lock_file"; then
die "Cannot reclaim: scrape lock is actively held."
fi
if [[ -f "$meta_file" ]]; then
local pid
pid=$(read_meta_field "$meta_file" pid)
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
die "Cannot reclaim: holder pid $pid is still running."
fi
rm -f "$meta_file"
printf 'removed stale lock meta: %s\n' "$meta_file"
fi
if [[ -e "$lock_file" ]] && ! lock_is_held "$lock_file"; then
rm -f "$lock_file"
printf 'removed unheld lock file: %s\n' "$lock_file"
fi
}
main() {
local reclaim=0
while (($#)); do
@ -134,9 +56,9 @@ main() {
command -v jq >/dev/null 2>&1 || die "Required command 'jq' is missing."
[[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"
local lock_file meta_file
lock_file=$(resolve_scrape_lock_file "$CONFIG_PATH")
meta_file="${lock_file}.meta"
local lock_file meta_file reclaim_status
lock_file=$(resolve_scrape_lock_file "$CONFIG_PATH" "$REPO_ROOT")
meta_file=$(scrape_lock_meta_path "$lock_file")
printf 'Scrape lock status\n'
printf '==================\n'
@ -150,24 +72,24 @@ main() {
if ! command -v flock >/dev/null 2>&1; then
printf 'state: unknown (flock unavailable; lock file exists)\n'
format_holder_line "$meta_file"
scrape_lock_format_holder_lines "$meta_file"
exit 0
fi
if lock_is_held "$lock_file"; then
if scrape_lock_is_held "$lock_file"; then
printf 'state: held (active scrape)\n'
format_holder_line "$meta_file"
scrape_lock_format_holder_lines "$meta_file"
exit 1
fi
if [[ -f "$meta_file" ]]; then
local pid
pid=$(read_meta_field "$meta_file" pid)
pid=$(read_scrape_lock_meta_field "$meta_file" pid)
if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
printf 'state: stale (reclaimable; holder pid %s is not running)\n' "$pid"
format_holder_line "$meta_file"
scrape_lock_format_holder_lines "$meta_file"
if (( reclaim )); then
reclaim_stale_lock "$lock_file" "$meta_file"
scrape_lock_reclaim_stale_files "$lock_file" "$meta_file" || die "Cannot reclaim stale scrape lock."
printf 'state: free (stale lock reclaimed)\n'
fi
exit 0
@ -175,8 +97,14 @@ main() {
fi
if (( reclaim )); then
if [[ -e "$lock_file" ]] && ! lock_is_held "$lock_file"; then
reclaim_stale_lock "$lock_file" "$meta_file"
if [[ -e "$lock_file" ]] && ! scrape_lock_is_held "$lock_file"; then
reclaim_status=0
scrape_lock_reclaim_stale_files "$lock_file" "$meta_file" || reclaim_status=$?
if (( reclaim_status == 2 )); then
die "Cannot reclaim: scrape lock is actively held."
elif (( reclaim_status == 3 )); then
die "Cannot reclaim: lock holder pid is still running."
fi
printf 'state: free (orphan lock reclaimed)\n'
exit 0
fi
@ -185,7 +113,7 @@ main() {
fi
printf 'state: free (lock file present but not held)\n'
format_holder_line "$meta_file"
scrape_lock_format_holder_lines "$meta_file"
exit 0
}

View file

@ -14,6 +14,7 @@ FAKE_REPO="$TMP_DIR/fake-repo"
mkdir -p "$FAKE_REPO/scripts/lib"
cp "$REPO_ROOT/scripts/run-discord-scrape-host.sh" "$FAKE_REPO/scripts/"
cp "$REPO_ROOT/scripts/lib/scrape-run-plan.sh" "$FAKE_REPO/scripts/lib/"
cp "$REPO_ROOT/scripts/lib/scrape-lock.sh" "$FAKE_REPO/scripts/lib/"
chmod +x "$FAKE_REPO/scripts/run-discord-scrape-host.sh"
COMPOSE_FILE="$TMP_DIR/docker-compose.yml"