mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
feat(scrape): add scrape-lock-status diagnostic for archive-root lock
Introduce read-only scrape-lock-status.sh and surface lock state during operator-handoff so shared Documents archives show active or stale holders before starting another scrape.
This commit is contained in:
parent
22915770e6
commit
682094c348
50
docs/plans/2026-06-04-056-feat-scrape-lock-status-plan.md
Normal file
50
docs/plans/2026-06-04-056-feat-scrape-lock-status-plan.md
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
---
|
||||||
|
title: "feat: Scrape lock status diagnostic"
|
||||||
|
type: feat
|
||||||
|
status: complete
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — plan 053 moved lock to archive_root; operators need read-only visibility before starting validation or killing stale runs
|
||||||
|
---
|
||||||
|
|
||||||
|
# feat: Scrape lock status diagnostic
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Add `scripts/scrape-lock-status.sh` to report archive-root scrape lock state (path, holder pid/cmd/started, live vs stale) and call it from `operator-handoff.sh` so handoff surfaces blocking scrapes.
|
||||||
|
|
||||||
|
## Problem Frame
|
||||||
|
|
||||||
|
Two checkouts can share `~/Documents` archives. A long validation holds `{archive_root}/.dce-scrape.lock` but operators only discover it when a second scrape fails. They need a read-only check before starting work.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | `scrape-lock-status.sh --config PATH` prints lock file path and state |
|
||||||
|
| R2 | Resolves lock via `DCE_SCRAPE_LOCK_FILE` or `{archive_root}/.dce-scrape.lock` (same rules as host runner) |
|
||||||
|
| R3 | Reads `.meta` sidecar when present (pid, started, cmd) |
|
||||||
|
| R4 | Exit 0 when safe to scrape (free or stale reclaimable); exit 1 when actively held; exit 2 on configuration or usage error |
|
||||||
|
| R5 | `operator-handoff.sh` prints lock status section after verify-operator-ready |
|
||||||
|
| R6 | Offline smoke covers held, free, and archive-root path; `run-all-smokes.sh` passes |
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. scrape-lock-status.sh
|
||||||
|
|
||||||
|
**Files:** `scripts/scrape-lock-status.sh`
|
||||||
|
|
||||||
|
### U2. Operator handoff integration
|
||||||
|
|
||||||
|
**Files:** `scripts/operator-handoff.sh`, `scripts/tests/operator-handoff-smoke.sh`
|
||||||
|
|
||||||
|
### U3. Lock status smoke
|
||||||
|
|
||||||
|
**Files:** `scripts/tests/scrape-lock-status-smoke.sh`
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### Deferred
|
||||||
|
|
||||||
|
- Refactoring the host runner's lock resolution into a shared lib (until then, the status script duplicates the minimal resolve logic)
|
||||||
|
- Live KotOR catch-up on host
|
||||||
|
- operator-handoff `--salvage-only`
|
||||||
|
|
@ -9,6 +9,7 @@ source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
|
||||||
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
|
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
|
||||||
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
||||||
DOCUMENTS_SCRAPE="$REPO_ROOT/scripts/run-documents-scrape.sh"
|
DOCUMENTS_SCRAPE="$REPO_ROOT/scripts/run-documents-scrape.sh"
|
||||||
|
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
|
||||||
SKIP_DF=0
|
SKIP_DF=0
|
||||||
TARGET=""
|
TARGET=""
|
||||||
CHANNEL_ARGS=()
|
CHANNEL_ARGS=()
|
||||||
|
|
@ -101,6 +102,18 @@ main() {
|
||||||
fi
|
fi
|
||||||
|
|
||||||
"$VERIFY_READY" --config "$CONFIG_PATH"
|
"$VERIFY_READY" --config "$CONFIG_PATH"
|
||||||
|
|
||||||
|
if [[ -x "$LOCK_STATUS" ]]; then
|
||||||
|
printf '\n'
|
||||||
|
set +e
|
||||||
|
"$LOCK_STATUS" --config "$CONFIG_PATH"
|
||||||
|
lock_status=$?
|
||||||
|
set -e
|
||||||
|
if (( lock_status == 1 )); then
|
||||||
|
printf '\nWARN: scrape lock is held; wait for the active scrape or confirm it is stale before starting another run.\n'
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
local -a dry_run_args=(--dry-run --config "$CONFIG_PATH")
|
local -a dry_run_args=(--dry-run --config "$CONFIG_PATH")
|
||||||
[[ -n "$TARGET" ]] && dry_run_args+=(--target "$TARGET")
|
[[ -n "$TARGET" ]] && dry_run_args+=(--target "$TARGET")
|
||||||
dry_run_args+=("${CHANNEL_ARGS[@]}")
|
dry_run_args+=("${CHANNEL_ARGS[@]}")
|
||||||
|
|
|
||||||
148
scripts/scrape-lock-status.sh
Executable file
148
scripts/scrape-lock-status.sh
Executable file
|
|
@ -0,0 +1,148 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -Eeuo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
|
||||||
|
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
|
||||||
|
CONFIG_PATH="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
|
||||||
|
|
||||||
|
# Print the help text (synopsis, purpose, and exit-code table) on stdout.
usage() {
  local script_name
  script_name=$(basename "$0")

  cat <<EOF
Usage:
${script_name} [--config PATH]

Report scrape serialization lock state for the configured archive root.
Uses the same lock path rules as run-discord-scrape-host.sh.

Exit codes:
0 Safe to scrape (no lock, unheld lock file, or stale reclaimable holder)
1 Another scrape is actively holding the lock
2 Configuration or usage error
EOF
}
|
||||||
|
|
||||||
|
# Report a fatal error on stderr (prefixed with "ERROR: ") and terminate the
# script with status 2, the code usage() reserves for config/usage errors.
die() {
  local message="$*"
  printf 'ERROR: %s\n' "$message" >&2
  exit 2
}
|
||||||
|
|
||||||
|
# Print the path of the scrape lock file on stdout.
#
# Precedence:
#   1. $DCE_SCRAPE_LOCK_FILE, when set and non-empty.
#   2. "<archive_root>/.dce-scrape.lock", where archive_root is read from the
#      JSON config with jq.
#   3. "$REPO_ROOT/.dce-scrape.lock" when the config is missing, jq fails, or
#      archive_root is absent/empty/"null".
#
# Globals:   DCE_SCRAPE_LOCK_FILE (read), REPO_ROOT (read)
# Arguments: $1 - path to the scrape-targets JSON config
resolve_scrape_lock_file() {
  local cfg=$1
  local root=""

  if [[ -n "${DCE_SCRAPE_LOCK_FILE:-}" ]]; then
    printf '%s\n' "$DCE_SCRAPE_LOCK_FILE"
    return 0
  fi

  if [[ -f "$cfg" ]]; then
    # jq errors (e.g. malformed JSON) are deliberately ignored so the lookup
    # falls through to the REPO_ROOT default instead of aborting.
    root=$(jq -r '.archive_root // empty' "$cfg" 2>/dev/null) || true
  fi

  case "$root" in
    '' | null) root=$REPO_ROOT ;;
  esac

  printf '%s/.dce-scrape.lock\n' "$root"
}
|
||||||
|
|
||||||
|
# Print the value of the first "<field>=<value>" line in a lock sidecar file.
#
# The field name is matched literally as a line prefix (it is never treated
# as a regular expression), and everything after the first "=" is the value,
# so values may themselves contain "=". Prints nothing when the file is
# missing/unreadable or the field is absent.
#
# Arguments: $1 - path to the .meta sidecar
#            $2 - field name (e.g. pid, started, cmd)
# Returns:   always 0, so callers running under `set -e` never abort here.
read_meta_field() {
  local meta_file=$1 field=$2
  local line

  [[ -f "$meta_file" && -r "$meta_file" ]] || return 0

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == "${field}="* ]]; then
      printf '%s\n' "${line#*=}"
      return 0
    fi
  done <"$meta_file"

  return 0
}
|
||||||
|
|
||||||
|
# Print a human-readable description of the lock holder recorded in the
# .meta sidecar: a "holder:" line (pid, running / not running, start time)
# and, when the sidecar records one, a "cmd:" line.
#
# Prints nothing when the sidecar is missing or records no pid.
#
# Arguments: $1 - path to the .meta sidecar
# Outputs:   zero, one, or two lines on stdout
# Returns:   always 0. This matters: callers invoke this as a bare command
#            under `set -e`, so a non-zero status (previously produced when
#            the sidecar had no cmd field) would abort the script with exit
#            status 1, which callers interpret as "lock is actively held".
format_holder_line() {
  local meta_file=$1
  local pid="" started="" cmd="" holder_state="not running"

  [[ -f "$meta_file" ]] || return 0

  pid=$(read_meta_field "$meta_file" pid)
  started=$(read_meta_field "$meta_file" started)
  cmd=$(read_meta_field "$meta_file" cmd)
  [[ -n "$pid" ]] || return 0

  # Signal 0 only probes for existence. NOTE(review): kill -0 also fails for
  # a live process the caller may not signal (another user's scrape), which
  # is then shown as "not running" — confirm whether archives are shared
  # across users.
  if kill -0 "$pid" 2>/dev/null; then
    holder_state="running"
  fi

  printf 'holder: pid %s (%s, started %s)\n' "$pid" "$holder_state" "${started:-unknown}"
  if [[ -n "$cmd" ]]; then
    printf 'cmd: %s\n' "$cmd"
  fi
  return 0
}
|
||||||
|
|
||||||
|
# Probe whether another open file description currently holds the flock on
# the given lock file.
#
# Opens the file in append mode on a fresh descriptor, attempts a
# non-blocking flock, and immediately releases it again if the attempt
# succeeded. The descriptor is closed on every path.
#
# Globals:   lock_probe_fd (written; descriptor number chosen by bash)
# Arguments: $1 - lock file path
# Returns:   0 if the lock is held by someone else,
#            1 if it could be acquired (not held) or flock is not installed.
lock_is_held() {
  local lock_path=$1
  local status=0

  if ! command -v flock >/dev/null 2>&1; then
    return 1
  fi

  exec {lock_probe_fd}>>"$lock_path"
  if flock -n "$lock_probe_fd"; then
    # We got the lock, so nobody else had it; hand it straight back.
    flock -u "$lock_probe_fd" 2>/dev/null || true
    status=1
  fi
  exec {lock_probe_fd}>&-

  return "$status"
}
|
||||||
|
|
||||||
|
# Entry point: parse options, resolve the lock path, print a status report,
# and exit with the documented code.
#
# Exit status:
#   0 - no lock file, lock file not held, stale holder, or flock unavailable
#       (state reported as unknown)
#   1 - lock is actively held
#   2 - usage or configuration error (via die)
#
# Globals:   CONFIG_PATH (read; overwritten by --config)
# Arguments: command-line options ([--config PATH] | --help | -h)
main() {
  local opt
  while (($# > 0)); do
    opt=$1
    case "$opt" in
      -h | --help)
        usage
        exit 0
        ;;
      --config)
        if (($# < 2)); then
          die "Missing value for --config."
        fi
        CONFIG_PATH=$2
        shift 2
        ;;
      *)
        die "Unknown option: $opt"
        ;;
    esac
  done

  if ! command -v jq >/dev/null 2>&1; then
    die "Required command 'jq' is missing."
  fi
  if [[ ! -f "$CONFIG_PATH" ]]; then
    die "Missing config: $CONFIG_PATH"
  fi

  local lock_path sidecar
  lock_path=$(resolve_scrape_lock_file "$CONFIG_PATH")
  sidecar="${lock_path}.meta"

  printf 'Scrape lock status\n'
  printf '==================\n'
  printf 'config: %s\n' "$CONFIG_PATH"
  printf 'lock: %s\n' "$lock_path"

  # No lock file at all: nothing can be holding it.
  if [[ ! -e "$lock_path" ]]; then
    printf 'state: free (no lock file)\n'
    exit 0
  fi

  # Without flock the lock cannot be probed; report what the sidecar says.
  if ! command -v flock >/dev/null 2>&1; then
    printf 'state: unknown (flock unavailable; lock file exists)\n'
    format_holder_line "$sidecar"
    exit 0
  fi

  if lock_is_held "$lock_path"; then
    printf 'state: held (active scrape)\n'
    format_holder_line "$sidecar"
    exit 1
  fi

  # Lock file exists but is not held: stale if the sidecar names a pid that
  # is no longer running, otherwise simply free.
  local holder_pid=""
  if [[ -f "$sidecar" ]]; then
    holder_pid=$(read_meta_field "$sidecar" pid)
  fi

  if [[ -n "$holder_pid" ]] && ! kill -0 "$holder_pid" 2>/dev/null; then
    printf 'state: stale (reclaimable; holder pid %s is not running)\n' "$holder_pid"
  else
    printf 'state: free (lock file present but not held)\n'
  fi
  format_holder_line "$sidecar"
  exit 0
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
|
|
@ -65,5 +65,9 @@ if [[ "$channel_status" -ne 0 ]] || ! grep -q 'Handoff complete' <<<"$channel_ou
|
||||||
printf '%s\n' "$channel_output" >&2
|
printf '%s\n' "$channel_output" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
if ! grep -q 'Scrape lock status' <<<"$handoff_output"; then
|
||||||
|
printf 'operator-handoff missing scrape lock status section\n' >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
printf 'operator-handoff-smoke: ok\n'
|
printf 'operator-handoff-smoke: ok\n'
|
||||||
|
|
|
||||||
94
scripts/tests/scrape-lock-status-smoke.sh
Executable file
94
scripts/tests/scrape-lock-status-smoke.sh
Executable file
|
|
@ -0,0 +1,94 @@
|
||||||
|
#!/usr/bin/env bash
#
# Offline smoke test for scripts/scrape-lock-status.sh.
#
# Exercises three lock states against a throwaway archive root:
#   free  - no lock file exists                         (expects exit 0)
#   held  - a background process owns the flock         (expects exit 1)
#   stale - lock file plus a .meta sidecar naming a pid
#           that is not running                         (expects exit 0)
# Skips (exit 0) when flock is not installed.

set -Eeuo pipefail

REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)
STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-lock-status-smoke.XXXXXX")
ARCHIVE_ROOT="$TMP_DIR/archive"
CONFIG_PATH="$TMP_DIR/config.json"
LOCK_FILE="$ARCHIVE_ROOT/.dce-scrape.lock"
HOLDER_PID=""

# The status script honours DCE_SCRAPE_LOCK_FILE ahead of the config; drop any
# inherited value so the test always inspects the temp archive's lock.
unset DCE_SCRAPE_LOCK_FILE

# Stop the background lock holder (if still alive) and remove the temp tree.
cleanup() {
  if [[ -n "$HOLDER_PID" ]] && kill -0 "$HOLDER_PID" 2>/dev/null; then
    kill "$HOLDER_PID" 2>/dev/null || true
    wait "$HOLDER_PID" 2>/dev/null || true
  fi
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT

command -v flock >/dev/null 2>&1 || {
  echo "SKIP: flock not available"
  exit 0
}

mkdir -p "$ARCHIVE_ROOT"
cat >"$CONFIG_PATH" <<JSON
{
  "archive_root": "$ARCHIVE_ROOT",
  "targets": []
}
JSON

chmod +x "$STATUS"

# --- free: no lock file yet -------------------------------------------------
free_status=0
free_output=$("$STATUS" --config "$CONFIG_PATH" 2>&1) || free_status=$?

if [[ "$free_status" -ne 0 ]] || ! grep -q 'state: free (no lock file)' <<<"$free_output"; then
  echo "expected free lock status" >&2
  printf '%s\n' "$free_output" >&2
  exit 1
fi
if ! grep -Fq "$LOCK_FILE" <<<"$free_output"; then
  echo "expected archive-root lock path in output" >&2
  exit 1
fi

# --- held: a background process owns the flock -------------------------------
(
  exec {lock_fd}>>"$LOCK_FILE"
  flock -n "$lock_fd" || exit 1
  # $BASHPID is this subshell's own pid ($$ would be the parent script's).
  printf 'pid=%s\nstarted=2020-01-01T00:00:00Z\ncmd=lock-status-smoke-holder\n' "$BASHPID" >"${LOCK_FILE}.meta"
  # exec so that $HOLDER_PID *is* the sleep process. Otherwise killing the
  # subshell can leave an orphaned sleep that still has the lock descriptor
  # open, and the lock would stay held for the stale check below.
  exec sleep 120
) &
HOLDER_PID=$!

# The sidecar is written only after the lock is acquired, so wait (bounded,
# about 5s) for it to appear instead of relying on a fixed delay.
for _ in {1..50}; do
  if [[ -s "${LOCK_FILE}.meta" ]]; then
    break
  fi
  sleep 0.1
done

held_status=0
held_output=$("$STATUS" --config "$CONFIG_PATH" 2>&1) || held_status=$?

if [[ "$held_status" -ne 1 ]] || ! grep -q 'state: held (active scrape)' <<<"$held_output"; then
  echo "expected held lock status exit 1" >&2
  printf '%s\n' "$held_output" >&2
  exit 1
fi
if ! grep -q 'lock-status-smoke-holder' <<<"$held_output"; then
  echo "expected holder cmd in status output" >&2
  exit 1
fi

kill "$HOLDER_PID" 2>/dev/null || true
wait "$HOLDER_PID" 2>/dev/null || true
HOLDER_PID=""

# --- stale: lock file present, sidecar names a pid that is not running -------
printf 'pid=99999999\nstarted=2020-01-01T00:00:00Z\ncmd=dead-smoke-holder\n' >"${LOCK_FILE}.meta"
touch "$LOCK_FILE"

stale_status=0
stale_output=$("$STATUS" --config "$CONFIG_PATH" 2>&1) || stale_status=$?

if [[ "$stale_status" -ne 0 ]] || ! grep -q 'state: stale (reclaimable' <<<"$stale_output"; then
  echo "expected stale reclaimable status after holder exit" >&2
  printf '%s\n' "$stale_output" >&2
  exit 1
fi

printf 'scrape-lock-status-smoke: ok\n'
|
||||||
Loading…
Reference in a new issue