diff --git a/docs/plans/2026-06-04-058-feat-documents-scrape-lock-salvage-before-plan.md b/docs/plans/2026-06-04-058-feat-documents-scrape-lock-salvage-before-plan.md
new file mode 100644
index 00000000..cb77093e
--- /dev/null
+++ b/docs/plans/2026-06-04-058-feat-documents-scrape-lock-salvage-before-plan.md
@@ -0,0 +1,50 @@
+---
+title: "feat: Documents scrape lock gate and salvage-before"
+type: feat
+status: active
+date: 2026-06-04
+origin: /lfg — validation/handoff expose salvage; direct run-documents-scrape.sh still lacks lock gate and --salvage-before-scrape
+---
+
+# feat: Documents scrape lock gate and salvage-before
+
+## Summary
+
+Add a scrape lock preflight and `--salvage-before-scrape` to `run-documents-scrape.sh` so direct document scrapes match the operator-validation safety checks and the KotOR catch-up workflow.
+
+## Problem Frame
+
+Operators often invoke documents scrape directly:
+
+```bash
+./scripts/run-documents-scrape.sh --target KotOR_discord_msgs --channel 221726893064454144
+```
+
+This bypasses the `run-operator-validation.sh` lock gate. Running salvage before a scrape currently requires two separate commands.
+
+## Requirements
+
+| ID | Requirement |
+|----|-------------|
+| R1 | `run-documents-scrape.sh` checks archive-root lock before salvage or Discord scrape |
+| R2 | Lock gate skipped when `DCE_SKIP_SCRAPE_LOCK=1` |
+| R3 | `--salvage-before-scrape` runs salvage then preflight/scrape |
+| R4 | `--salvage-only`, `--salvage-before-scrape`, and `--dry-run` are mutually exclusive |
+| R5 | Smokes cover lock block and salvage-before; `run-all-smokes.sh` passes |
+
+## Implementation Units
+
+### U1. Documents scrape lock + salvage-before
+
+**Files:** `scripts/run-documents-scrape.sh`
+
+### U2. 
Smoke coverage
+
+**Files:** `scripts/tests/documents-scrape-smoke.sh`
+
+## Scope Boundaries
+
+### Deferred
+
+- Operator checklist doc refresh
+- Live KotOR catch-up on host
diff --git a/scripts/run-documents-scrape.sh b/scripts/run-documents-scrape.sh
index 512a45cd..cab01d69 100755
--- a/scripts/run-documents-scrape.sh
+++ b/scripts/run-documents-scrape.sh
@@ -11,13 +11,14 @@ DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh"
 VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
 VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
 SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
+LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
 
 # shellcheck source=lib/scrape-run-plan.sh
 source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
 
 usage() {
   cat </
@@ -28,6 +29,7 @@ End-to-end Documents scrape workflow:
 Options:
   --dry-run Verify archives only; do not call Discord
   --salvage-only Merge quiescent stale .dce-temp exports only (no Discord export)
+  --salvage-before-scrape Run salvage-only pass before preflight and incremental scrape
   --target NAME Limit preflight/scrape to one configured target
   --channel ID With exactly one --target, limit scrape to channel ID (repeatable)
   --config PATH Scrape target config (default: config/scrape-targets.json)
@@ -39,9 +41,37 @@ die() {
   exit 1
 }
 
+# Preflight gate: abort via die when the lock status helper exits non-zero.
+# DCE_SKIP_SCRAPE_LOCK=1 bypasses the gate; a missing helper only warns.
+ensure_scrape_lock_available() {
+  [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]] && return 0
+  [[ -x "$LOCK_STATUS" ]] || { echo "Warning: scrape lock helper not executable ($LOCK_STATUS); lock check skipped." >&2; return 0; }
+  if ! "$LOCK_STATUS" --config "$CONFIG_PATH"; then
+    die "Scrape lock is held; another scrape may be running. Inspect: $LOCK_STATUS --config $CONFIG_PATH"
+  fi
+}
+
+# Run "$HOST_RUNNER salvage" with --config "$CONFIG_PATH" followed by the
+# given arguments, dropping any "--config VALUE" pair found among them.
+run_local_salvage() {
+  local -a salvage_args=(--config "$CONFIG_PATH")
+  local skip_next=0 arg
+  for arg in "$@"; do
+    if (( skip_next )); then
+      skip_next=0
+    elif [[ "$arg" == "--config" ]]; then
+      skip_next=1
+    else
+      salvage_args+=("$arg")
+    fi
+  done
+  "$HOST_RUNNER" salvage "${salvage_args[@]}"
+}
+
 main() {
   local dry_run=0
   local salvage_only=0
+  local salvage_before=0
   local target=""
   local -a passthrough=()
 
@@ -55,6 +85,10 @@ main() {
         salvage_only=1
         shift
         ;;
+      --salvage-before-scrape)
+        salvage_before=1
+        shift
+        ;;
       --target)
         [[ $# -ge 2 ]] || die "Missing value for --target."
         target=$2
@@ -82,6 +116,14 @@ main() {
     esac
   done
 
+  local exclusive=0
+  (( dry_run == 1 )) && exclusive=$((exclusive + 1))
+  (( salvage_only == 1 )) && exclusive=$((exclusive + 1))
+  (( salvage_before == 1 )) && exclusive=$((exclusive + 1))
+  if (( exclusive > 1 )); then
+    die "Use only one of --dry-run, --salvage-only, or --salvage-before-scrape."
+  fi
+
   "$VERIFY_SCRIPT" --config "$CONFIG_PATH"
 
   local -a plan_targets=()
@@ -97,24 +139,17 @@ main() {
 
   "$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
 
+  ensure_scrape_lock_available
+
   if (( salvage_only == 1 )); then
-    local -a salvage_args=(--config "$CONFIG_PATH")
-    local skip_next=0 arg
-    for arg in "${passthrough[@]}"; do
-      if (( skip_next )); then
-        skip_next=0
-        continue
-      fi
-      if [[ "$arg" == "--config" ]]; then
-        skip_next=1
-        continue
-      fi
-      salvage_args+=("$arg")
-    done
-    "$HOST_RUNNER" salvage "${salvage_args[@]}"
+    run_local_salvage "${passthrough[@]}"
     exit 0
   fi
 
+  if (( salvage_before == 1 )); then
+    run_local_salvage "${passthrough[@]}"
+  fi
+
   local -a container_args=("${passthrough[@]}")
   local has_config=0 idx=0
 
diff --git a/scripts/tests/documents-scrape-smoke.sh b/scripts/tests/documents-scrape-smoke.sh
index c7070266..419d7d83 100755
--- a/scripts/tests/documents-scrape-smoke.sh
+++ b/scripts/tests/documents-scrape-smoke.sh
@@ -128,6 +128,78 @@ grep -q 'salvage completed' "$SALVAGE_DOC_LOG" || {
   exit 1
 }
 
+SALVAGE_BEFORE_LOG="$TMP_DIR/salvage-before.log"
+: >"$ARGS_LOG"
+DCE_MIN_FREE_MB=0 \
+  DCE_SKIP_SCRAPE_LOCK=1 \
+  DCE_DOCKER_BIN="$FAKE_DOCKER" \
+  FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \
+  DCE_ENV_FILE="$TMP_DIR/scrape.env" \
+  "$REPO_ROOT/scripts/run-documents-scrape.sh" \
+  --salvage-before-scrape --config "$TMP_DIR/config.json" --target demo >"$SALVAGE_BEFORE_LOG" 2>&1 || {
+  echo "salvage-before-scrape documents scrape failed" >&2
+  cat "$SALVAGE_BEFORE_LOG" >&2
+  exit 1
+}
+grep -q 'salvage completed' "$SALVAGE_BEFORE_LOG" || {
+  echo "expected --salvage-before-scrape to run local salvage first" >&2
+  cat "$SALVAGE_BEFORE_LOG" >&2
+  exit 1
+}
+grep -q 'compose' "$ARGS_LOG" || {
+  echo "expected --salvage-before-scrape to continue into container scrape" >&2
+  cat "$ARGS_LOG" >&2
+  exit 1
+}
+
+# Lock gate: hold the lock file in a background process and expect the
+# scrape entry point to refuse to run. Skipped when flock is unavailable.
+if command -v flock >/dev/null 2>&1; then
+  LOCK_FILE="$TMP_DIR/.dce-scrape.lock"
+  LOCK_READY="$TMP_DIR/lock-holder.ready"
+  HOLDER_PID=""
+  (
+    exec {lock_fd}>>"$LOCK_FILE"
+    flock -n "$lock_fd" || exit 1
+    : >"$LOCK_READY"
+    # exec so HOLDER_PID is the sleep itself: a forked sleep child could
+    # survive the kill below and keep the inherited lock fd open.
+    exec sleep 120
+  ) &
+  HOLDER_PID=$!
+  # Wait (up to ~5s) for the holder to report the lock instead of guessing.
+  for _ in {1..50}; do
+    [[ -e "$LOCK_READY" ]] && break
+    kill -0 "$HOLDER_PID" 2>/dev/null || break
+    sleep 0.1
+  done
+  if [[ ! -e "$LOCK_READY" ]]; then
+    echo "lock holder did not acquire $LOCK_FILE" >&2
+    kill "$HOLDER_PID" 2>/dev/null || true
+    exit 1
+  fi
+
+  set +e
+  blocked_output=$(
+    DCE_MIN_FREE_MB=0 \
+      "$REPO_ROOT/scripts/run-documents-scrape.sh" \
+      --salvage-only --config "$TMP_DIR/config.json" --target demo 2>&1
+  )
+  blocked_status=$?
+  set -e
+
+  kill "$HOLDER_PID" 2>/dev/null || true
+  wait "$HOLDER_PID" 2>/dev/null || true
+
+  if [[ "$blocked_status" -eq 0 ]] || ! grep -q 'Scrape lock is held' <<<"$blocked_output"; then
+    echo "expected documents scrape to fail when archive lock held" >&2
+    printf '%s\n' "$blocked_output" >&2
+    exit 1
+  fi
+else
+  echo "flock not found; skipping scrape lock smoke" >&2
+fi
+
 DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \
   "$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \
   | grep -q 'disk-only: ok'