From b9bb4bbe643d4ee8d55be6c87438fb077abda918 Mon Sep 17 00:00:00 2001 From: Copilot Date: Wed, 3 Jun 2026 06:03:47 -0500 Subject: [PATCH] fix(host): flock scrape lock prevents concurrent container exports Overlapping run-operator-validation invocations spawned twin yes_general exports and repeated OOM skips. Host scrape now holds .dce-scrape.lock; smokes bypass via DCE_SKIP_SCRAPE_LOCK. Added lock smoke (20/20 pass). --- ...2026-06-04-046-fix-scrape-run-lock-plan.md | 37 ++++++++ docs/recurring-scrape-merge-readiness.md | 2 + scripts/run-discord-scrape-host.sh | 37 +++++++- .../run-discord-scrape-host-lock-smoke.sh | 87 +++++++++++++++++++ .../tests/run-discord-scrape-host-smoke.sh | 2 + 5 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 docs/plans/2026-06-04-046-fix-scrape-run-lock-plan.md create mode 100755 scripts/tests/run-discord-scrape-host-lock-smoke.sh diff --git a/docs/plans/2026-06-04-046-fix-scrape-run-lock-plan.md b/docs/plans/2026-06-04-046-fix-scrape-run-lock-plan.md new file mode 100644 index 00000000..e3e1bf0c --- /dev/null +++ b/docs/plans/2026-06-04-046-fix-scrape-run-lock-plan.md @@ -0,0 +1,37 @@ +--- +title: "fix: Scrape run lock prevents concurrent container exports" +type: fix +status: complete +date: 2026-06-04 +origin: /lfg — duplicate KotOR validation runs left two yes_general exports OOM-looping +--- + +# fix: Scrape run lock prevents concurrent container exports + +## Problem + +Two overlapping `run-operator-validation.sh --target KotOR_discord_msgs` processes each started a full container scrape. Both exported `yes_general` (`221726893064454144`) with the same `--after` cursor, creating twin `.dce-temp/export.*` dirs (~29–34 MiB each) and repeated OOM skips. + +Cron uses `flock`, but manual/host validation does not — overlapping runs are unguarded. 
+ +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `run-discord-scrape-host.sh scrape` acquires non-blocking `flock` on `$REPO_ROOT/.dce-scrape.lock` | +| R2 | `DCE_SKIP_SCRAPE_LOCK=1` bypasses lock (smoke tests) | +| R3 | Clear error when lock held; preflight unaffected | +| R4 | Offline smoke asserts second scrape fails while lock held | +| R5 | `run-all-smokes.sh` passes (20/20); docs note concurrent-run hazard | + +## Verification + +```bash +./scripts/tests/run-discord-scrape-host-lock-smoke.sh +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Out of scope + +- Completing yes_general multi-hour catch-up inside LFG +- Container memory limits / tuning diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index e70a15c5..0751c810 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -115,6 +115,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh --sync-gui --per-target - \* Audit failed before plan 045 because truncated partial exports under `.dce-temp/` were scanned as archives. After fix, audit passes while partial temps exist. +**Plan 046 (2026-06-04):** `run-discord-scrape-host.sh scrape` holds non-blocking `flock` on `.dce-scrape.lock` so overlapping manual/cron validation cannot spawn twin yes_general exports. Stop duplicate runs before restarting KotOR validation. + **Plan 045 (2026-06-04):** `audit-archive-json.sh` and `verify-documents-archives.sh` skip `*/.dce-temp/*` (in-progress partial exports). Salvage run 2026-06-03: 7 merged, 17 unchanged, 3 skipped (+5404 messages); yes_general OOM-skipped with partial temps preserved for next salvage. **Plan 044 (2026-06-04):** Offline smoke asserts partial temp preserved on OOM skip (channel 134). Host wrapper prefers `DISCORD_TOKEN_FILE` over inherited shell tokens. `run-all-smokes.sh` → 19/19 pass. 
diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 8af05a79..1dbb2389 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -14,6 +14,8 @@ DOCKER_BIN_OVERRIDDEN=0 REAUTH_COMMAND="" COMPOSE_ENV_FILE="" COMPOSE_ENV_TEMP="" +SCRAPE_LOCK_FILE="" +SCRAPE_LOCK_FD="" VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh" if [[ -n "${DCE_DOCKER_BIN:-}" ]]; then @@ -56,6 +58,33 @@ cleanup_compose_env() { fi } +acquire_scrape_lock() { + if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then + return 0 + fi + command -v flock >/dev/null 2>&1 || return 0 + + SCRAPE_LOCK_FILE="${DCE_SCRAPE_LOCK_FILE:-$REPO_ROOT/.dce-scrape.lock}" + exec {SCRAPE_LOCK_FD}>>"$SCRAPE_LOCK_FILE" + if ! flock -n "$SCRAPE_LOCK_FD"; then + die "Another scrape is already running (lock: $SCRAPE_LOCK_FILE). Wait for it to finish or confirm no scrape is active before removing the lock." + fi +} + +release_scrape_lock() { + if [[ -z "${SCRAPE_LOCK_FD:-}" ]]; then + return 0 + fi + flock -u "$SCRAPE_LOCK_FD" 2>/dev/null || true + exec {SCRAPE_LOCK_FD}>&- + SCRAPE_LOCK_FD="" +} + +cleanup_on_exit() { + release_scrape_lock + cleanup_compose_env +} + load_env_file() { [[ -f "$ENV_FILE" ]] || die "Missing env file: $ENV_FILE" local raw_line line key value @@ -402,7 +431,7 @@ main() { local -a passthrough_args=() local subcommand="" - trap cleanup_compose_env EXIT + trap cleanup_on_exit EXIT while (($#)); do case "$1" in @@ -470,7 +499,11 @@ main() { print_scrape_config_plan "$host_config" "Host $subcommand" "${host_targets[@]}" case "$subcommand" in - preflight|scrape) + preflight) + run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" + ;; + scrape) + acquire_scrape_lock run_subcommand_with_retry "$subcommand" "${passthrough_args[@]}" ;; esac diff --git a/scripts/tests/run-discord-scrape-host-lock-smoke.sh b/scripts/tests/run-discord-scrape-host-lock-smoke.sh new file mode 100755 index 00000000..4025d459 --- /dev/null +++ 
b/scripts/tests/run-discord-scrape-host-lock-smoke.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P) +TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-host-lock-smoke.XXXXXX") +ENV_FILE="$TMP_DIR/scrape.env" +COMPOSE_FILE="$TMP_DIR/docker-compose.yml" +FAKE_DOCKER="$TMP_DIR/docker" +LOCK_FILE="$TMP_DIR/scrape.lock" +HOLDER_PID="" + +cleanup() { + if [[ -n "$HOLDER_PID" ]] && kill -0 "$HOLDER_PID" 2>/dev/null; then + kill "$HOLDER_PID" 2>/dev/null || true + wait "$HOLDER_PID" 2>/dev/null || true + fi + rm -rf "$TMP_DIR" +} +trap cleanup EXIT + +command -v flock >/dev/null 2>&1 || { + echo "SKIP: flock not available" + exit 0 +} + +cat >"$COMPOSE_FILE" <<'EOF' +services: + discord-scraper: + image: fake +EOF + +cat >"$FAKE_DOCKER" <<'EOF' +#!/usr/bin/env bash +printf 'run succeeded\n' +EOF +chmod +x "$FAKE_DOCKER" + +cat >"$ENV_FILE" <>"$LOCK_FILE" + flock -n "$lock_fd" || exit 1 + sleep 120 +) & +HOLDER_PID=$! +sleep 0.2 + +set +e +output=$( + DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_SCRAPE_LOCK_FILE="$LOCK_FILE" \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_ENV_FILE="$ENV_FILE" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --target demo 2>&1 +) +status=$? +set -e + +if [[ "$status" -eq 0 ]]; then + echo "expected scrape to fail while lock is held" >&2 + exit 1 +fi +if ! grep -q 'Another scrape is already running' <<<"$output"; then + echo "expected lock-held error message" >&2 + printf '%s\n' "$output" >&2 + exit 1 +fi + +kill "$HOLDER_PID" 2>/dev/null || true +wait "$HOLDER_PID" 2>/dev/null || true +HOLDER_PID="" + +if ! 
DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_SCRAPE_LOCK_FILE="$LOCK_FILE" \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_ENV_FILE="$ENV_FILE" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --target demo >/dev/null; then + echo "expected scrape to succeed after lock released" >&2 + exit 1 +fi + +echo "run-discord-scrape-host-lock-smoke: OK" diff --git a/scripts/tests/run-discord-scrape-host-smoke.sh b/scripts/tests/run-discord-scrape-host-smoke.sh index e4cc3ddf..fb2c12b5 100755 --- a/scripts/tests/run-discord-scrape-host-smoke.sh +++ b/scripts/tests/run-discord-scrape-host-smoke.sh @@ -85,6 +85,7 @@ run_host() { local env_path=${2:-$ENV_FILE} env -u DISCORD_TOKEN \ + DCE_SKIP_SCRAPE_LOCK=1 \ DCE_REPO_ROOT="$REPO_ROOT" \ DCE_DOCKER_BIN="$FAKE_DOCKER" \ DCE_ENV_FILE="$env_path" \ @@ -100,6 +101,7 @@ run_host_with_shell_token() { local missing_env_path=$2 DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_SKIP_SCRAPE_LOCK=1 \ DCE_DOCKER_BIN="$FAKE_DOCKER" \ DCE_ENV_FILE="$missing_env_path" \ DCE_COMPOSE_FILE="$COMPOSE_FILE" \