diff --git a/docs/plans/2026-05-29-026-feat-documents-scrape-disk-preflight-plan.md b/docs/plans/2026-05-29-026-feat-documents-scrape-disk-preflight-plan.md new file mode 100644 index 00000000..ba4550bc --- /dev/null +++ b/docs/plans/2026-05-29-026-feat-documents-scrape-disk-preflight-plan.md @@ -0,0 +1,27 @@ +--- +title: "feat: Disk preflight on documents scrape entrypoint" +type: feat +status: complete +date: 2026-05-29 +origin: /lfg — plan 025 added disk checks to verify-operator-ready but run-documents-scrape bypassed them +--- + +# feat: Disk preflight on documents scrape entrypoint + +## Summary + +Operators often run `./scripts/run-documents-scrape.sh` directly (and monthly cron uses the host runner). Call the same archive disk check before any live Discord scrape so full disks fail fast with a clear message. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `verify-operator-ready.sh --disk-only` runs config parse + `require_archive_disk_space` only | +| R2 | `run-documents-scrape.sh` invokes disk check before preflight/scrape (not on `--dry-run`) | +| R3 | `documents-scrape-smoke.sh` covers `--disk-only` success path with `DCE_MIN_FREE_MB=0` | +| R4 | `run-all-smokes.sh` still passes | + +## Verification + +- `./scripts/tests/documents-scrape-smoke.sh` +- `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` diff --git a/scripts/run-documents-scrape.sh b/scripts/run-documents-scrape.sh index 30340dde..f10ab4af 100755 --- a/scripts/run-documents-scrape.sh +++ b/scripts/run-documents-scrape.sh @@ -9,6 +9,7 @@ CONTAINER_CONFIG="${DCE_CONTAINER_CONFIG:-/config/scrape-targets.json}" HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh" DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh" VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh" +VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh" SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh" usage() { @@ -74,6 +75,8 @@ main() { exit 0 fi + "$VERIFY_READY" 
--disk-only --config "$CONFIG_PATH" + if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then "$SETUP_AUTH" 2>/dev/null || true elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then diff --git a/scripts/tests/documents-scrape-smoke.sh b/scripts/tests/documents-scrape-smoke.sh index 15109a3a..8f8fd340 100755 --- a/scripts/tests/documents-scrape-smoke.sh +++ b/scripts/tests/documents-scrape-smoke.sh @@ -75,4 +75,8 @@ DCE_REPO_ROOT="$REPO_ROOT" \ "$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" >/dev/null +DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \ + "$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \ + | grep -q 'disk-only: ok' + echo "documents-scrape-smoke: ok" diff --git a/scripts/verify-operator-ready.sh b/scripts/verify-operator-ready.sh index 2833293f..1e0c95f1 100755 --- a/scripts/verify-operator-ready.sh +++ b/scripts/verify-operator-ready.sh @@ -10,14 +10,16 @@ HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh" VERIFY_ARCHIVES="$REPO_ROOT/scripts/verify-documents-archives.sh" DISCOVER="$REPO_ROOT/scripts/discover-discord-token.sh" PREFLIGHT_TARGET="" +DISK_ONLY=0 usage() { cat </dev/null 2>&1 || die "Invalid JSON config: $CONFIG_PATH" + if (( DISK_ONLY == 1 )); then + require_archive_disk_space + printf 'disk-only: ok (config %s)\n' "$CONFIG_PATH" + exit 0 + fi + printf 'Operator readiness checks\n' printf '=========================\n' require_archive_disk_space