feat(scrape): disk-only verify flag and documents scrape preflight

run-documents-scrape.sh checks archive free space before live Discord
calls; verify-operator-ready.sh adds --disk-only for lightweight checks.
This commit is contained in:
Boden 2026-05-29 15:54:08 -05:00
parent 1142e376b5
commit 32b7f47d45
4 changed files with 47 additions and 1 deletions

View file

@ -0,0 +1,27 @@
---
title: "feat: Disk preflight on documents scrape entrypoint"
type: feat
status: complete
date: 2026-05-29
origin: /lfg — plan 025 added disk checks to verify-operator-ready but run-documents-scrape bypassed them
---
# feat: Disk preflight on documents scrape entrypoint
## Summary
Operators often run `./scripts/run-documents-scrape.sh` directly (and the monthly cron job uses the host runner), which bypasses the disk checks in `verify-operator-ready.sh`. Run the same archive disk check from this entrypoint before any live Discord scrape, so that a full disk fails fast with a clear message.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | `verify-operator-ready.sh --disk-only` runs only the config parse and `require_archive_disk_space` |
| R2 | `run-documents-scrape.sh` invokes the disk check before preflight/scrape (skipped on `--dry-run`) |
| R3 | `documents-scrape-smoke.sh` covers `--disk-only` success path with `DCE_MIN_FREE_MB=0` |
| R4 | `run-all-smokes.sh` still passes |
## Verification
- `./scripts/tests/documents-scrape-smoke.sh`
- `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh`

View file

@ -9,6 +9,7 @@ CONTAINER_CONFIG="${DCE_CONTAINER_CONFIG:-/config/scrape-targets.json}"
HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh" HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh"
DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh" DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh"
VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh" VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh" SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
usage() { usage() {
@ -74,6 +75,8 @@ main() {
exit 0 exit 0
fi fi
"$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then
"$SETUP_AUTH" 2>/dev/null || true "$SETUP_AUTH" 2>/dev/null || true
elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then

View file

@ -75,4 +75,8 @@ DCE_REPO_ROOT="$REPO_ROOT" \
"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" >/dev/null "$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" >/dev/null
DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \
"$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \
| grep -q 'disk-only: ok'
echo "documents-scrape-smoke: ok" echo "documents-scrape-smoke: ok"

View file

@ -10,14 +10,16 @@ HOST_RUNNER="$REPO_ROOT/scripts/run-discord-scrape-host.sh"
VERIFY_ARCHIVES="$REPO_ROOT/scripts/verify-documents-archives.sh" VERIFY_ARCHIVES="$REPO_ROOT/scripts/verify-documents-archives.sh"
DISCOVER="$REPO_ROOT/scripts/discover-discord-token.sh" DISCOVER="$REPO_ROOT/scripts/discover-discord-token.sh"
PREFLIGHT_TARGET="" PREFLIGHT_TARGET=""
DISK_ONLY=0
usage() { usage() {
cat <<EOF cat <<EOF
Usage: Usage:
$(basename "$0") [--config PATH] [--preflight TARGET] $(basename "$0") [--config PATH] [--disk-only] [--preflight TARGET]
Check host prerequisites for recurring scrape: Check host prerequisites for recurring scrape:
jq, container compose, Discord auth, valid config, seeded archives. jq, container compose, Discord auth, valid config, seeded archives.
With --disk-only, only validate config JSON and archive-root free space (DCE_MIN_FREE_MB).
With --preflight TARGET, also run Discord preflight for one target. With --preflight TARGET, also run Discord preflight for one target.
EOF EOF
} }
@ -103,6 +105,10 @@ main() {
CONFIG_PATH=$2 CONFIG_PATH=$2
shift 2 shift 2
;; ;;
--disk-only)
DISK_ONLY=1
shift
;;
--preflight) --preflight)
[[ $# -ge 2 ]] || die "Missing value for --preflight." [[ $# -ge 2 ]] || die "Missing value for --preflight."
PREFLIGHT_TARGET=$2 PREFLIGHT_TARGET=$2
@ -122,6 +128,12 @@ main() {
[[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH" [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"
jq empty "$CONFIG_PATH" >/dev/null 2>&1 || die "Invalid JSON config: $CONFIG_PATH" jq empty "$CONFIG_PATH" >/dev/null 2>&1 || die "Invalid JSON config: $CONFIG_PATH"
if (( DISK_ONLY == 1 )); then
require_archive_disk_space
printf 'disk-only: ok (config %s)\n' "$CONFIG_PATH"
exit 0
fi
printf 'Operator readiness checks\n' printf 'Operator readiness checks\n'
printf '=========================\n' printf '=========================\n'
require_archive_disk_space require_archive_disk_space