From b71c6975303dc1c8a7a5227e425fa8935643956a Mon Sep 17 00:00:00 2001 From: Copilot Date: Wed, 3 Jun 2026 11:27:12 -0500 Subject: [PATCH] feat(scrape): cron uses documents scrape with --log-file Monthly cron now runs the unified documents workflow with teed logs and paired JSON summaries instead of host scrape shell redirect. --- .docs/Recurring-Scrape-Setup.md | 3 + ...eat-cron-documents-scrape-log-file-plan.md | 57 +++++++++++++++++++ docs/recurring-scrape-merge-readiness.md | 2 + scripts/run-documents-scrape.sh | 6 ++ scripts/setup-cron.sh | 12 ++-- scripts/tests/setup-cron-smoke.sh | 7 ++- 6 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 docs/plans/2026-06-04-079-feat-cron-documents-scrape-log-file-plan.md diff --git a/.docs/Recurring-Scrape-Setup.md b/.docs/Recurring-Scrape-Setup.md index 596e9b62..3737ee87 100644 --- a/.docs/Recurring-Scrape-Setup.md +++ b/.docs/Recurring-Scrape-Setup.md @@ -297,6 +297,9 @@ Check logs from your last run: # Primary log file (default from setup-cron.sh) tail -f logs/discord-scrape.log +# Machine-readable totals beside the cron log +./scripts/print-scrape-summary.sh logs/discord-scrape.summary.json + # Recent cron execution (system log) sudo grep discord-scrape /var/log/syslog # Debian/Ubuntu sudo grep discord-scrape /var/log/cron # CentOS/RHEL diff --git a/docs/plans/2026-06-04-079-feat-cron-documents-scrape-log-file-plan.md b/docs/plans/2026-06-04-079-feat-cron-documents-scrape-log-file-plan.md new file mode 100644 index 00000000..c3ea0fb9 --- /dev/null +++ b/docs/plans/2026-06-04-079-feat-cron-documents-scrape-log-file-plan.md @@ -0,0 +1,57 @@ +--- +title: "feat: Cron uses documents scrape with --log-file" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 078 deferred wire --log-file into setup-cron crontab line +--- + +# feat: Cron uses documents scrape with --log-file + +## Summary + +Change `setup-cron.sh` to install `run-documents-scrape.sh --log-file PATH` instead of 
`run-discord-scrape-host.sh scrape >> log`. Cron jobs thereby gain archive verification, the disk preflight, the lock gate, teed logs, and paired JSON summaries. + +## Problem Frame + +Plan 078 added the `--log-file` tee to the documents scrape, but the monthly cron job still invokes the bare host wrapper with a shell `>>` redirect, bypassing the unified workflow and JSON summary pairing. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | Cron job line runs `run-documents-scrape.sh --config HOST_CONFIG --log-file LOG_FILE` | +| R2 | `--target`, `--channel`, `--guild` are forwarded to documents scrape | +| R3 | Cron sets `DCE_ENV_FILE`, `DCE_COMPOSE_FILE`, `DCE_COMPOSE_TTY=0` (no shell `>>` redirect) | +| R4 | `run-documents-scrape.sh` accepts `--guild` passthrough like `--channel` | +| R5 | `setup-cron-smoke.sh` asserts documents scrape + `--log-file` in crontab | +| R6 | Docs note cron log + `.summary.json` | +| R7 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 23/23 | + +## Implementation Units + +### U1. setup-cron.sh + +**Files:** `scripts/setup-cron.sh`, `scripts/tests/setup-cron-smoke.sh` + +### U2. run-documents-scrape.sh + +**Files:** `scripts/run-documents-scrape.sh` (`--guild` passthrough) + +### U3. 
Docs + +**Files:** `docs/recurring-scrape-merge-readiness.md`, `.docs/Recurring-Scrape-Setup.md` + +## Verification + +```bash +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Refresh PR #1538 body with plans 070–079 stamps +- `--salvage-before-scrape` on cron (operator opt-in only) diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index 9ff5f5aa..318d0161 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -186,6 +186,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 078 (2026-06-04):** `run-documents-scrape.sh` `--log-file` with auto tee on live scrapes; summary pairs with log basename. +**Plan 079 (2026-06-04):** `setup-cron.sh` installs `run-documents-scrape.sh --log-file` (unified workflow + JSON summary) instead of bare host scrape redirect. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. 
## CI note (fork PRs) diff --git a/scripts/run-documents-scrape.sh b/scripts/run-documents-scrape.sh index 841f4dc8..a611d231 100755 --- a/scripts/run-documents-scrape.sh +++ b/scripts/run-documents-scrape.sh @@ -35,6 +35,7 @@ Options: --salvage-before-scrape Run salvage-only pass before preflight and incremental scrape --target NAME Limit preflight/scrape to one configured target --channel ID With exactly one --target, limit scrape to channel ID (repeatable) + --guild ID With exactly one --target, limit scrape to guild ID (repeatable) --config PATH Scrape target config (default: config/scrape-targets.json) --log-file PATH Append full workflow output to this file (default on live scrape: logs/documents-scrape-UTC.log) --summary-file PATH Machine-readable scrape summary JSON (default: .summary.json on live scrape) @@ -172,6 +173,11 @@ main() { passthrough+=(--channel "$2") shift 2 ;; + --guild) + [[ $# -ge 2 ]] || die "Missing value for --guild." + passthrough+=(--guild "$2") + shift 2 + ;; --config) [[ $# -ge 2 ]] || die "Missing value for --config." CONFIG_PATH=$2 diff --git a/scripts/setup-cron.sh b/scripts/setup-cron.sh index 62d79fe6..ca81f580 100755 --- a/scripts/setup-cron.sh +++ b/scripts/setup-cron.sh @@ -7,6 +7,7 @@ REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." 
&& pwd -P)}" COMPOSE_FILE="${DCE_COMPOSE_FILE:-$REPO_ROOT/docker-compose.yml}" ENV_FILE="${DCE_ENV_FILE:-$REPO_ROOT/scrape.env}" HOST_RUNNER="${DCE_HOST_RUNNER:-$REPO_ROOT/scripts/run-discord-scrape-host.sh}" +DOCUMENTS_SCRAPE="${DCE_DOCUMENTS_SCRAPE:-$REPO_ROOT/scripts/run-documents-scrape.sh}" CONFIG_FILE="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}" LOG_FILE="${DCE_LOG_FILE:-$REPO_ROOT/logs/discord-scrape.log}" JOB_NAME="discord-scrape" @@ -292,6 +293,7 @@ main() { [[ -f "$COMPOSE_FILE" ]] || die "Missing compose file: $COMPOSE_FILE" [[ -x "$HOST_RUNNER" ]] || die "Missing or non-executable host runner: $HOST_RUNNER" + [[ -x "$DOCUMENTS_SCRAPE" ]] || die "Missing or non-executable documents scrape: $DOCUMENTS_SCRAPE" [[ -f "$CONFIG_FILE" ]] || die "Missing config file: $CONFIG_FILE" "$JQ_BIN" empty "$CONFIG_FILE" >/dev/null 2>&1 || die "Invalid JSON config: $CONFIG_FILE" @@ -334,11 +336,9 @@ main() { fi scrape_args=( - "$HOST_RUNNER" - --env-file "$ENV_FILE" - --compose-file "$COMPOSE_FILE" - scrape - --config "$(container_config_path "$CONFIG_FILE")" + "$DOCUMENTS_SCRAPE" + --config "$CONFIG_FILE" + --log-file "$LOG_FILE" ) append_target_args scrape_args scrape_command=$(printf '%q ' "${scrape_args[@]}") @@ -348,7 +348,7 @@ main() { lock_prefix="" fi - job_line="$cron_line cd $(printf '%q' "$REPO_ROOT") && DCE_COMPOSE_TTY=0 ${lock_prefix}${scrape_command}>> $(printf '%q' "$LOG_FILE") 2>&1" + job_line="$cron_line cd $(printf '%q' "$REPO_ROOT") && DCE_COMPOSE_TTY=0 DCE_ENV_FILE=$(printf '%q' "$ENV_FILE") DCE_COMPOSE_FILE=$(printf '%q' "$COMPOSE_FILE") ${lock_prefix}${scrape_command}" local cron_block cron_block=$(printf '%s\n%s\n%s\n' "$begin_marker" "$job_line" "$end_marker") diff --git a/scripts/tests/setup-cron-smoke.sh b/scripts/tests/setup-cron-smoke.sh index a600178b..f0fa828d 100755 --- a/scripts/tests/setup-cron-smoke.sh +++ b/scripts/tests/setup-cron-smoke.sh @@ -85,8 +85,11 @@ run_setup grep -q '^MAILTO=test@example.com$' "$CRONTAB_FILE" 
|| { echo "expected unrelated crontab line to remain" >&2; exit 1; } [[ "$(grep -c '^# BEGIN discord-scrape$' "$CRONTAB_FILE")" == "1" ]] || { echo "expected exactly one managed cron block after install" >&2; exit 1; } grep -q 'compose --env-file' "$DOCKER_LOG" || { echo "expected docker preflight to run during install" >&2; exit 1; } -grep -q 'scripts/run-discord-scrape-host.sh' "$CRONTAB_FILE" || { echo "expected cron job to run host wrapper" >&2; exit 1; } -grep -q 'DCE_COMPOSE_TTY=0' "$CRONTAB_FILE" || { echo "expected cron job to disable compose TTY for log append" >&2; exit 1; } +grep -q 'scripts/run-documents-scrape.sh' "$CRONTAB_FILE" || { echo "expected cron job to run documents scrape" >&2; exit 1; } +grep -q -- '--log-file' "$CRONTAB_FILE" || { echo "expected cron job to pass --log-file" >&2; exit 1; } +grep -q 'DCE_ENV_FILE=' "$CRONTAB_FILE" || { echo "expected cron job to set DCE_ENV_FILE" >&2; exit 1; } +grep -q 'DCE_COMPOSE_TTY=0' "$CRONTAB_FILE" || { echo "expected cron job to disable compose TTY" >&2; exit 1; } +grep -q '>>' "$CRONTAB_FILE" && { echo "expected cron job to rely on documents scrape tee, not shell redirect" >&2; exit 1; } run_setup [[ "$(grep -c '^# BEGIN discord-scrape$' "$CRONTAB_FILE")" == "1" ]] || { echo "expected exactly one managed cron block after reinstall" >&2; exit 1; }