mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-09 15:52:37 -06:00
feat(scrape): cron uses documents scrape with --log-file
The monthly cron job now runs the unified documents scrape workflow, with teed logs and paired JSON summaries, instead of calling the host scrape wrapper with a shell `>>` redirect.
This commit is contained in:
parent
759e33efe9
commit
b71c697530
|
|
@ -297,6 +297,9 @@ Check logs from your last run:
|
|||
# Primary log file (default from setup-cron.sh)
|
||||
tail -f logs/discord-scrape.log
|
||||
|
||||
# Machine-readable totals beside the cron log
|
||||
./scripts/print-scrape-summary.sh logs/discord-scrape.summary.json
|
||||
|
||||
# Recent cron execution (system log)
|
||||
sudo grep discord-scrape /var/log/syslog # Debian/Ubuntu
|
||||
sudo grep discord-scrape /var/log/cron # CentOS/RHEL
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
title: "feat: Cron uses documents scrape with --log-file"
|
||||
type: feat
|
||||
status: complete
|
||||
date: 2026-06-04
|
||||
origin: /lfg — plan 078 deferred wiring --log-file into the setup-cron crontab line
|
||||
---
|
||||
|
||||
# feat: Cron uses documents scrape with --log-file
|
||||
|
||||
## Summary
|
||||
|
||||
Change `setup-cron.sh` to install `run-documents-scrape.sh --log-file PATH` instead of `run-discord-scrape-host.sh scrape >> log`. Cron jobs get archive verify, disk preflight, lock gate, teed logs, and paired JSON summaries.
|
||||
|
||||
## Problem Frame
|
||||
|
||||
Plan 078 added a `--log-file` tee to documents scrape, but the monthly cron job still invokes the bare host wrapper with a shell `>>` redirect — bypassing the unified workflow and JSON summary pairing.
|
||||
|
||||
## Requirements
|
||||
|
||||
| ID | Requirement |
|
||||
|----|-------------|
|
||||
| R1 | Cron job line runs `run-documents-scrape.sh --config HOST_CONFIG --log-file LOG_FILE` |
|
||||
| R2 | `--target`, `--channel`, `--guild` forwarded to documents scrape |
|
||||
| R3 | Cron sets `DCE_ENV_FILE`, `DCE_COMPOSE_FILE`, `DCE_COMPOSE_TTY=0` (no shell `>>` redirect) |
|
||||
| R4 | `run-documents-scrape.sh` accepts `--guild` passthrough like `--channel` |
|
||||
| R5 | `setup-cron-smoke.sh` asserts documents scrape + `--log-file` in crontab |
|
||||
| R6 | Docs mention the cron log and its paired `<basename>.summary.json` |
|
||||
| R7 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 23/23 |
|
||||
|
||||
## Implementation Units
|
||||
|
||||
### U1. setup-cron.sh
|
||||
|
||||
**Files:** `scripts/setup-cron.sh`, `scripts/tests/setup-cron-smoke.sh`
|
||||
|
||||
### U2. run-documents-scrape.sh
|
||||
|
||||
**Files:** `scripts/run-documents-scrape.sh` (--guild passthrough)
|
||||
|
||||
### U3. Docs
|
||||
|
||||
**Files:** `docs/recurring-scrape-merge-readiness.md`, `.docs/Recurring-Scrape-Setup.md`
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh
|
||||
```
|
||||
|
||||
## Scope Boundaries
|
||||
|
||||
### Deferred
|
||||
|
||||
- Live KotOR catch-up on host
|
||||
- Refresh PR #1538 body with plans 070–079 stamps
|
||||
- `--salvage-before-scrape` on cron (operator opt-in only)
|
||||
|
|
@ -186,6 +186,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \
|
|||
|
||||
**Plan 078 (2026-06-04):** `run-documents-scrape.sh` `--log-file` with auto tee on live scrapes; summary pairs with log basename.
|
||||
|
||||
**Plan 079 (2026-06-04):** `setup-cron.sh` installs `run-documents-scrape.sh --log-file` (unified workflow + JSON summary) instead of the bare host scrape wrapper with a shell `>>` redirect.
|
||||
|
||||
**Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom.
|
||||
|
||||
## CI note (fork PRs)
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ Options:
|
|||
--salvage-before-scrape Run salvage-only pass before preflight and incremental scrape
|
||||
--target NAME Limit preflight/scrape to one configured target
|
||||
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
|
||||
--guild ID With exactly one --target, limit scrape to guild ID (repeatable)
|
||||
--config PATH Scrape target config (default: config/scrape-targets.json)
|
||||
--log-file PATH Append full workflow output to this file (default on live scrape: logs/documents-scrape-UTC.log)
|
||||
--summary-file PATH Machine-readable scrape summary JSON (default: <log-basename>.summary.json on live scrape)
|
||||
|
|
@ -172,6 +173,11 @@ main() {
|
|||
passthrough+=(--channel "$2")
|
||||
shift 2
|
||||
;;
|
||||
--guild)
|
||||
[[ $# -ge 2 ]] || die "Missing value for --guild."
|
||||
passthrough+=(--guild "$2")
|
||||
shift 2
|
||||
;;
|
||||
--config)
|
||||
[[ $# -ge 2 ]] || die "Missing value for --config."
|
||||
CONFIG_PATH=$2
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
|
|||
COMPOSE_FILE="${DCE_COMPOSE_FILE:-$REPO_ROOT/docker-compose.yml}"
|
||||
ENV_FILE="${DCE_ENV_FILE:-$REPO_ROOT/scrape.env}"
|
||||
HOST_RUNNER="${DCE_HOST_RUNNER:-$REPO_ROOT/scripts/run-discord-scrape-host.sh}"
|
||||
DOCUMENTS_SCRAPE="${DCE_DOCUMENTS_SCRAPE:-$REPO_ROOT/scripts/run-documents-scrape.sh}"
|
||||
CONFIG_FILE="${DCE_CONFIG_FILE:-$REPO_ROOT/config/scrape-targets.json}"
|
||||
LOG_FILE="${DCE_LOG_FILE:-$REPO_ROOT/logs/discord-scrape.log}"
|
||||
JOB_NAME="discord-scrape"
|
||||
|
|
@ -292,6 +293,7 @@ main() {
|
|||
|
||||
[[ -f "$COMPOSE_FILE" ]] || die "Missing compose file: $COMPOSE_FILE"
|
||||
[[ -x "$HOST_RUNNER" ]] || die "Missing or non-executable host runner: $HOST_RUNNER"
|
||||
[[ -x "$DOCUMENTS_SCRAPE" ]] || die "Missing or non-executable documents scrape: $DOCUMENTS_SCRAPE"
|
||||
[[ -f "$CONFIG_FILE" ]] || die "Missing config file: $CONFIG_FILE"
|
||||
"$JQ_BIN" empty "$CONFIG_FILE" >/dev/null 2>&1 || die "Invalid JSON config: $CONFIG_FILE"
|
||||
|
||||
|
|
@ -334,11 +336,9 @@ main() {
|
|||
fi
|
||||
|
||||
scrape_args=(
|
||||
"$HOST_RUNNER"
|
||||
--env-file "$ENV_FILE"
|
||||
--compose-file "$COMPOSE_FILE"
|
||||
scrape
|
||||
--config "$(container_config_path "$CONFIG_FILE")"
|
||||
"$DOCUMENTS_SCRAPE"
|
||||
--config "$CONFIG_FILE"
|
||||
--log-file "$LOG_FILE"
|
||||
)
|
||||
append_target_args scrape_args
|
||||
scrape_command=$(printf '%q ' "${scrape_args[@]}")
|
||||
|
|
@ -348,7 +348,7 @@ main() {
|
|||
lock_prefix=""
|
||||
fi
|
||||
|
||||
job_line="$cron_line cd $(printf '%q' "$REPO_ROOT") && DCE_COMPOSE_TTY=0 ${lock_prefix}${scrape_command}>> $(printf '%q' "$LOG_FILE") 2>&1"
|
||||
job_line="$cron_line cd $(printf '%q' "$REPO_ROOT") && DCE_COMPOSE_TTY=0 DCE_ENV_FILE=$(printf '%q' "$ENV_FILE") DCE_COMPOSE_FILE=$(printf '%q' "$COMPOSE_FILE") ${lock_prefix}${scrape_command}"
|
||||
|
||||
local cron_block
|
||||
cron_block=$(printf '%s\n%s\n%s\n' "$begin_marker" "$job_line" "$end_marker")
|
||||
|
|
|
|||
|
|
@ -85,8 +85,11 @@ run_setup
|
|||
grep -q '^MAILTO=test@example.com$' "$CRONTAB_FILE" || { echo "expected unrelated crontab line to remain" >&2; exit 1; }
|
||||
[[ "$(grep -c '^# BEGIN discord-scrape$' "$CRONTAB_FILE")" == "1" ]] || { echo "expected exactly one managed cron block after install" >&2; exit 1; }
|
||||
grep -q 'compose --env-file' "$DOCKER_LOG" || { echo "expected docker preflight to run during install" >&2; exit 1; }
|
||||
grep -q 'scripts/run-discord-scrape-host.sh' "$CRONTAB_FILE" || { echo "expected cron job to run host wrapper" >&2; exit 1; }
|
||||
grep -q 'DCE_COMPOSE_TTY=0' "$CRONTAB_FILE" || { echo "expected cron job to disable compose TTY for log append" >&2; exit 1; }
|
||||
grep -q 'scripts/run-documents-scrape.sh' "$CRONTAB_FILE" || { echo "expected cron job to run documents scrape" >&2; exit 1; }
|
||||
grep -q -- '--log-file' "$CRONTAB_FILE" || { echo "expected cron job to pass --log-file" >&2; exit 1; }
|
||||
grep -q 'DCE_ENV_FILE=' "$CRONTAB_FILE" || { echo "expected cron job to set DCE_ENV_FILE" >&2; exit 1; }
|
||||
grep -q 'DCE_COMPOSE_TTY=0' "$CRONTAB_FILE" || { echo "expected cron job to disable compose TTY" >&2; exit 1; }
|
||||
grep -q '>>' "$CRONTAB_FILE" && { echo "expected cron job to rely on documents scrape tee, not shell redirect" >&2; exit 1; }
|
||||
|
||||
run_setup
|
||||
[[ "$(grep -c '^# BEGIN discord-scrape$' "$CRONTAB_FILE")" == "1" ]] || { echo "expected exactly one managed cron block after reinstall" >&2; exit 1; }
|
||||
|
|
|
|||
Loading…
Reference in a new issue