diff --git a/docs/plans/2026-06-04-078-feat-documents-scrape-log-file-plan.md b/docs/plans/2026-06-04-078-feat-documents-scrape-log-file-plan.md new file mode 100644 index 00000000..61d1a647 --- /dev/null +++ b/docs/plans/2026-06-04-078-feat-documents-scrape-log-file-plan.md @@ -0,0 +1,54 @@ +--- +title: "feat: Documents scrape --log-file with tee" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 077 deferred teeing full documents-scrape stdout to a persistent log +--- + +# feat: Documents scrape --log-file with tee + +## Summary + +Add `--log-file PATH` to `run-documents-scrape.sh`. Live scrapes auto-tee to `logs/documents-scrape-UTC.log` (UTC = run timestamp) and pair the JSON summary with `logs/documents-scrape-UTC.summary.json` (parity with operator validation). + +## Problem Frame + +Validation and proof persist teed logs with recoverable JSON summaries. The primary cron/operator entry `run-documents-scrape.sh` only prints to stdout; long KotOR catch-up runs leave no durable log unless the operator wraps the command manually. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `--log-file PATH` appends all workflow output via `tee -a` | +| R2 | Live scrape (not dry-run/salvage-only) auto-defaults log to `logs/documents-scrape-UTC.log` when unset | +| R3 | Live scrape pairs summary with `${LOG_FILE%.log}.summary.json` unless `--summary-file` or `DCE_RUN_SUMMARY_FILE` set | +| R4 | Prints `Log file:` before scrape; `Log:` after tee completes | +| R5 | Recovers missing summary from teed log via `recover_json_summary_if_missing` | +| R6 | Dry-run and salvage-only skip the auto log and summary; an explicit `--log-file` still tees their output (no summary export) | +| R7 | `documents-scrape-smoke.sh` asserts teed log file on live `--log-file` run | +| R8 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 23/23 | + +## Implementation Units + +### U1. run-documents-scrape.sh + +**Files:** `scripts/run-documents-scrape.sh`, `scripts/tests/documents-scrape-smoke.sh` + +### U2. 
Docs + +**Files:** `docs/recurring-scrape-merge-readiness.md`, `docs/recurring-scrape-operator-checklist.md`, `scrape.env.example` + +## Verification + +```bash +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up on host +- Refresh PR #1538 body with plans 070–078 stamps +- Wire `--log-file` into setup-cron crontab line diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index be3f3ac9..9ff5f5aa 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -184,6 +184,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 077 (2026-06-04):** Setup doc + merge-readiness smoke inventory synced to 23 offline tests (includes `print-scrape-summary-smoke`, `scrape-summary-json-smoke`). +**Plan 078 (2026-06-04):** `run-documents-scrape.sh` `--log-file` with auto tee on live scrapes; summary pairs with log basename. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. 
## CI note (fork PRs) diff --git a/docs/recurring-scrape-operator-checklist.md b/docs/recurring-scrape-operator-checklist.md index 61f941ef..3a0cf691 100644 --- a/docs/recurring-scrape-operator-checklist.md +++ b/docs/recurring-scrape-operator-checklist.md @@ -53,11 +53,10 @@ Salvage partial exports under `output_dir/.dce-temp/` without calling Discord: Salvage then incremental scrape: ```bash -./scripts/run-documents-scrape.sh --salvage-before-scrape --target NAME [--channel ID] +./scripts/run-documents-scrape.sh --salvage-before-scrape --target NAME [--channel ID] [--log-file logs/scrape.log] ./scripts/run-operator-validation.sh --salvage-before-scrape --target NAME [--channel ID] --log-file logs/scrape.log ./scripts/run-operator-proof.sh --salvage-before-scrape --sync-gui --target NAME -# When scraping one target, also writes logs/operator-proof-.summary.json beside the proof log -# All enabled targets: each gets logs/operator-proof--.summary.json +# Live documents scrape auto-tees to logs/documents-scrape-UTC.log (or --log-file); summary at the same path with .log replaced by .summary.json ``` **KotOR yes_general** (`221726893064454144`): first catch-up after a 2021 archive cursor can take hours and may OOM; salvage preserved partials before retrying. Stop duplicate validation processes (MyBook vs Downloads checkouts share the same lock). `KotOR_discord_msgs` sets `container_memory: "8g"` in `scrape-targets.json` for single-target runs; override globally with `DCE_CONTAINER_MEMORY` in `scrape.env` if needed. Channel-scoped proof: diff --git a/scrape.env.example b/scrape.env.example index a7128253..5fc89f4f 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -29,6 +29,7 @@ DCE_USERNS_MODE= # Optional: machine-readable scrape summary (run-discord-scrape.sh). # run-documents-scrape.sh, run-operator-validation.sh, and run-operator-proof.sh # auto-enable summary export on live scrapes unless these are already set. 
+# Live documents scrape also auto-tees to logs/documents-scrape-.log (override with --log-file). # Host paths under logs/ map to /logs/ in the container (see docker-compose.yml). # DCE_RUN_SUMMARY_JSON=1 # DCE_RUN_SUMMARY_FILE=logs/scrape-summary.json diff --git a/scripts/run-documents-scrape.sh b/scripts/run-documents-scrape.sh index 72236213..841f4dc8 100755 --- a/scripts/run-documents-scrape.sh +++ b/scripts/run-documents-scrape.sh @@ -36,7 +36,8 @@ Options: --target NAME Limit preflight/scrape to one configured target --channel ID With exactly one --target, limit scrape to channel ID (repeatable) --config PATH Scrape target config (default: config/scrape-targets.json) - --summary-file PATH Machine-readable scrape summary JSON (default: logs/documents-scrape-UTC.summary.json) + --log-file PATH Append full workflow output to this file (default on live scrape: logs/documents-scrape-UTC.log) + --summary-file PATH Machine-readable scrape summary JSON (default: .summary.json on live scrape) EOF } @@ -68,12 +69,82 @@ run_local_salvage() { "$HOST_RUNNER" salvage "${salvage_args[@]}" } +run_documents_scrape_workflow() { + local dry_run=$1 + local salvage_only=$2 + local salvage_before=$3 + local target=$4 + local log_file=$5 + local -a passthrough=("${@:6}") + + "$VERIFY_SCRIPT" --config "$CONFIG_PATH" + + local -a plan_targets=() + if [[ -n "$target" ]]; then + plan_targets=("$target") + fi + print_scrape_config_plan "$CONFIG_PATH" "Documents scrape" "${plan_targets[@]}" + + if (( dry_run == 1 )); then + printf 'Dry run complete: archive paths verified. 
Export DISCORD_TOKEN or create a token file, then rerun without --dry-run.\n' + return 0 + fi + + "$VERIFY_READY" --disk-only --config "$CONFIG_PATH" + + require_scrape_lock_free + + if (( salvage_only == 1 )); then + run_local_salvage "${passthrough[@]}" + return 0 + fi + + if (( salvage_before == 1 )); then + run_local_salvage "${passthrough[@]}" + fi + + local -a container_args=("${passthrough[@]}") + local has_config=0 idx=0 + + while (( idx < ${#container_args[@]} )); do + if [[ "${container_args[idx]}" == "--config" ]]; then + has_config=1 + case "${container_args[idx + 1]:-}" in + "$CONFIG_PATH"|config/scrape-targets.json|./config/scrape-targets.json) + container_args[idx + 1]="$CONTAINER_CONFIG" + ;; + esac + break + fi + idx=$((idx + 1)) + done + + if (( has_config == 0 )); then + container_args=(--config "$CONTAINER_CONFIG" "${container_args[@]}") + fi + + if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then + "$SETUP_AUTH" 2>/dev/null || true + elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then + "$SETUP_AUTH" 2>/dev/null || true + fi + + if [[ -n "$log_file" ]]; then + printf 'Log file: %s\n' "$log_file" + fi + printf 'JSON summary file: %s\n' "${DCE_RUN_SUMMARY_FILE:-}" + + "$HOST_RUNNER" preflight "${container_args[@]}" + "$HOST_RUNNER" scrape "${container_args[@]}" +} + main() { local dry_run=0 local salvage_only=0 local salvage_before=0 local target="" local summary_file="" + local log_file="" local -a passthrough=() while (($#)); do @@ -107,6 +178,11 @@ main() { passthrough+=(--config "$2") shift 2 ;; + --log-file) + [[ $# -ge 2 ]] || die "Missing value for --log-file." + log_file=$2 + shift 2 + ;; --summary-file) [[ $# -ge 2 ]] || die "Missing value for --summary-file." summary_file=$2 @@ -130,71 +206,49 @@ main() { die "Use only one of --dry-run, --salvage-only, or --salvage-before-scrape." 
fi - "$VERIFY_SCRIPT" --config "$CONFIG_PATH" - - local -a plan_targets=() - if [[ -n "$target" ]]; then - plan_targets=("$target") - fi - print_scrape_config_plan "$CONFIG_PATH" "Documents scrape" "${plan_targets[@]}" - - if (( dry_run == 1 )); then - printf 'Dry run complete: archive paths verified. Export DISCORD_TOKEN or create a token file, then rerun without --dry-run.\n' - exit 0 - fi - - "$VERIFY_READY" --disk-only --config "$CONFIG_PATH" - - require_scrape_lock_free - - if (( salvage_only == 1 )); then - run_local_salvage "${passthrough[@]}" - exit 0 - fi - - if (( salvage_before == 1 )); then - run_local_salvage "${passthrough[@]}" - fi - - local -a container_args=("${passthrough[@]}") - local has_config=0 idx=0 - - while (( idx < ${#container_args[@]} )); do - if [[ "${container_args[idx]}" == "--config" ]]; then - has_config=1 - case "${container_args[idx + 1]:-}" in - "$CONFIG_PATH"|config/scrape-targets.json|./config/scrape-targets.json) - container_args[idx + 1]="$CONTAINER_CONFIG" - ;; - esac - break + local export_json_summary=0 + if (( dry_run == 0 && salvage_only == 0 )); then + export_json_summary=1 + mkdir -p "$LOG_DIR" + if [[ -z "$log_file" ]]; then + log_file="$LOG_DIR/documents-scrape-$(date -u +%Y%m%dT%H%M%SZ).log" fi - idx=$((idx + 1)) - done - - if (( has_config == 0 )); then - container_args=(--config "$CONTAINER_CONFIG" "${container_args[@]}") - fi - - if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then - "$SETUP_AUTH" 2>/dev/null || true - elif [[ -x "$DISCOVER_TOKEN" ]] && "$DISCOVER_TOKEN" >/dev/null 2>&1; then - "$SETUP_AUTH" 2>/dev/null || true - fi - - export DCE_RUN_SUMMARY_JSON=1 - if [[ -z "${DCE_RUN_SUMMARY_FILE:-}" ]]; then - if [[ -n "$summary_file" ]]; then - export DCE_RUN_SUMMARY_FILE="$summary_file" - else - mkdir -p "$LOG_DIR" - export DCE_RUN_SUMMARY_FILE="$LOG_DIR/documents-scrape-$(date -u +%Y%m%dT%H%M%SZ).summary.json" + export DCE_RUN_SUMMARY_JSON=1 + if [[ -z "${DCE_RUN_SUMMARY_FILE:-}" ]]; 
then + if [[ -n "$summary_file" ]]; then + export DCE_RUN_SUMMARY_FILE="$summary_file" + else + export DCE_RUN_SUMMARY_FILE="${log_file%.log}.summary.json" + fi fi fi - printf 'JSON summary file: %s\n' "$DCE_RUN_SUMMARY_FILE" - "$HOST_RUNNER" preflight "${container_args[@]}" - "$HOST_RUNNER" scrape "${container_args[@]}" + local pipeline_status=0 + if [[ -n "$log_file" ]]; then + mkdir -p "$(dirname "$log_file")" + set -o pipefail + { + run_documents_scrape_workflow "$dry_run" "$salvage_only" "$salvage_before" "$target" "$log_file" "${passthrough[@]}" + } 2>&1 | tee -a "$log_file" + pipeline_status=${PIPESTATUS[0]} + else + run_documents_scrape_workflow "$dry_run" "$salvage_only" "$salvage_before" "$target" "" "${passthrough[@]}" + pipeline_status=$? + fi + + if (( export_json_summary )) && [[ -n "${DCE_RUN_SUMMARY_FILE:-}" && -n "$log_file" ]]; then + # shellcheck source=lib/scrape-summary-json.sh + source "$SCRIPT_DIR/lib/scrape-summary-json.sh" + if recover_json_summary_if_missing "$log_file" "$DCE_RUN_SUMMARY_FILE"; then + printf 'JSON summary recovered from log: %s\n' "$DCE_RUN_SUMMARY_FILE" + fi + fi + + if [[ -n "$log_file" ]]; then + printf 'Log: %s\n' "$log_file" + fi + + exit "$pipeline_status" } main "$@" diff --git a/scripts/tests/documents-scrape-smoke.sh b/scripts/tests/documents-scrape-smoke.sh index 3b2626fd..26ed035f 100755 --- a/scripts/tests/documents-scrape-smoke.sh +++ b/scripts/tests/documents-scrape-smoke.sh @@ -122,12 +122,52 @@ grep -q 'JSON summary file:' "$LIVE_DOC_OUT" || { cat "$LIVE_DOC_OUT" >&2 exit 1 } +grep -q 'Log:' "$LIVE_DOC_OUT" || { + echo "expected live documents scrape to print Log: path" >&2 + exit 1 +} +shopt -s nullglob +auto_logs=("$TMP_DIR/logs"/documents-scrape-*.log) +((${#auto_logs[@]} > 0)) || { + echo "expected auto teed log under DCE_LOG_DIR" >&2 + exit 1 +} +grep -q 'JSON summary file:' "${auto_logs[0]}" || { + echo "expected JSON summary line in teed log file" >&2 + exit 1 +} +shopt -u nullglob grep -q 
'111111111111111111' "$ARGS_LOG" || { echo "expected --channel to reach container compose invocation" >&2 cat "$ARGS_LOG" >&2 exit 1 } +EXPLICIT_LOG="$TMP_DIR/logs/live-documents.log" +EXPLICIT_SUMMARY="$TMP_DIR/logs/live-documents.summary.json" +: >"$ARGS_LOG" +DCE_MIN_FREE_MB=0 \ + DCE_SKIP_SCRAPE_LOCK=1 \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \ + DCE_ENV_FILE="$TMP_DIR/scrape.env" \ + "$REPO_ROOT/scripts/run-documents-scrape.sh" \ + --config "$TMP_DIR/config.json" --target demo --log-file "$EXPLICIT_LOG" >"$TMP_DIR/explicit-live.out" 2>&1 + +[[ -s "$EXPLICIT_LOG" ]] || { + echo "expected --log-file to create teed log" >&2 + exit 1 +} +grep -q 'Log file: '"$EXPLICIT_LOG" "$EXPLICIT_LOG" || { + echo "expected Log file: marker in teed log" >&2 + exit 1 +} +grep -q 'JSON summary file: '"$EXPLICIT_SUMMARY" "$EXPLICIT_LOG" || { + echo "expected summary path paired with --log-file basename" >&2 + cat "$EXPLICIT_LOG" >&2 + exit 1 +} + cp "$REPO_ROOT/scripts/run-discord-scrape.sh" "$FAKE_REPO/scripts/" chmod +x "$FAKE_REPO/scripts/run-discord-scrape.sh" SALVAGE_DOC_LOG="$TMP_DIR/salvage-documents.log"