From 90bd9da143a7300366b42cfbbccb1a2041d1c23f Mon Sep 17 00:00:00 2001 From: Boden Date: Fri, 29 May 2026 13:49:09 -0500 Subject: [PATCH] feat(scrape): harden preflight and cron config for Documents archives Preflight probes skip forbidden channels when seeded archives exist. Cron installer passes container config path and supports --config override. Compose and docs align with append-only ~/Documents scrape workflow. --- .docs/Recurring-Scrape-Setup.md | 41 ++++++--- Readme.md | 1 + STRATEGY.md | 2 +- docker-compose.yml | 7 +- ...t-recurring-scrape-merge-readiness-plan.md | 7 ++ ...-documents-recurring-scrape-verify-plan.md | 67 ++++++++++++++ scripts/run-discord-scrape.sh | 92 +++++++++++++++---- scripts/setup-cron.sh | 24 +++++ 8 files changed, 203 insertions(+), 38 deletions(-) create mode 100644 docs/plans/2026-05-29-011-feat-documents-recurring-scrape-verify-plan.md diff --git a/.docs/Recurring-Scrape-Setup.md b/.docs/Recurring-Scrape-Setup.md index 21808198..7e8f3ff9 100644 --- a/.docs/Recurring-Scrape-Setup.md +++ b/.docs/Recurring-Scrape-Setup.md @@ -11,6 +11,14 @@ This guide walks you through setting up automated recurring Discord exports usin ## Quick Start +**Append-only contract (read first)** + +- Each target writes under its configured `output_dir` (for example `~/Documents/KotOR_discord_msgs/`). +- Existing files named `Guild - Category - Channel [channel_id].json` are discovered automatically and updated in place. +- On the first run against an existing archive tree, the wrapper bootstraps `output_dir/.dce-meta/channel-map.json` from those filenames so it never creates a parallel export file. +- Incremental exports use DiscordChatExporter `--after` with the highest existing message id, then merge new messages by id. +- A merge that would reduce message count is rejected; the on-disk archive is left unchanged. + ### 1. 
Configure Your Targets Create or edit `config/scrape-targets.json` with your channel selections: @@ -44,6 +52,8 @@ Create or edit `config/scrape-targets.json` with your channel selections: ### 2. Set Your Discord Token +**Cron requires `scrape.env`.** Manual `export DISCORD_TOKEN` works for one-off runs, but scheduled jobs run in a minimal environment and need a persisted env file. + Either copy the environment template: ```bash @@ -76,7 +86,7 @@ Before the first incremental run, confirm each enabled target points at the corr ./scripts/verify-documents-archives.sh --config config/scrape-targets.json ``` -Each enabled target should show a non-zero **JSON** count and **SEEDED** channel IDs under `/home/brunner56/Documents//`. +Each enabled target should show a non-zero **JSON** count and **SEEDED** channel IDs under your configured `output_dir` values (see `archive_root` in `config/scrape-targets.json`). **One-command workflow** (verify → preflight → incremental scrape): @@ -141,8 +151,8 @@ The default is monthly. Customize it with: # Run every day at 2 AM ./scripts/setup-cron.sh --config config/scrape-targets.json --interval "daily" --at "2:00" -# Run every Sunday at noon -./scripts/setup-cron.sh --config config/scrape-targets.json --interval "weekly" --at "sun 12:00" +# Run every Sunday at noon (weekly uses Sunday; time is HH:MM only) +./scripts/setup-cron.sh --config config/scrape-targets.json --interval weekly --at "12:00" # Custom cron expression (every 6 hours) ./scripts/setup-cron.sh --config config/scrape-targets.json --cron "0 */6 * * *" @@ -186,15 +196,7 @@ archive_root/ └── ... ``` -Existing exports are updated in-place with new messages appended and deduplicated by message ID. - -**In-place append contract** - -- Each target writes under its configured `output_dir` (for example `~/Documents/KotOR_discord_msgs/`). -- Existing files named `Guild - Category - Channel [channel_id].json` are discovered automatically and updated in place. 
-- On the first run against an existing archive tree, the wrapper bootstraps `output_dir/.dce-meta/channel-map.json` from those filenames so it never creates a parallel export file. -- Incremental exports use DiscordChatExporter `--after` with the highest existing message id, then merge new messages by id. -- A merge that would reduce message count is rejected; the on-disk archive is left unchanged. +Existing exports are updated in-place with new messages appended and deduplicated by message ID. See **Append-only contract** at the top of this guide. ## Troubleshooting @@ -286,12 +288,21 @@ Re-run setup with new parameters (old entry replaced): Check logs from your last run: ```bash -# Recent cron execution +# Primary log file (default from setup-cron.sh) +tail -f logs/discord-scrape.log + +# Recent cron execution (system log) sudo grep discord-scrape /var/log/syslog # Debian/Ubuntu sudo grep discord-scrape /var/log/cron # CentOS/RHEL -# Or check via Docker logs if using containers -docker-compose logs -f +# Container build/run issues +docker compose logs -f +``` + +After a scheduled run, confirm archives grew in place: + +```bash +./scripts/prove-incremental-append.sh --target KotOR_discord_msgs ``` ## Performance Considerations diff --git a/Readme.md b/Readme.md index 2842c03e..ed7e1c2f 100644 --- a/Readme.md +++ b/Readme.md @@ -82,5 +82,6 @@ To learn more about the war and how you can help, [click here](https://tyrrrz.me ## See also - [**Recurring Exports**](.docs/Recurring-Scrape-Setup.md) — automated scheduled exports using cron (Linux/macOS) +- [**Documented solutions**](docs/solutions/) — searchable learnings (append-only scrape, Docker/cron workflow); YAML frontmatter: `module`, `tags`, `problem_type` - [**Chat Analytics**](https://github.com/mlomb/chat-analytics) — solution for analyzing chat patterns of Discord users, using exports produced by **DiscordChatExporter**. 
- [**DiscordChatExporter-frontend**](https://github.com/slatinsky/DiscordChatExporter-frontend) — convenient viewer for exports produced by **DiscordChatExporter**. diff --git a/STRATEGY.md b/STRATEGY.md index 3a16bdbe..5e8d7663 100644 --- a/STRATEGY.md +++ b/STRATEGY.md @@ -1,6 +1,6 @@ --- name: Recurring Discord scrape automation -last_updated: 2026-05-25 +last_updated: 2026-05-29 --- # Recurring Discord scrape automation Strategy diff --git a/docker-compose.yml b/docker-compose.yml index d036d9e8..2b31d8b0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,11 +8,14 @@ services: user: "${DCE_UID:-1000}:${DCE_GID:-1000}" userns_mode: "${DCE_USERNS_MODE:-}" working_dir: /workspace + env_file: + - path: scrape.env + required: false environment: - DISCORD_TOKEN: ${DISCORD_TOKEN:?Set DISCORD_TOKEN in scrape.env or your shell environment.} + DISCORD_TOKEN: ${DISCORD_TOKEN:-} TZ: ${TZ:-UTC} volumes: - ./config:/config:ro,z - ./scripts/run-discord-scrape.sh:/opt/dce-scheduler/run-discord-scrape.sh:ro,z - - /home/brunner56/Documents:/home/brunner56/Documents:z + - ${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:z command: ["help"] diff --git a/docs/plans/2026-05-29-010-feat-recurring-scrape-merge-readiness-plan.md b/docs/plans/2026-05-29-010-feat-recurring-scrape-merge-readiness-plan.md index 78d7fc09..2a954a4b 100644 --- a/docs/plans/2026-05-29-010-feat-recurring-scrape-merge-readiness-plan.md +++ b/docs/plans/2026-05-29-010-feat-recurring-scrape-merge-readiness-plan.md @@ -102,3 +102,10 @@ Operators rely on `run-documents-scrape.sh`, `verify-documents-archives.sh`, and - All ten smoke scripts exit 0. **Verification:** Single shell loop over `scripts/tests/*.sh`. 
+ +--- + +### Delta Update (2026-05-29) +- **Landed:** Source-built Docker + compose + `setup-cron.sh` (monthly default); append-only merge; custom `~/Documents/*` targets; compound solution doc; preflight skips forbidden channels when seeded archives exist; `--config` on `setup-cron.sh`; compose `DCE_ARCHIVE_ROOT` + optional `scrape.env` for builds; operator doc fixes (append contract, weekly schedule, monitoring log path). +- **Partial:** Live grow-only proof on all enabled targets not run in this pass; some channels remain forbidden under current token. +- **Next:** `prove-incremental-append.sh` per enabled target; consider `container-smoke.sh` in CI when Docker is available on runners. diff --git a/docs/plans/2026-05-29-011-feat-documents-recurring-scrape-verify-plan.md b/docs/plans/2026-05-29-011-feat-documents-recurring-scrape-verify-plan.md new file mode 100644 index 00000000..a08b14ef --- /dev/null +++ b/docs/plans/2026-05-29-011-feat-documents-recurring-scrape-verify-plan.md @@ -0,0 +1,67 @@ +--- +title: "feat: Documents recurring scrape verification and operator closure" +type: feat +status: completed +date: 2026-05-29 +origin: LFG — Docker/cron append-only Discord scrape for ~/Documents archive folders +--- + +# feat: Documents recurring scrape verification and operator closure + +## Summary + +Close the recurring Discord scrape vertical slice: source-built Docker image, compose mounts for `config/scrape-targets.json` and `/home/brunner56/Documents` archives, append-only JSON merge in `scripts/run-discord-scrape.sh`, monthly cron via `scripts/setup-cron.sh`, and runtime proof (preflight + incremental scrape on at least one enabled target). + +## Problem Frame + +Operators need monthly (configurable) incremental exports into existing `~/Documents/*_discord*` folders without re-downloading full history or overwriting archives when Discord deletes messages server-side. 
Infrastructure exists on `feat/recurring-cli-scrape`; this pass validates end-to-end behavior and documents the operator path. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | `Dockerfile` builds `DiscordChatExporter.Cli` from source; compose mounts config, scripts, and `archive_root` | +| R2 | `config/scrape-targets.json` maps user Documents folders; empty `channel_ids` exports all accessible channels per target | +| R3 | `run-discord-scrape.sh` uses `--after` + merge-by-id; rejects shrink merges | +| R4 | `setup-cron.sh` defaults to monthly schedule; supports `--target`, `--guild`, `--channel`, `--interval`, `--cron`, `--config` | +| R5 | `scrape.env` (gitignored) supplies token for compose; never commit secrets | +| R6 | Preflight and one-target scrape succeed against live Discord API | +| R7 | Smoke tests pass; operator docs list validation commands | + +## Scope Boundaries + +- No changes to upstream C# merge API (wrapper-only append). +- Do not enable `discord_dms` without user token. +- Token stays in `scrape.env` only. + +## Implementation Units + +### U1. Harden bootstrap and compose paths + +**Requirements:** R1, R2 + +**Files:** `scripts/run-discord-scrape.sh`, `docker-compose.yml`, `Dockerfile` + +**Test scenarios:** Archive seed files bootstrap channel-map; compose bind-mount resolves host Documents path. + +### U2. Cron installer and docs alignment + +**Requirements:** R4, R7 + +**Files:** `scripts/setup-cron.sh`, `.docs/Recurring-Scrape-Setup.md`, `Readme.md` + +**Test scenarios:** `setup-cron.sh --dry-run` emits monthly block; `--remove` idempotent. + +### U3. Runtime verification + +**Requirements:** R5, R6 + +**Commands:** `docker compose build`, `run-discord-scrape-host.sh preflight`, scrape `--target` with smallest enabled archive. + +**Test scenarios:** Message count non-decreasing after scrape; logs show `--after` when archive non-empty. + +## Verification Ladder + +1. `bash -n` on changed shell scripts +2. 
`scripts/tests/setup-cron-smoke.sh`, `run-discord-scrape-smoke.sh` +3. `docker compose build` + preflight + single-target scrape diff --git a/scripts/run-discord-scrape.sh b/scripts/run-discord-scrape.sh index bd27ab29..2c44949d 100755 --- a/scripts/run-discord-scrape.sh +++ b/scripts/run-discord-scrape.sh @@ -645,26 +645,16 @@ resolve_target_channels() { fi } -preflight_target() { - local target_json=$1 - local defaults_json=$2 - local target_name output_dir - local probe_channel_id probe_dir probe_output - local -a channel_ids +preflight_probe_channel() { + local probe_channel_id=$1 + local output_dir=$2 + local probe_dir probe_output probe_log + local -a probe_command after_id probe_destination + local probe_status=0 - target_name=$(jq -r '.name' <<<"$target_json") - output_dir=$(jq -r '.output_dir' <<<"$target_json") - bootstrap_channel_map_from_archives "$output_dir" - - mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json") - if (( ${#channel_ids[@]} == 0 )); then - die "Target '$target_name' resolved no channels during preflight." - fi - - probe_channel_id="${channel_ids[0]}" probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/dce-preflight.${probe_channel_id}.XXXXXX") probe_output="$probe_dir/probe.json" - local -a probe_command after_id probe_destination + probe_log=$(mktemp "${TMPDIR:-/tmp}/dce-preflight-log.${probe_channel_id}.XXXXXX") probe_destination=$(resolve_destination_path "$output_dir" "$probe_channel_id") after_id="" @@ -683,13 +673,75 @@ preflight_target() { probe_command+=(--after "$after_id") fi - if ! "${probe_command[@]}"; then + set +e + "${probe_command[@]}" >"$probe_log" 2>&1 + probe_status=$? + set -e + + if (( probe_status == 0 )); then + rm -f "$probe_log" rm -rf "$probe_dir" - die "Target '$target_name' failed authenticated preflight on channel '$probe_channel_id'." 
+ return 0 fi + if is_skippable_channel_export_failure "$probe_log"; then + log "Preflight probe skipped channel $probe_channel_id (forbidden or inaccessible)." + cat "$probe_log" >&2 + rm -f "$probe_log" + rm -rf "$probe_dir" + return 2 + fi + + cat "$probe_log" >&2 + rm -f "$probe_log" rm -rf "$probe_dir" - log "Preflight ok for target '$target_name': ${#channel_ids[@]} channel(s) resolved for $output_dir." + return 1 +} + +preflight_target() { + local target_json=$1 + local defaults_json=$2 + local target_name output_dir + local probe_channel_id + local -a channel_ids seeded_channel_ids + local probe_status=0 + local skipped_channels=0 + local probed_channels=0 + + target_name=$(jq -r '.name' <<<"$target_json") + output_dir=$(jq -r '.output_dir' <<<"$target_json") + bootstrap_channel_map_from_archives "$output_dir" + + mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json") + if (( ${#channel_ids[@]} == 0 )); then + die "Target '$target_name' resolved no channels during preflight." + fi + + for probe_channel_id in "${channel_ids[@]}"; do + probed_channels=$((probed_channels + 1)) + preflight_probe_channel "$probe_channel_id" "$output_dir" || probe_status=$? + case "$probe_status" in + 0) + log "Preflight ok for target '$target_name': ${#channel_ids[@]} channel(s) resolved for $output_dir." + return 0 + ;; + 2) + skipped_channels=$((skipped_channels + 1)) + probe_status=0 + ;; + *) + die "Target '$target_name' failed authenticated preflight on channel '$probe_channel_id'." + ;; + esac + done + + mapfile -t seeded_channel_ids < <(load_archive_seed_channel_ids "$output_dir" | sort -u) + if (( skipped_channels == probed_channels && ${#seeded_channel_ids[@]} > 0 )); then + log "Preflight ok for target '$target_name' with warning: all ${#channel_ids[@]} resolved channel(s) are inaccessible, but ${#seeded_channel_ids[@]} seeded archive(s) exist under $output_dir." 
+ return 0 + fi + + die "Target '$target_name' failed preflight: every resolved channel is inaccessible and no seeded archives exist under $output_dir." } scrape_target() { diff --git a/scripts/setup-cron.sh b/scripts/setup-cron.sh index 069bc868..26ad6cea 100755 --- a/scripts/setup-cron.sh +++ b/scripts/setup-cron.sh @@ -45,6 +45,7 @@ Options: --cron EXPR Use an explicit five-field cron expression instead of --interval/--at. --job-name NAME Marker name for the installed cron block. Default: discord-scrape --log-file PATH Cron log file. Default: $LOG_FILE + --config PATH Scrape targets JSON. Default: $CONFIG_FILE --env-file PATH Compose env file. Default: $ENV_FILE --skip-preflight Install the cron job without running the authenticated container preflight. --dry-run Print the cron block instead of installing it. @@ -130,6 +131,22 @@ append_target_args() { done } +container_config_path() { + local config_path=$1 + + if [[ "$config_path" == "$REPO_ROOT/config/"* ]]; then + printf '/config/%s\n' "$(basename "$config_path")" + return 0 + fi + + if [[ "$config_path" == config/* ]]; then + printf '/config/%s\n' "${config_path#config/}" + return 0 + fi + + printf '%s\n' "$config_path" +} + ensure_target_directories() { local selected_targets_json archive_root output_dir @@ -182,6 +199,7 @@ run_preflight() { --env-file "$ENV_FILE" --compose-file "$COMPOSE_FILE" preflight + --config "$(container_config_path "$CONFIG_FILE")" ) append_target_args preflight_args "${preflight_args[@]}" @@ -230,6 +248,11 @@ main() { LOG_FILE=$2 shift 2 ;; + --config) + [[ $# -ge 2 ]] || die "Missing value for --config." + CONFIG_FILE=$2 + shift 2 + ;; --env-file) [[ $# -ge 2 ]] || die "Missing value for --env-file." ENV_FILE=$2 @@ -315,6 +338,7 @@ main() { --env-file "$ENV_FILE" --compose-file "$COMPOSE_FILE" scrape + --config "$(container_config_path "$CONFIG_FILE")" ) append_target_args scrape_args scrape_command=$(printf '%q ' "${scrape_args[@]}")