diff --git a/.docs/Recurring-Scrape-Setup.md b/.docs/Recurring-Scrape-Setup.md index d4f88110..51c4b061 100644 --- a/.docs/Recurring-Scrape-Setup.md +++ b/.docs/Recurring-Scrape-Setup.md @@ -44,7 +44,7 @@ Create or edit `config/scrape-targets.json` with your channel selections: ### 2. Set Your Discord Token -Copy the environment template and add your token: +Either copy the environment template: ```bash cp scrape.env.example scrape.env @@ -52,12 +52,25 @@ cp scrape.env.example scrape.env # OR set DISCORD_TOKEN_FILE=/path/to/token/file for automatic token rotation ``` +Or export the token directly in your shell (the host wrapper accepts this when `scrape.env` is absent): + +```bash +export DISCORD_TOKEN="your-token-here" +# optional: export DISCORD_TOKEN_FILE=/path/to/token/file +``` + ### 3. Run Preflight Validation Before installing cron, validate your setup: ```bash export DISCORD_TOKEN="your-token" +./scripts/run-discord-scrape-host.sh preflight --config config/scrape-targets.json +``` + +Or run inside the container workflow directly: + +```bash ./scripts/run-discord-scrape.sh preflight --config config/scrape-targets.json ``` @@ -141,6 +154,14 @@ archive_root/ Existing exports are updated in-place with new messages appended and deduplicated by message ID. +**In-place append contract** + +- Each target writes under its configured `output_dir` (for example `~/Documents/KotOR_discord_msgs/`). +- Existing files named `Guild - Category - Channel [channel_id].json` are discovered automatically and updated in place. +- On the first run against an existing archive tree, the wrapper bootstraps `output_dir/.dce-meta/channel-map.json` from those filenames so it never creates a parallel export file. +- Incremental exports use DiscordChatExporter `--after` with the highest existing message id, then merge new messages by id. +- A merge that would reduce message count is rejected; the on-disk archive is left unchanged. 
+ ## Troubleshooting For common issues and solutions, see [Recurring-Scrape-Troubleshooting.md](Recurring-Scrape-Troubleshooting.md). diff --git a/config/scrape-targets.json b/config/scrape-targets.json index 3073dc36..444e75a4 100644 --- a/config/scrape-targets.json +++ b/config/scrape-targets.json @@ -33,11 +33,13 @@ }, { "name": "OpenKotOR_discord_msgs", + "enabled": false, "kind": "guild", "output_dir": "/home/brunner56/Documents/OpenKotOR_discord_msgs", "guild_ids": [], "channel_ids": [], - "guild_name_patterns": ["OpenKotOR_discord_msgs", "OpenKotOR"] + "guild_name_patterns": ["OpenKotOR_discord_msgs", "OpenKotOR"], + "disabled_reason": "No local archive directory exists; use openkotor_discord_msgs for the active OpenKotOR export folder." }, { "name": "openkotor_discord_msgs", diff --git a/docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md b/docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md new file mode 100644 index 00000000..5fcec764 --- /dev/null +++ b/docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md @@ -0,0 +1,44 @@ +--- +title: "fix: Ensure Documents archive paths append safely with auth" +type: fix +status: completed +date: 2026-05-28 +origin: User request — extract to ~/Documents/** per server, append not overwrite, proper CLI auth +--- + +# fix: Ensure Documents archive paths append safely with auth + +## Summary + +Recurring scrapes must update the user's existing large JSON archives under `~/Documents/<server>/` in place using DiscordChatExporter incremental export (`--after`) and merge-by-id, never replacing a file with a fresh full export when an archive already exists. Auth must work without fragile manual setup. 
+ +## Problem Frame + +| Gap | Impact | +|-----|--------| +| `scrape.env` required even when `DISCORD_TOKEN` is already exported | Preflight/scrape fail before auth is attempted | +| Channel map not bootstrapped from existing `* [id].json` files | Risk of creating parallel files instead of updating in place | +| Merge replaces destination via direct `mv` without monotonic guard | Large archives could shrink on bad merge | +| `OpenKotOR_discord_msgs` target points at missing folder | Target resolves zero channels while `openkotor_discord_msgs` holds data | + +## Requirements + +| ID | Requirement | Files | +|----|-------------|-------| +| U1 | Make host runner accept exported `DISCORD_TOKEN` / `DISCORD_TOKEN_FILE` when `scrape.env` is absent | `scripts/run-discord-scrape-host.sh`, smoke test | +| U2 | Bootstrap `output_dir/.dce-meta/channel-map.json` from existing `* [channel_id].json` archives before scrape/preflight | `scripts/run-discord-scrape.sh`, smoke test | +| U3 | Safe merge: verify merged message count ≥ existing; replace via temp file in target directory | `scripts/run-discord-scrape.sh`, smoke test | +| U4 | Align config with on-disk folders (disable missing OpenKotOR target) | `config/scrape-targets.json` | +| U5 | Document auth + in-place append contract | `.docs/Recurring-Scrape-Setup.md` | + +## Test Scenarios + +- Host runner succeeds with only `DISCORD_TOKEN` in environment (no scrape.env) +- Bootstrap writes channel-map entries for seeded archives without overwriting map entries +- Merge rejects shrinkage (fixture with fewer messages after merge) +- Existing smoke suite still passes + +## Success Criteria + +- `./scripts/tests/run-discord-scrape-smoke.sh` and host smoke pass +- Preflight can run once user exports token (even without scrape.env file) diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 6f443011..3f4b8849 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ 
-10,6 +10,8 @@ DOCKER_BIN="${DCE_DOCKER_BIN:-docker}"
 COMPOSE_BIN="${DCE_COMPOSE_BIN:-}"
 DOCKER_BIN_OVERRIDDEN=0
 REAUTH_COMMAND=""
+COMPOSE_ENV_FILE=""
+COMPOSE_ENV_TEMP=""
 if [[ -n "${DCE_DOCKER_BIN:-}" ]]; then
   DOCKER_BIN_OVERRIDDEN=1
@@ -30,6 +32,9 @@ Environment:
   DISCORD_TOKEN Direct token value (highest precedence after refresh).
   DISCORD_TOKEN_FILE Optional path to a file containing the Discord token.
   DCE_REAUTH_COMMAND Optional absolute path to an executable reauth script under the repo root.
+
+Notes:
+  When $ENV_FILE is missing, exported DISCORD_TOKEN or DISCORD_TOKEN_FILE is used instead.
 EOF
 }
@@ -42,6 +47,13 @@ require_program() {
   command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing."
 }
+# Removes the temporary compose env file recorded in COMPOSE_ENV_TEMP, if any.
+cleanup_compose_env() {
+  if [[ -n "$COMPOSE_ENV_TEMP" && -f "$COMPOSE_ENV_TEMP" ]]; then
+    rm -f "$COMPOSE_ENV_TEMP"
+  fi
+}
+
 load_env_file() {
   [[ -f "$ENV_FILE" ]] || die "Missing env file: $ENV_FILE"
   local raw_line line key value
@@ -73,6 +85,54 @@ load_env_file() {
   done <"$ENV_FILE"
 }
+# Writes the exported credentials to a temporary compose --env-file.
+# Globals:
+#   COMPOSE_ENV_TEMP (written): path returned by mktemp.
+#   COMPOSE_ENV_FILE (written): set to the same path.
+#   DISCORD_TOKEN, DISCORD_TOKEN_FILE, DCE_REAUTH_COMMAND (read): each is
+#     written as KEY=value only when it is non-empty.
+write_compose_env_temp() {
+  COMPOSE_ENV_TEMP=$(mktemp "${TMPDIR:-/tmp}/dce-compose-env.XXXXXX") ||
+    die "Unable to create a temporary compose env file under ${TMPDIR:-/tmp}."
+  COMPOSE_ENV_FILE="$COMPOSE_ENV_TEMP"
+
+  # A single truncating redirect covers every entry, so the file content does
+  # not depend on which of the variables are set.
+  {
+    if [[ -n "${DISCORD_TOKEN:-}" ]]; then
+      printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
+    fi
+    if [[ -n "${DISCORD_TOKEN_FILE:-}" ]]; then
+      printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
+    fi
+    if [[ -n "${DCE_REAUTH_COMMAND:-}" ]]; then
+      printf 'DCE_REAUTH_COMMAND=%s\n' "$DCE_REAUTH_COMMAND"
+    fi
+  } >"$COMPOSE_ENV_TEMP"
+}
+
+# Chooses the file later passed to compose via --env-file.
+# Uses $ENV_FILE when it exists; otherwise builds a temporary file from exported
+# DISCORD_TOKEN / DISCORD_TOKEN_FILE, and dies when neither is set.
+prepare_compose_env() {
+  if [[ -f "$ENV_FILE" ]]; then
+    load_env_file
+    COMPOSE_ENV_FILE="$ENV_FILE"
+    return 0
+  fi
+
+  if [[ -z "${DISCORD_TOKEN:-}" ]]; then
+    load_token_from_file || true
+  fi
+
+  if [[ -n "${DISCORD_TOKEN:-}" || -n "${DISCORD_TOKEN_FILE:-}" ]]; then
+    write_compose_env_temp
+    return 0
+  fi
+
+  die "Missing env file: $ENV_FILE (copy scrape.env.example to scrape.env) or export DISCORD_TOKEN / DISCORD_TOKEN_FILE in the shell."
+} + load_token_from_file() { local token_file=${DISCORD_TOKEN_FILE:-} [[ -n "$token_file" ]] || return 1 @@ -89,7 +134,7 @@ ensure_token_present() { if [[ -z "${DISCORD_TOKEN:-}" ]]; then load_token_from_file || true fi - [[ -n "${DISCORD_TOKEN:-}" ]] || die "DISCORD_TOKEN is not set. Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in $ENV_FILE." + [[ -n "${DISCORD_TOKEN:-}" ]] || die "DISCORD_TOKEN is not set. Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in $ENV_FILE or export it in the shell." } compose_run_args() { @@ -101,7 +146,7 @@ compose_run_args() { if [[ -n "$COMPOSE_BIN" ]]; then _out=( "$COMPOSE_BIN" - --env-file "$ENV_FILE" + --env-file "$COMPOSE_ENV_FILE" -f "$COMPOSE_FILE" run -T @@ -112,7 +157,7 @@ compose_run_args() { elif (( DOCKER_BIN_OVERRIDDEN == 0 )) && command -v docker-compose >/dev/null 2>&1; then _out=( docker-compose - --env-file "$ENV_FILE" + --env-file "$COMPOSE_ENV_FILE" -f "$COMPOSE_FILE" run -T @@ -124,7 +169,7 @@ compose_run_args() { _out=( "$DOCKER_BIN" compose - --env-file "$ENV_FILE" + --env-file "$COMPOSE_ENV_FILE" -f "$COMPOSE_FILE" run -T @@ -198,6 +243,13 @@ run_subcommand_with_retry() { printf 'Detected Discord auth failure. 
Refreshing token and retrying once...\n' >&2
     load_token_from_file || true
+    if [[ -f "$ENV_FILE" ]]; then
+      COMPOSE_ENV_FILE="$ENV_FILE"
+    elif [[ -n "${DISCORD_TOKEN:-}" ]]; then
+      rm -f "$COMPOSE_ENV_TEMP"
+      COMPOSE_ENV_TEMP=""
+      write_compose_env_temp
+    fi
     try_interactive_reauth || true
     ensure_token_present
@@ -216,6 +268,8 @@ main() {
   local -a passthrough_args=()
   local subcommand=""
+  trap cleanup_compose_env EXIT
+
   while (($#)); do
     case "$1" in
       --env-file)
@@ -265,7 +319,7 @@ main() {
   fi
   [[ -f "$COMPOSE_FILE" ]] || die "Missing compose file: $COMPOSE_FILE"
-  load_env_file
+  prepare_compose_env
   REAUTH_COMMAND="${DCE_REAUTH_COMMAND:-}"
   case "$subcommand" in
diff --git a/scripts/run-discord-scrape.sh b/scripts/run-discord-scrape.sh
index 976c4b29..273f3f0d 100755
--- a/scripts/run-discord-scrape.sh
+++ b/scripts/run-discord-scrape.sh
@@ -191,6 +191,53 @@ load_archive_seed_channel_ids() {
   done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)
 }
+# Registers archives that already exist on disk in the target's channel map.
+# Arguments:
+#   $1 - output_dir of a scrape target. A missing directory is a no-op.
+# A '*.json' file under output_dir (outside .dce-meta) is registered through
+# update_channel_map when its name ends in "[<16-22 digits>].json", that id has
+# no map entry yet, path_is_within_root accepts it, jq can parse it, and its
+# .channel.id (when present) equals the id in the file name.
+bootstrap_channel_map_from_archives() {
+  local output_dir=$1
+  local map_file file_path file_name channel_id mapped_path embedded_channel_id bootstrapped=0
+
+  [[ -d "$output_dir" ]] || return 0
+
+  map_file=$(get_channel_map_path "$output_dir")
+  ensure_json_file "$map_file"
+
+  while IFS= read -r -d '' file_path; do
+    file_name=$(basename "$file_path")
+    if [[ ! "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
+      continue
+    fi
+
+    channel_id="${BASH_REMATCH[1]}"
+    mapped_path=$(jq -r --arg channel_id "$channel_id" '.[$channel_id] // empty' "$map_file")
+    if [[ -n "$mapped_path" ]]; then
+      continue
+    fi
+
+    path_is_within_root "$output_dir" "$file_path" || continue
+
+    # One jq pass both validates the file and reads its channel id, so a large
+    # archive is parsed once instead of twice; unparsable files are skipped.
+    embedded_channel_id=$(jq -r '.channel.id // empty' "$file_path" 2>/dev/null) || continue
+    if [[ -n "$embedded_channel_id" && "$embedded_channel_id" != "$channel_id" ]]; then
+      log "Skipping bootstrap for '$file_path': filename channel id $channel_id does not match export metadata ($embedded_channel_id)."
+      continue
+    fi
+
+    update_channel_map "$map_file" "$channel_id" "$file_path"
+    bootstrapped=$((bootstrapped + 1))
+  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)
+
+  if (( bootstrapped > 0 )); then
+    log "Bootstrapped $bootstrapped channel map entries from existing archives under $output_dir."
+  fi
+}
+
 parse_two_column_listing() {
   local line id name
@@ -342,6 +389,58 @@ message_count() {
   jq -r '(.messages | length) // 0' "$export_path"
 }
+# Replaces an on-disk archive with its merged successor via a staged temp file.
+# Arguments:
+#   $1 - destination_path: archive currently on disk.
+#   $2 - merged_path: candidate replacement for it.
+# Dies without modifying destination_path when the candidate holds fewer
+# messages, cannot be staged, or is not valid JSON; a staged temp file is
+# removed before dying.
+commit_merged_export() {
+  local destination_path=$1
+  local merged_path=$2
+  local before_count after_count atomic_path default_mode
+
+  before_count=$(message_count "$destination_path")
+  after_count=$(message_count "$merged_path")
+  if (( after_count < before_count )); then
+    die "Merge would shrink archive '$destination_path' ($before_count -> $after_count messages). Existing file was not modified."
+  fi
+
+  # Check channel identity on the candidate before anything is staged, so a
+  # failing check (presumably fatal - confirm) cannot strand a temp copy in
+  # the archive directory.
+  assert_export_channel_identity "$merged_path" "$(channel_id_from_export "$destination_path")"
+
+  # Full-path template instead of `mktemp -p`, which not every mktemp supports.
+  atomic_path=$(mktemp "$(dirname "$destination_path")/.$(basename "$destination_path").dce-replace.XXXXXX") ||
+    die "Unable to create a temporary file next to '$destination_path'."
+
+  if ! cp "$merged_path" "$atomic_path"; then
+    rm -f "$atomic_path"
+    die "Unable to stage merged export next to '$destination_path'. Existing file was not modified."
+  fi
+  if ! jq empty "$atomic_path" >/dev/null 2>&1; then
+    rm -f "$atomic_path"
+    die "Merged export is not valid JSON: $merged_path"
+  fi
+
+  # mktemp creates the file owner-only and cp keeps that mode, so without this
+  # step the rename would leave the archive readable by its owner alone. Reuse
+  # the archive's current mode; fall back to the umask default where chmod has
+  # no --reference.
+  if ! chmod --reference="$destination_path" "$atomic_path" 2>/dev/null; then
+    printf -v default_mode '%o' "$(( 0666 & ~$(umask) ))"
+    chmod "$default_mode" "$atomic_path" ||
+      log "Could not reset permissions for '$destination_path'; it will be owner-only."
+  fi
+
+  if ! mv -f "$atomic_path" "$destination_path"; then
+    rm -f "$atomic_path"
+    die "Unable to replace '$destination_path' with the merged export."
+  fi
+}
+
 merge_exports() {
   local existing_path=$1
   local incremental_path=$2
@@ -553,6 +652,7 @@ preflight_target() {
   target_name=$(jq -r '.name' <<<"$target_json")
   output_dir=$(jq -r '.output_dir' <<<"$target_json")
+  bootstrap_channel_map_from_archives "$output_dir"
   mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json")
   if (( ${#channel_ids[@]} == 0 )); then
@@ -582,6 +682,7 @@ scrape_target() {
   target_name=$(jq -r '.name' <<<"$target_json")
   output_dir=$(jq -r '.output_dir' <<<"$target_json")
   mkdir -p "$output_dir"
+  bootstrap_channel_map_from_archives "$output_dir"
   mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json")
   if (( ${#channel_ids[@]} == 0 )); then
@@ -644,7 +745,7 @@ scrape_target() {
     [[ -s "$temp_merged" ]] || die "Merged export is empty for channel $channel_id."
jq empty "$temp_merged" >/dev/null 2>&1 || die "Merged export is not valid JSON: $temp_merged" assert_export_channel_identity "$temp_merged" "$channel_id" - mv "$temp_merged" "$destination_path" + commit_merged_export "$destination_path" "$temp_merged" rm -rf "$temp_dir" done @@ -795,4 +854,6 @@ main() { esac } -main "$@" +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/scripts/tests/run-discord-scrape-host-smoke.sh b/scripts/tests/run-discord-scrape-host-smoke.sh index 943ec2ea..cbc39f8a 100755 --- a/scripts/tests/run-discord-scrape-host-smoke.sh +++ b/scripts/tests/run-discord-scrape-host-smoke.sh @@ -68,6 +68,21 @@ run_host() { "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --target demo } +run_host_with_shell_token() { + local mode=$1 + local missing_env_path=$2 + + DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_DOCKER_BIN="$FAKE_DOCKER" \ + DCE_ENV_FILE="$missing_env_path" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + DISCORD_TOKEN=dummy-token \ + FAKE_DOCKER_CALL_COUNT="$CALL_COUNT" \ + FAKE_DOCKER_TOKEN_FILE="$TOKEN_FILE" \ + FAKE_DOCKER_MODE="$mode" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape --target demo +} + MALICIOUS_ENV="$TMP_DIR/malicious.env" MARKER_FILE="$TMP_DIR/marker" cat >"$MALICIOUS_ENV" </dev/null; then fi [[ "$(cat "$CALL_COUNT")" == "2" ]] || { echo "expected exactly one retry before final failure" >&2; exit 1; } +MISSING_ENV="$TMP_DIR/missing-scrape.env" +[[ ! 
-e "$MISSING_ENV" ]] +printf '0' >"$CALL_COUNT" +run_host_with_shell_token success "$MISSING_ENV" >/dev/null +[[ "$(cat "$CALL_COUNT")" == "1" ]] || { echo "expected host wrapper to run with exported DISCORD_TOKEN when scrape.env is missing" >&2; exit 1; } + echo "run-discord-scrape-host smoke test passed" diff --git a/scripts/tests/run-discord-scrape-smoke.sh b/scripts/tests/run-discord-scrape-smoke.sh index 8b6f9fb5..c78ee601 100755 --- a/scripts/tests/run-discord-scrape-smoke.sh +++ b/scripts/tests/run-discord-scrape-smoke.sh @@ -102,6 +102,14 @@ cat >"$CONFIG_PATH" <&2; exit 1; } +mkdir -p "$ARCHIVE_ROOT/bootstrap-map" +cp "$FIXTURE_DIR/append-existing.json" "$ARCHIVE_ROOT/bootstrap-map/$DEFAULT_FILE_NAME" +[[ ! -f "$ARCHIVE_ROOT/bootstrap-map/.dce-meta/channel-map.json" ]] || { echo "bootstrap-map should start without channel map" >&2; exit 1; } +run_wrapper bootstrap-map append +BOOTSTRAP_DEST="$ARCHIVE_ROOT/bootstrap-map/$DEFAULT_FILE_NAME" +bootstrap_mapped_dest=$(jq -r '."111"' "$ARCHIVE_ROOT/bootstrap-map/.dce-meta/channel-map.json") +[[ "$bootstrap_mapped_dest" == "$BOOTSTRAP_DEST" ]] || { echo "expected bootstrap to register existing archive in channel map" >&2; exit 1; } +[[ "$(jq -r '.messages | length' "$BOOTSTRAP_DEST")" == "3" ]] || { echo "expected bootstrap-map archive to append in place" >&2; exit 1; } + +# shellcheck disable=SC1091 +source "$REPO_ROOT/scripts/run-discord-scrape.sh" +SHRINK_EXISTING="$TMP_DIR/shrink-existing.json" +SHRINK_MERGED="$TMP_DIR/shrink-merged.json" +cp "$FIXTURE_DIR/append-existing.json" "$SHRINK_EXISTING" +jq '.messages = [.messages[0]]' "$SHRINK_EXISTING" >"$SHRINK_MERGED" +if ( commit_merged_export "$SHRINK_EXISTING" "$SHRINK_MERGED" >/dev/null 2>&1 ); then + echo "commit_merged_export should reject shrinking archives" >&2 + exit 1 +fi +[[ "$(jq -r '.messages | length' "$SHRINK_EXISTING")" == "2" ]] || { echo "existing archive changed after rejected shrink merge" >&2; exit 1; } + echo "U1: append-only merge test 
coverage passed"