#!/usr/bin/env bash
#
# run-discord-scrape.sh - incremental, append-only Discord archive scraper
# built on top of the DiscordChatExporter CLI.
#
# Subcommands: scrape | preflight | list-targets | help. Any other
# subcommand is passed straight through to the raw CLI.
#
# Environment:
#   DISCORD_TOKEN        Required for scrape/preflight (this script only
#                        checks that it is set and non-empty).
#   DCE_CLI_BIN          Path to the DiscordChatExporter CLI binary.
#   DCE_PRIMARY_CONFIG   Preferred config path.
#   DCE_FALLBACK_CONFIG  Config path used when the primary one is absent.
#   TMPDIR               Base directory for scratch data (default /tmp).
#
# Requires bash >= 4.3 (namerefs) and jq.

set -Eeuo pipefail

# By default bash clears `set -e` inside command substitutions, so a failing
# jq/find inside `$(...)` would be silently ignored. inherit_errexit needs
# bash >= 4.4; on older shells we fall back to the explicit checks below.
shopt -s inherit_errexit 2>/dev/null || true

CLI_BIN="${DCE_CLI_BIN:-/opt/app/DiscordChatExporter.Cli}"
PRIMARY_CONFIG="${DCE_PRIMARY_CONFIG:-/config/scrape-targets.json}"
FALLBACK_CONFIG="${DCE_FALLBACK_CONFIG:-/opt/dce-config/scrape-targets.json}"

OVERRIDE_GUILDS=()
OVERRIDE_CHANNELS=()
CACHE_ROOT=""
# Scratch paths removed by cleanup_temp_paths when the script exits.
CLEANUP_PATHS=()

# Print the help text to stdout.
usage() {
  cat <<'EOF'
Usage:
  run-discord-scrape.sh scrape [options]
  run-discord-scrape.sh preflight [options]
  run-discord-scrape.sh list-targets [--config PATH]
  run-discord-scrape.sh help
  run-discord-scrape.sh

Subcommands:
  scrape        Incrementally export channels into append-only JSON files.
  preflight     Validate token/config/target resolution without writing archives.
  list-targets  Print configured targets from the scrape config.
  help          Show this help text.

Options:
  --config PATH  Config file path inside the container.
  --target NAME  Restrict the run to one configured target. Repeatable.
  --guild ID     Narrow a selected target to one of its allowed guild IDs. Repeatable.
  --channel ID   Narrow a selected target to one of its allowed channel IDs. Repeatable.

Notes:
  * DISCORD_TOKEN must be provided via environment variables.
  * Channel exports are always stored as JSON because the append-only merge flow depends on it.
  * Unknown subcommands are passed through to the raw DiscordChatExporter CLI.
EOF
}

# Current UTC time in ISO-8601 form, used as the log prefix.
timestamp() {
  date -u +"%Y-%m-%dT%H:%M:%SZ"
}

# Write a timestamped diagnostic line to stderr.
log() {
  printf '[%s] %s\n' "$(timestamp)" "$*" >&2
}

# Log an error and terminate the current (sub)shell with status 1.
die() {
  log "ERROR: $*"
  exit 1
}

# Abort unless command $1 is available on PATH.
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing."
}

# Abort unless $1 is an existing regular file.
require_file() {
  [[ -f "$1" ]] || die "Required file not found: $1"
}

# Remove every scratch path registered in CLEANUP_PATHS (EXIT trap handler).
cleanup_temp_paths() {
  local scratch_path

  for scratch_path in "${CLEANUP_PATHS[@]}"; do
    if [[ -n "$scratch_path" ]]; then
      rm -rf -- "$scratch_path"
    fi
  done
  return 0
}

# Print each argument on its own line; print NOTHING when there are no
# arguments. A bare `printf '%s\n' "${empty[@]}"` emits one blank line, which
# downstream `mapfile` would read as a single empty ID.
print_lines() {
  (( $# == 0 )) || printf '%s\n' "$@"
}

# capture_lines ARRAY_NAME COMMAND [ARGS...]
# Run COMMAND, store its stdout lines in the caller's array ARRAY_NAME and
# return non-zero when COMMAND fails. Unlike `mapfile < <(COMMAND)`, a failure
# (including a `die` inside COMMAND) is visible to the caller.
capture_lines() {
  local -n capture_ref=$1
  shift
  local capture_output
  local -a capture_rows=()

  capture_output=$("$@") || return 1
  if [[ -n "$capture_output" ]]; then
    mapfile -t capture_rows <<<"$capture_output"
  fi
  capture_ref=("${capture_rows[@]}")
}

# Print the primary config path when that file exists, else the fallback path.
default_config_path() {
  if [[ -f "$PRIMARY_CONFIG" ]]; then
    printf '%s\n' "$PRIMARY_CONFIG"
  else
    printf '%s\n' "$FALLBACK_CONFIG"
  fi
}

# Lower-case $1 and strip every non-alphanumeric character.
normalize_name() {
  printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | tr -cd '[:alnum:]'
}

# Print the arguments as a compact JSON array of strings.
json_array_from_args() {
  jq -cn '$ARGS.positional' --args "$@"
}

# contains_value NEEDLE [VALUES...] - succeed when NEEDLE equals one of VALUES.
contains_value() {
  local needle=$1
  shift
  local value

  for value in "$@"; do
    if [[ "$value" == "$needle" ]]; then
      return 0
    fi
  done

  return 1
}

# assert_subset LABEL REQUESTED_ARRAY_NAME ALLOWED_ARRAY_NAME
# Abort when any element of the requested array is missing from the allowed one.
assert_subset() {
  local label=$1
  local -n requested_ref=$2
  local -n allowed_ref=$3
  local value

  for value in "${requested_ref[@]}"; do
    contains_value "$value" "${allowed_ref[@]}" || die "$label '$value' is outside the selected target's allowed scope."
  done
}

# path_is_within_root ROOT PATH
# Succeed when PATH equals ROOT or lies below it. The check is lexical (no
# filesystem access), so any path containing a '..' component is rejected:
# otherwise "/archive/../etc" would pass as being inside "/archive".
path_is_within_root() {
  local root=$1
  local path=$2

  # Treat "/archive/" and "/archive" as the same root; keep a bare "/".
  while [[ "$root" == ?*/ ]]; do
    root=${root%/}
  done

  case "/${path}/" in
    */../*)
      return 1
      ;;
  esac

  if [[ "$root" == "/" ]]; then
    if [[ "$path" == /* ]]; then
      return 0
    fi
    return 1
  fi

  case "$path" in
    "$root" | "$root"/*)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}

# Print the top-level archive_root of the config (empty when absent).
config_archive_root() {
  local config_path=$1
  jq -r '.archive_root // empty' "$config_path"
}

# Validate the config file: parseable JSON, absolute archive_root, at least
# one target, unique names and output dirs, supported kinds, and every
# output_dir inside archive_root. Aborts on the first violation.
validate_config_contract() {
  local config_path=$1
  local archive_root output_dir name kind target_rows
  local -a duplicate_names duplicate_dirs

  require_file "$config_path"
  jq empty "$config_path" >/dev/null 2>&1 || die "Invalid JSON config: $config_path"

  archive_root=$(config_archive_root "$config_path")
  [[ -n "$archive_root" ]] || die "Config is missing top-level archive_root."
  [[ "$archive_root" == /* ]] || die "archive_root must be an absolute path."

  jq -e '.targets | type == "array" and length > 0' "$config_path" >/dev/null \
    || die "Config must define at least one target."

  mapfile -t duplicate_names < <(jq -r '.targets[].name' "$config_path" | sort | uniq -d)
  (( ${#duplicate_names[@]} == 0 )) || die "Duplicate target names found: ${duplicate_names[*]}"

  mapfile -t duplicate_dirs < <(jq -r '.targets[].output_dir' "$config_path" | sort | uniq -d)
  (( ${#duplicate_dirs[@]} == 0 )) || die "Duplicate target output directories found: ${duplicate_dirs[*]}"

  # Fields are joined with the ASCII unit separator rather than a tab: tab is
  # IFS *whitespace*, so `read` would collapse an empty leading/middle field
  # and shift the remaining columns (a missing name would be misreported).
  target_rows=$(
    jq -r '
      .targets[]
      | [.name, (.kind // "guild"), .output_dir]
      | map(if . == null then "" else tostring end)
      | join("\u001f")
    ' "$config_path"
  ) || die "Config targets are malformed: $config_path"

  while IFS=$'\x1f' read -r name kind output_dir; do
    [[ -n "$name" ]] || die "Every target must have a name."
    [[ -n "$output_dir" ]] || die "Target '$name' is missing output_dir."
    [[ "$kind" == "guild" || "$kind" == "dms" ]] || die "Target '$name' has unsupported kind '$kind'."
    path_is_within_root "$archive_root" "$output_dir" \
      || die "Target '$name' output_dir '$output_dir' is outside archive_root '$archive_root'."
  done <<<"$target_rows"
}

# Print the channel IDs found in existing export file names of the form
# "...[<16-22 digit id>].json" below OUTPUT_DIR (metadata dir excluded).
load_archive_seed_channel_ids() {
  local output_dir=$1
  local file_path file_name channel_id

  [[ -d "$output_dir" ]] || return 0

  while IFS= read -r -d '' file_path; do
    file_name=$(basename "$file_path")
    if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
      channel_id="${BASH_REMATCH[1]}"
      printf '%s\n' "$channel_id"
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)
}

# Filter stdin lines of the form "<id> | <name>" into "<id>\t<name>".
parse_two_column_listing() {
  local line id name

  while IFS= read -r line; do
    if [[ "$line" =~ ^([0-9]{16,22})[[:space:]]+\|[[:space:]]+(.+)$ ]]; then
      id="${BASH_REMATCH[1]}"
      name="${BASH_REMATCH[2]}"
      printf '%s\t%s\n' "$id" "$name"
    fi
  done
}

# Filter stdin lines of the form "[*] <id> | ..." into bare channel IDs.
parse_channel_listing() {
  local line id

  while IFS= read -r line; do
    if [[ "$line" =~ ^[[:space:]]*\*?[[:space:]]*([0-9]{16,22})[[:space:]]+\|[[:space:]]+ ]]; then
      id="${BASH_REMATCH[1]}"
      printf '%s\n' "$id"
    fi
  done
}

# Make sure FILE exists and holds a JSON document, creating "{}" if needed.
# A zero-byte file is re-initialised as well: jq produces no output for empty
# input, so such a file could otherwise never be updated.
ensure_json_file() {
  local file_path=$1
  mkdir -p "$(dirname "$file_path")"

  if [[ ! -s "$file_path" ]]; then
    printf '{}\n' >"$file_path"
  fi
}

# update_channel_map MAP_FILE CHANNEL_ID DESTINATION_PATH
# Record CHANNEL_ID -> DESTINATION_PATH in the map, replacing the file via a
# temp file + rename in the same directory.
update_channel_map() {
  local map_file=$1
  local channel_id=$2
  local destination_path=$3
  local map_dir temp_file

  map_dir=$(dirname "$map_file")
  mkdir -p "$map_dir"
  # The X's must be the final characters of the template: a suffix after them
  # is a GNU extension and is not accepted by every mktemp implementation.
  temp_file=$(mktemp "$map_dir/channel-map.json.XXXXXX")

  # Checked explicitly so a failed rewrite can never replace the existing map.
  if ! jq --arg channel_id "$channel_id" --arg destination_path "$destination_path" \
    '.[$channel_id] = $destination_path' \
    "$map_file" >"$temp_file" || [[ ! -s "$temp_file" ]]; then
    rm -f -- "$temp_file"
    die "Failed to update channel map '$map_file' for channel $channel_id."
  fi

  mv "$temp_file" "$map_file"
}

# Print the channel-map path for a target output directory.
get_channel_map_path() {
  local output_dir=$1
  printf '%s/.dce-meta/channel-map.json' "$output_dir"
}

# resolve_destination_path OUTPUT_DIR CHANNEL_ID
# Print the archive file for a channel. Order of precedence: the recorded
# mapping, a single pre-existing "*[<id>].json" export (which is then
# recorded), else the default "<output_dir>/channels/<id>.json".
resolve_destination_path() {
  local output_dir=$1
  local channel_id=$2
  local map_file mapped_path
  local -a existing_candidates=()

  mkdir -p "$output_dir/.dce-meta" "$output_dir/channels"
  map_file=$(get_channel_map_path "$output_dir")
  ensure_json_file "$map_file"

  mapped_path=$(jq -r --arg channel_id "$channel_id" '.[$channel_id] // empty' "$map_file") \
    || die "Channel map is not readable JSON: $map_file"
  if [[ -n "$mapped_path" ]]; then
    path_is_within_root "$output_dir" "$mapped_path" \
      || die "Mapped destination '$mapped_path' for channel $channel_id is outside target root '$output_dir'."
    printf '%s\n' "$mapped_path"
    return 0
  fi

  # Match on the file name itself (same rule as load_archive_seed_channel_ids)
  # and let a find failure surface instead of hiding it behind `|| true`.
  capture_lines existing_candidates \
    find "$output_dir" -type f -name "*\\[${channel_id}\\].json" ! -path '*/.dce-meta/*' -print \
    || die "Failed to search $output_dir for existing exports of channel $channel_id."

  if (( ${#existing_candidates[@]} > 1 )); then
    die "Found multiple existing JSON exports for channel $channel_id under $output_dir; add an explicit mapping in $(get_channel_map_path "$output_dir")."
  fi

  if (( ${#existing_candidates[@]} == 1 )); then
    update_channel_map "$map_file" "$channel_id" "${existing_candidates[0]}"
    printf '%s\n' "${existing_candidates[0]}"
    return 0
  fi

  mapped_path="$output_dir/channels/$channel_id.json"
  update_channel_map "$map_file" "$channel_id" "$mapped_path"
  printf '%s\n' "$mapped_path"
}

# Print the channel.id recorded in an export file (empty when absent).
channel_id_from_export() {
  local export_path=$1
  jq -r '.channel.id // empty' "$export_path"
}

# Abort unless the export file's channel.id equals EXPECTED_CHANNEL_ID.
assert_export_channel_identity() {
  local export_path=$1
  local expected_channel_id=$2
  local actual_channel_id

  actual_channel_id=$(channel_id_from_export "$export_path")
  [[ -n "$actual_channel_id" ]] || die "Export '$export_path' is missing channel.id metadata."
  [[ "$actual_channel_id" == "$expected_channel_id" ]] \
    || die "Export '$export_path' belongs to channel '$actual_channel_id', expected '$expected_channel_id'."
}

# Print the id of the last message in an export; nothing when the file does
# not exist or holds no messages.
last_message_id() {
  local export_path=$1

  [[ -f "$export_path" ]] || return 0
  jq -r '(.messages | last | .id) // empty' "$export_path"
}

# Print the number of messages in an export file.
message_count() {
  local export_path=$1
  jq -r '(.messages | length) // 0' "$export_path"
}

# merge_exports EXISTING INCREMENTAL MERGED
# Write to MERGED the union of both exports: top-level fields from INCREMENTAL
# win, messages are de-duplicated by id, dateRange keeps the existing bounds
# when present, and messageCount is refreshed when either side carries it.
#
# Messages are ordered by id compared as a decimal number (shorter string
# first, then lexicographic). Sorting by the timestamp *string* misorders
# entries whose UTC offsets differ, and last_message_id relies on the final
# element being the newest message.
merge_exports() {
  local existing_path=$1
  local incremental_path=$2
  local merged_path=$3

  jq -s '
    .[0] as $existing
    | .[1] as $incremental
    | ($existing + $incremental)
    | .messages = (
        reduce (($existing.messages // []) + ($incremental.messages // []))[] as $message
          ({};
            .[$message.id] = $message
          )
        | to_entries
        | map(.value)
        | sort_by((.id | length), .id)
      )
    | .dateRange = {
        after: ($existing.dateRange.after // $incremental.dateRange.after),
        before: ($existing.dateRange.before // $incremental.dateRange.before)
      }
    | .exportedAt = ($incremental.exportedAt // $existing.exportedAt)
    | if ($existing | has("messageCount")) or ($incremental | has("messageCount"))
      then .messageCount = (.messages | length)
      else .
      end
  ' "$existing_path" "$incremental_path" >"$merged_path"
}

# Print "<guild id>\t<guild name>" rows, querying the CLI at most once per run
# (the result is cached under CACHE_ROOT).
load_guild_cache() {
  local output

  if [[ ! -f "$CACHE_ROOT/guilds.tsv" ]]; then
    if ! output=$("$CLI_BIN" guilds 2>&1); then
      die "Guild discovery failed. If you are using a bot token, configure explicit guild_ids/channel_ids for each non-DM target or switch to a user token. CLI output: $output"
    fi

    printf '%s\n' "$output" | parse_two_column_listing >"$CACHE_ROOT/guilds.tsv"
  fi

  cat "$CACHE_ROOT/guilds.tsv"
}

# Print the DM channel IDs, querying the CLI at most once per run.
load_dm_channel_cache() {
  local output

  if [[ ! -f "$CACHE_ROOT/dms.txt" ]]; then
    if ! output=$("$CLI_BIN" dm 2>&1); then
      die "DM discovery failed. Bot tokens cannot read direct messages; disable the DM target or switch to a user token. CLI output: $output"
    fi

    printf '%s\n' "$output" | parse_channel_listing >"$CACHE_ROOT/dms.txt"
  fi

  cat "$CACHE_ROOT/dms.txt"
}

# load_guild_channel_cache GUILD_ID INCLUDE_VOICE INCLUDE_THREADS
# Print the channel IDs of one guild, cached per option combination. The CLI
# output is captured before the cache file is written so a failed listing
# aborts the run instead of leaving an empty cache entry behind.
load_guild_channel_cache() {
  local guild_id=$1
  local include_voice=$2
  local include_threads=$3
  local cache_file="$CACHE_ROOT/channels_${guild_id}_${include_voice}_${include_threads}.txt"
  local listing

  if [[ ! -f "$cache_file" ]]; then
    if ! listing=$(
      "$CLI_BIN" channels \
        --guild "$guild_id" \
        --include-vc "$include_voice" \
        --include-threads "$include_threads"
    ); then
      die "Channel discovery failed for guild $guild_id."
    fi

    printf '%s\n' "$listing" | parse_channel_listing >"$cache_file"
  fi

  cat "$cache_file"
}

# resolve_guild_ids_from_patterns [PATTERN...]
# Print the ID of every discovered guild whose normalized name contains, or is
# contained in, one of the normalized patterns.
resolve_guild_ids_from_patterns() {
  local -a patterns=("$@")
  local -a normalized_patterns=() guild_rows=()
  local pattern normalized_pattern guild_row guild_id guild_name normalized_guild

  (( ${#patterns[@]} > 0 )) || return 0

  # Normalize each pattern once instead of once per guild.
  for pattern in "${patterns[@]}"; do
    normalized_pattern=$(normalize_name "$pattern")
    if [[ -n "$normalized_pattern" ]]; then
      normalized_patterns+=("$normalized_pattern")
    fi
  done
  (( ${#normalized_patterns[@]} > 0 )) || return 0

  capture_lines guild_rows load_guild_cache || return 1

  for guild_row in "${guild_rows[@]}"; do
    IFS=$'\t' read -r guild_id guild_name <<<"$guild_row"
    normalized_guild=$(normalize_name "$guild_name")
    # A name with no alphanumerics normalizes to ""; the substring test
    # *""* is always true, so such a guild would match every pattern.
    [[ -n "$normalized_guild" ]] || continue

    for normalized_pattern in "${normalized_patterns[@]}"; do
      if [[ "$normalized_guild" == *"$normalized_pattern"* || "$normalized_pattern" == *"$normalized_guild"* ]]; then
        printf '%s\n' "$guild_id"
        break
      fi
    done
  done

  return 0
}

# resolve_configured_guilds TARGET_JSON
# Print the guild IDs of a target: explicit guild_ids when configured, else
# the single guild matched by guild_name_patterns. Aborts on ambiguous matches.
resolve_configured_guilds() {
  local target_json=$1
  local -a configured_guild_ids name_patterns
  local -a matched_guild_ids=() resolved_guild_ids=()

  mapfile -t configured_guild_ids < <(jq -r '.guild_ids[]? | tostring' <<<"$target_json")
  mapfile -t name_patterns < <(jq -r '.guild_name_patterns[]?' <<<"$target_json")

  if (( ${#configured_guild_ids[@]} > 0 )); then
    printf '%s\n' "${configured_guild_ids[@]}" | sort -u
    return 0
  fi

  capture_lines matched_guild_ids resolve_guild_ids_from_patterns "${name_patterns[@]}" || return 1
  mapfile -t resolved_guild_ids < <(print_lines "${matched_guild_ids[@]}" | sort -u)
  if (( ${#resolved_guild_ids[@]} == 0 )); then
    return 0
  fi

  if (( ${#resolved_guild_ids[@]} > 1 )); then
    die "Target '$(jq -r '.name' <<<"$target_json")' matched multiple guilds (${resolved_guild_ids[*]}). Configure explicit guild_ids to make it safe."
  fi

  printf '%s\n' "${resolved_guild_ids[@]}"
}

# emit_selected_channels [ALLOWED_CHANNEL_ID...]
# Print the --channel overrides when given (after checking they are all
# allowed), otherwise the allowed channels themselves.
emit_selected_channels() {
  local -a permitted_channels=("$@")

  if (( ${#OVERRIDE_CHANNELS[@]} > 0 )); then
    assert_subset "Channel override" OVERRIDE_CHANNELS permitted_channels
    printf '%s\n' "${OVERRIDE_CHANNELS[@]}" | sort -u
  else
    print_lines "${permitted_channels[@]}"
  fi
}

# resolve_target_channels TARGET_JSON DEFAULTS_JSON
# Print the channel IDs to process for a target. Sources, in priority order:
# DM listing (kind "dms"), configured channel_ids, IDs seeded from existing
# archive file names, then channel discovery for the target's guild(s).
resolve_target_channels() {
  local target_json=$1
  local defaults_json=$2
  local kind include_voice include_threads
  local target_name output_dir guild_id
  local -a configured_channel_ids configured_guild_ids seeded_channel_ids allowed_channels selected_guilds
  local -a discovered_channels=() guild_channels=()

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  kind=$(jq -r '.kind // "guild"' <<<"$target_json")
  include_voice=$(jq -r --argjson defaults "$defaults_json" '(.include_voice_channels // $defaults.include_voice_channels // false) | tostring' <<<"$target_json")
  include_threads=$(jq -r --argjson defaults "$defaults_json" '.include_threads // $defaults.include_threads // "all"' <<<"$target_json")

  mapfile -t configured_channel_ids < <(jq -r '.channel_ids[]? | tostring' <<<"$target_json")
  mapfile -t seeded_channel_ids < <(load_archive_seed_channel_ids "$output_dir" | sort -u)

  if [[ "$kind" == "dms" ]]; then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "DM targets do not support --guild overrides."
    capture_lines discovered_channels load_dm_channel_cache || return 1
    mapfile -t allowed_channels < <(print_lines "${discovered_channels[@]}" | sort -u)
    emit_selected_channels "${allowed_channels[@]}"
    return 0
  fi

  if (( ${#configured_channel_ids[@]} > 0 )); then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "Channel-scoped targets do not support --guild overrides."
    mapfile -t allowed_channels < <(printf '%s\n' "${configured_channel_ids[@]}" | sort -u)
    emit_selected_channels "${allowed_channels[@]}"
    return 0
  fi

  if (( ${#seeded_channel_ids[@]} > 0 )); then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "Archive-seeded target '$target_name' does not support --guild overrides."
    mapfile -t allowed_channels < <(printf '%s\n' "${seeded_channel_ids[@]}" | sort -u)
    emit_selected_channels "${allowed_channels[@]}"
    return 0
  fi

  capture_lines configured_guild_ids resolve_configured_guilds "$target_json" || return 1
  if (( ${#configured_guild_ids[@]} == 0 )); then
    return 0
  fi

  if (( ${#OVERRIDE_GUILDS[@]} > 0 )); then
    assert_subset "Guild override" OVERRIDE_GUILDS configured_guild_ids
    selected_guilds=("${OVERRIDE_GUILDS[@]}")
  else
    selected_guilds=("${configured_guild_ids[@]}")
  fi

  # Each guild is listed with its status checked, so one failing guild aborts
  # the target instead of silently shrinking the channel set.
  for guild_id in "${selected_guilds[@]}"; do
    capture_lines guild_channels load_guild_channel_cache "$guild_id" "$include_voice" "$include_threads" || return 1
    discovered_channels+=("${guild_channels[@]}")
  done
  mapfile -t allowed_channels < <(print_lines "${discovered_channels[@]}" | sort -u)

  emit_selected_channels "${allowed_channels[@]}"
}

# preflight_target TARGET_JSON DEFAULTS_JSON
# Resolve the target's channels and run one probe export (first channel,
# --before 1970-01-01) into a throw-away directory to confirm the CLI call
# works. Nothing is written to the archive.
preflight_target() {
  local target_json=$1
  local defaults_json=$2
  local target_name output_dir
  local probe_channel_id probe_dir probe_output
  local -a channel_ids=()

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")

  capture_lines channel_ids resolve_target_channels "$target_json" "$defaults_json" \
    || die "Target '$target_name' failed channel resolution during preflight."
  if (( ${#channel_ids[@]} == 0 )); then
    die "Target '$target_name' resolved no channels during preflight."
  fi

  probe_channel_id="${channel_ids[0]}"
  probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/dce-preflight.${probe_channel_id}.XXXXXX")
  CLEANUP_PATHS+=("$probe_dir")
  probe_output="$probe_dir/probe.json"

  if ! "$CLI_BIN" export --channel "$probe_channel_id" --format Json --output "$probe_output" --before "1970-01-01"; then
    rm -rf "$probe_dir"
    die "Target '$target_name' failed authenticated preflight on channel '$probe_channel_id'."
  fi

  rm -rf "$probe_dir"
  log "Preflight ok for target '$target_name': ${#channel_ids[@]} channel(s) resolved for $output_dir."
}

# scrape_target TARGET_JSON DEFAULTS_JSON
# For every resolved channel: export the messages after the last archived one
# into a scratch directory, validate the result, then either install it as the
# first archive or merge it into the existing archive and replace that file.
scrape_target() {
  local target_json=$1
  local defaults_json=$2
  local target_name output_dir destination_path after_id temp_dir temp_export temp_merged
  local latest_batch_count channel_id
  local -a channel_ids=() export_command

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  mkdir -p "$output_dir"

  capture_lines channel_ids resolve_target_channels "$target_json" "$defaults_json" \
    || die "Target '$target_name' failed channel resolution."
  if (( ${#channel_ids[@]} == 0 )); then
    die "Target '$target_name' resolved no channels."
  fi

  log "Target '$target_name': processing ${#channel_ids[@]} channel(s) into $output_dir."

  for channel_id in "${channel_ids[@]}"; do
    destination_path=$(resolve_destination_path "$output_dir" "$channel_id")
    mkdir -p "$(dirname "$destination_path")"

    if [[ -f "$destination_path" ]]; then
      jq empty "$destination_path" >/dev/null 2>&1 || die "Existing export is not valid JSON: $destination_path"
      assert_export_channel_identity "$destination_path" "$channel_id"
    fi

    after_id=$(last_message_id "$destination_path")
    # Scratch space lives under output_dir so the final mv stays on one
    # filesystem; it is registered for cleanup in case a later check dies.
    mkdir -p "$output_dir/.dce-temp"
    temp_dir=$(mktemp -d "$output_dir/.dce-temp/export.${channel_id}.XXXXXX")
    CLEANUP_PATHS+=("$temp_dir")
    temp_export="$temp_dir/export.json"
    temp_merged="$temp_dir/merged.json"

    export_command=("$CLI_BIN" export --channel "$channel_id" --format Json --output "$temp_export")
    if [[ -n "$after_id" ]]; then
      export_command+=(--after "$after_id")
    fi

    log "Exporting channel $channel_id for target '$target_name'${after_id:+ after message $after_id}."

    if ! "${export_command[@]}"; then
      rm -rf "$temp_dir"
      die "Channel $channel_id failed for target '$target_name'."
    fi

    jq empty "$temp_export" >/dev/null 2>&1 || die "Incremental export is not valid JSON: $temp_export"
    assert_export_channel_identity "$temp_export" "$channel_id"

    latest_batch_count=$(message_count "$temp_export")
    if [[ ! -f "$destination_path" ]]; then
      # First export of this channel: the batch becomes the archive.
      mv "$temp_export" "$destination_path"
      rm -rf "$temp_dir"
      continue
    fi

    if (( latest_batch_count == 0 )); then
      # Nothing new; leave the archive untouched.
      rm -rf "$temp_dir"
      continue
    fi

    merge_exports "$destination_path" "$temp_export" "$temp_merged"
    [[ -s "$temp_merged" ]] || die "Merged export is empty for channel $channel_id."
    jq empty "$temp_merged" >/dev/null 2>&1 || die "Merged export is not valid JSON: $temp_merged"
    assert_export_channel_identity "$temp_merged" "$channel_id"
    mv "$temp_merged" "$destination_path"
    rm -rf "$temp_dir"
  done

  log "Target '$target_name': scrape completed successfully."
}

# Validate the config and print "<name>\t<kind>\t<output_dir>" per target.
list_targets() {
  local config_path=$1

  validate_config_contract "$config_path"
  jq -r '.targets[] | [.name, (.kind // "guild"), .output_dir] | @tsv' "$config_path"
}

# load_selected_targets CONFIG_PATH [TARGET_NAME...]
# Print one compact JSON object per selected target: the named targets when
# names are given, otherwise every target not explicitly disabled.
load_selected_targets() {
  local config_path=$1
  shift
  local -a requested_targets=("$@")
  local target_names_json

  if (( ${#requested_targets[@]} > 0 )); then
    target_names_json=$(json_array_from_args "${requested_targets[@]}")
    jq -c --argjson selected_target_names "$target_names_json" \
      '.targets[] | select(.name as $name | $selected_target_names | index($name))' \
      "$config_path"
  else
    jq -c '.targets[] | select(.enabled != false)' "$config_path"
  fi
}

# parse_target_options MODE CONFIG_VAR_NAME TARGETS_ARRAY_NAME [ARGS...]
# Parse scrape/preflight options into the caller's variables (by name) and
# the global OVERRIDE_GUILDS / OVERRIDE_CHANNELS arrays.
parse_target_options() {
  local mode=$1
  shift
  local -n config_path_ref=$1
  local -n requested_targets_ref=$2
  shift 2

  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        config_path_ref=$2
        shift 2
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        requested_targets_ref+=("$2")
        shift 2
        ;;
      --guild)
        [[ $# -ge 2 ]] || die "Missing value for --guild."
        OVERRIDE_GUILDS+=("$2")
        shift 2
        ;;
      --channel)
        [[ $# -ge 2 ]] || die "Missing value for --channel."
        OVERRIDE_CHANNELS+=("$2")
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown $mode option: $1"
        ;;
    esac
  done
}

# run_target_mode MODE [ARGS...]
# Shared driver for "scrape" and "preflight": parse options, validate the
# config, select targets, then run the per-target action for each of them.
run_target_mode() {
  local mode=$1
  local config_path defaults_json
  local -a requested_targets=() selected_targets=()
  shift

  config_path=$(default_config_path)
  parse_target_options "$mode" config_path requested_targets "$@"

  require_command jq
  validate_config_contract "$config_path"
  [[ -n "${DISCORD_TOKEN:-}" ]] || die "DISCORD_TOKEN is not set."

  # Collapse repeated --target names so the count comparison below does not
  # report an existing target as missing.
  mapfile -t requested_targets < <(print_lines "${requested_targets[@]}" | sort -u)

  defaults_json=$(jq -c '.defaults // {}' "$config_path")
  mapfile -t selected_targets < <(load_selected_targets "$config_path" "${requested_targets[@]}")

  if (( ${#requested_targets[@]} > 0 && ${#selected_targets[@]} != ${#requested_targets[@]} )); then
    die "One or more requested --target names are not present in $config_path."
  fi

  if (( ${#selected_targets[@]} == 0 )); then
    if (( ${#requested_targets[@]} > 0 )); then
      die "No targets matched the requested selection."
    fi
    die "No enabled targets are available in $config_path."
  fi

  if (( (${#OVERRIDE_GUILDS[@]} > 0 || ${#OVERRIDE_CHANNELS[@]} > 0) && ${#selected_targets[@]} != 1 )); then
    die "When using --guild or --channel overrides, select exactly one --target."
  fi

  CACHE_ROOT=$(mktemp -d "${TMPDIR:-/tmp}/dce-scrape.XXXXXX")
  CLEANUP_PATHS+=("$CACHE_ROOT")
  trap cleanup_temp_paths EXIT

  local target_json
  for target_json in "${selected_targets[@]}"; do
    if [[ "$mode" == "preflight" ]]; then
      preflight_target "$target_json" "$defaults_json"
    else
      scrape_target "$target_json" "$defaults_json"
    fi
  done
}

# Entry point: dispatch on the first argument (default "help"); unknown
# subcommands replace this process with the raw CLI.
main() {
  local subcommand=${1:-help}
  local config_path
  shift || true

  case "$subcommand" in
    help|-h|--help)
      usage
      ;;
    list-targets)
      config_path=$(default_config_path)
      while (($#)); do
        case "$1" in
          --config)
            [[ $# -ge 2 ]] || die "Missing value for --config."
            config_path=$2
            shift 2
            ;;
          *)
            die "Unknown list-targets option: $1"
            ;;
        esac
      done
      list_targets "$config_path"
      ;;
    preflight)
      run_target_mode preflight "$@"
      ;;
    scrape)
      run_target_mode scrape "$@"
      ;;
    *)
      exec "$CLI_BIN" "$subcommand" "$@"
      ;;
  esac
}

main "$@"