mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
Bootstrap channel-map entries from existing archive filenames, reject merges that would shrink large JSON exports, accept exported DISCORD_TOKEN when scrape.env is missing, and disable the duplicate OpenKotOR target folder. Co-authored-by: Cursor <cursoragent@cursor.com>
860 lines
26 KiB
Bash
Executable file
860 lines
26 KiB
Bash
Executable file
#!/usr/bin/env bash
# Wrapper around the DiscordChatExporter CLI that performs incremental,
# append-only JSON exports driven by a scrape-targets config file.

# -E lets ERR traps propagate into functions; -e/-u/pipefail make failures loud.
set -Eeuo pipefail

# Locations can be overridden through the environment.
CLI_BIN="${DCE_CLI_BIN:-/opt/app/DiscordChatExporter.Cli}"
PRIMARY_CONFIG="${DCE_PRIMARY_CONFIG:-/config/scrape-targets.json}"
FALLBACK_CONFIG="${DCE_FALLBACK_CONFIG:-/opt/dce-config/scrape-targets.json}"

# Populated by --guild / --channel in parse_target_options.
OVERRIDE_GUILDS=()
OVERRIDE_CHANNELS=()
# Per-run scratch directory for discovery caches; set by run_target_mode.
CACHE_ROOT=""
|
|
|
|
# Print the help text to stdout.
usage() {
  cat <<'EOF'
Usage:
  run-discord-scrape.sh scrape [options]
  run-discord-scrape.sh preflight [options]
  run-discord-scrape.sh list-targets [--config PATH]
  run-discord-scrape.sh help
  run-discord-scrape.sh <DiscordChatExporter CLI args...>

Subcommands:
  scrape        Incrementally export channels into append-only JSON files.
  preflight     Validate token/config/target resolution without writing archives.
  list-targets  Print configured targets from the scrape config.
  help          Show this help text.

Options:
  --config PATH   Config file path inside the container.
  --target NAME   Restrict the run to one configured target. Repeatable.
  --guild ID      Narrow a selected target to one of its allowed guild IDs. Repeatable.
  --channel ID    Narrow a selected target to one of its allowed channel IDs. Repeatable.

Notes:
  * DISCORD_TOKEN must be provided via environment variables.
  * Channel exports are always stored as JSON because the append-only merge flow depends on it.
  * Unknown subcommands are passed through to the raw DiscordChatExporter CLI.
EOF
}
|
|
|
|
# Print the current UTC time as an ISO-8601 timestamp (e.g. 2026-01-02T03:04:05Z).
timestamp() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
|
|
|
|
# Write a timestamped diagnostic line to stderr.
# Arguments: message words, joined with single spaces.
log() {
  printf '[%s] %s\n' "$(timestamp)" "$*" >&2
}
|
|
|
|
# Log an error message and terminate the current (sub)shell with status 1.
die() {
  log "ERROR: $*"
  exit 1
}
|
|
|
|
# Abort unless the named command is resolvable in the current shell.
# Arguments: $1 - command name
require_command() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "Required command '$1' is missing."
  fi
}
|
|
|
|
# Abort unless the path is an existing regular file.
# Arguments: $1 - file path
require_file() {
  if [[ ! -f "$1" ]]; then
    die "Required file not found: $1"
  fi
}
|
|
|
|
# Print the config path to use when --config is not given:
# PRIMARY_CONFIG when that file exists, otherwise FALLBACK_CONFIG.
default_config_path() {
  local chosen=$FALLBACK_CONFIG

  if [[ -f "$PRIMARY_CONFIG" ]]; then
    chosen=$PRIMARY_CONFIG
  fi

  printf '%s\n' "$chosen"
}
|
|
|
|
# Reduce a name to lowercase alphanumerics for loose comparison.
# Arguments: $1 - raw name
# Outputs:   normalized name on stdout (no trailing newline)
normalize_name() {
  printf '%s' "$1" \
    | tr '[:upper:]' '[:lower:]' \
    | tr -cd '[:alnum:]'
}
|
|
|
|
# Make an arbitrary string safe to use as one component of a file name.
# Line breaks and control characters become spaces, path separators become
# underscores, whitespace runs collapse, and edge spaces are trimmed.
# Arguments: $1 - raw component
# Outputs:   escaped component on stdout (no trailing newline)
escape_file_name_component() {
  # The second tr set has the same length as the first: POSIX leaves the
  # behavior unspecified when string2 is shorter than string1.
  printf '%s' "$1" \
    | tr '\r\n' '  ' \
    | sed -E 's#[/\\]+#_#g; s/[[:cntrl:]]+/ /g; s/[[:space:]]+/ /g; s/^ //; s/ $//'
}
|
|
|
|
# Join the non-empty arguments with " - " and print the result.
join_name_parts() {
  local joined=""
  local piece

  for piece in "$@"; do
    if [[ -z "$piece" ]]; then
      continue
    fi
    # Only insert the separator once something has been emitted.
    joined+="${joined:+ - }${piece}"
  done

  printf '%s\n' "$joined"
}
|
|
|
|
# Print the arguments as a compact JSON array of strings.
json_array_from_args() {
  jq -cn '$ARGS.positional' --args "$@"
}
|
|
|
|
# Test whether a value is present in a list.
# Arguments: $1 - needle; remaining arguments - candidates
# Returns:   0 when an exact match exists, 1 otherwise
contains_value() {
  local needle=$1
  local candidate
  shift

  for candidate in "$@"; do
    if [[ "$candidate" == "$needle" ]]; then
      return 0
    fi
  done

  return 1
}
|
|
|
|
# Abort unless every element of one array is present in another.
# Arguments: $1 - label used in the error message
#            $2 - NAME of the array holding requested values
#            $3 - NAME of the array holding allowed values
assert_subset() {
  local label=$1
  local -n requested_ref=$2
  local -n allowed_ref=$3
  local value

  for value in "${requested_ref[@]}"; do
    if ! contains_value "$value" "${allowed_ref[@]}"; then
      die "$label '$value' is outside the selected target's allowed scope."
    fi
  done
}
|
|
|
|
# Lexically test whether an absolute path is the root itself or lies below it.
# No filesystem access is performed; symlinks are not resolved.
# Arguments: $1 - root directory; $2 - candidate path
# Returns:   0 when contained, 1 otherwise
path_is_within_root() {
  local root=$1
  local path=$2

  # An empty root would otherwise accept every absolute path.
  [[ -n "$root" ]] || return 1
  [[ "$path" == /* ]] || return 1

  # Reject any "." or ".." component so prefix matching cannot be bypassed.
  case "$path" in
    *"/../"* | */.. | *"/./"* | */.)
      return 1
      ;;
  esac

  # "/archive/" and "/archive" name the same directory; "/" becomes "".
  while [[ "$root" == */ ]]; do
    root=${root%/}
  done

  case "$path" in
    "$root" | "${root}/"*)
      return 0
      ;;
  esac

  return 1
}
|
|
|
|
# Print the top-level archive_root of a config file (nothing when absent).
# Arguments: $1 - config file path
config_archive_root() {
  local config_path=$1

  jq -r '.archive_root // empty' "$config_path"
}
|
|
|
|
# Validate the structural contract of a scrape config and abort on violation:
# valid JSON, absolute archive_root, a non-empty targets array, unique target
# names and output directories, a supported kind per target, and every
# output_dir located inside archive_root.
# Arguments: $1 - config file path
validate_config_contract() {
  local config_path=$1
  local archive_root output_dir name kind row
  local tab=$'\t' field_sep=$'\x1f'
  local -a duplicate_names duplicate_dirs

  require_file "$config_path"
  jq empty "$config_path" >/dev/null 2>&1 || die "Invalid JSON config: $config_path"

  archive_root=$(config_archive_root "$config_path")
  [[ -n "$archive_root" ]] || die "Config is missing top-level archive_root."
  [[ "$archive_root" == /* ]] || die "archive_root must be an absolute path."

  jq -e '.targets | type == "array" and length > 0' "$config_path" >/dev/null \
    || die "Config must define at least one target."

  mapfile -t duplicate_names < <(jq -r '.targets[].name' "$config_path" | sort | uniq -d)
  (( ${#duplicate_names[@]} == 0 )) || die "Duplicate target names found: ${duplicate_names[*]}"

  mapfile -t duplicate_dirs < <(jq -r '.targets[].output_dir' "$config_path" | sort | uniq -d)
  (( ${#duplicate_dirs[@]} == 0 )) || die "Duplicate target output directories found: ${duplicate_dirs[*]}"

  while IFS= read -r row; do
    # Tab is IFS whitespace, so `IFS=$'\t' read` would collapse empty fields
    # and shift the remaining columns. Swap the separator for a
    # non-whitespace character first; @tsv escapes embedded tabs, so every
    # literal tab in the row is a column boundary.
    row=${row//$tab/$field_sep}
    IFS=$field_sep read -r name kind output_dir <<<"$row"

    [[ -n "$name" ]] || die "Every target must have a name."
    [[ -n "$output_dir" ]] || die "Target '$name' is missing output_dir."
    [[ "$kind" == "guild" || "$kind" == "dms" ]] || die "Target '$name' has unsupported kind '$kind'."
    path_is_within_root "$archive_root" "$output_dir" \
      || die "Target '$name' output_dir '$output_dir' is outside archive_root '$archive_root'."
  done < <(jq -r '.targets[] | [.name, (.kind // "guild"), .output_dir] | @tsv' "$config_path")
}
|
|
|
|
# Print the channel IDs embedded in existing archive file names
# ("... [<16-22 digit id>].json") below a target directory, one per line.
# Files under any .dce-meta directory are ignored. IDs may repeat.
# Arguments: $1 - target output directory (a missing directory yields nothing)
load_archive_seed_channel_ids() {
  local output_dir=$1
  local file_path
  local id_suffix_re='\[([0-9]{16,22})\]\.json$'

  [[ -d "$output_dir" ]] || return 0

  while IFS= read -r -d '' file_path; do
    # ${file_path##*/} is the base name; avoids one fork per file.
    if [[ "${file_path##*/}" =~ $id_suffix_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)
}
|
|
|
|
# Seed the channel map of a target from archive files that already exist.
# For every "... [<id>].json" file whose channel ID has no map entry yet, the
# file is registered as that channel's destination, provided it parses as
# JSON and its embedded channel.id (when present) agrees with the file name.
# Arguments: $1 - target output directory (a missing directory is a no-op)
bootstrap_channel_map_from_archives() {
  local output_dir=$1
  local map_file file_path channel_id mapped_path embedded_channel_id bootstrapped=0
  local id_suffix_re='\[([0-9]{16,22})\]\.json$'

  [[ -d "$output_dir" ]] || return 0

  map_file=$(get_channel_map_path "$output_dir")
  ensure_json_file "$map_file"

  while IFS= read -r -d '' file_path; do
    [[ "${file_path##*/}" =~ $id_suffix_re ]] || continue
    channel_id=${BASH_REMATCH[1]}

    # Existing mappings always win; bootstrap only fills gaps.
    mapped_path=$(jq -r --arg channel_id "$channel_id" '.[$channel_id] // empty' "$map_file")
    [[ -z "$mapped_path" ]] || continue

    path_is_within_root "$output_dir" "$file_path" || continue

    # One parse both validates the JSON and extracts the embedded ID; exports
    # can be large, so a separate `jq empty` pass would double the cost.
    # Files jq cannot process are skipped.
    embedded_channel_id=$(jq -r '.channel.id // empty' "$file_path" 2>/dev/null) || continue
    if [[ -n "$embedded_channel_id" && "$embedded_channel_id" != "$channel_id" ]]; then
      log "Skipping bootstrap for '$file_path': filename channel id $channel_id does not match export metadata ($embedded_channel_id)."
      continue
    fi

    update_channel_map "$map_file" "$channel_id" "$file_path"
    bootstrapped=$((bootstrapped + 1))
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)

  if (( bootstrapped > 0 )); then
    log "Bootstrapped $bootstrapped channel map entries from existing archives under $output_dir."
  fi
}
|
|
|
|
# Filter "<id> | <name>" listing lines from stdin into "<id>\t<name>" rows.
# Lines that do not start with a 16-22 digit ID are dropped.
parse_two_column_listing() {
  local line
  local row_re='^([0-9]{16,22})[[:space:]]+\|[[:space:]]+(.+)$'

  # The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $row_re ]]; then
      printf '%s\t%s\n' "${BASH_REMATCH[1]}" "${BASH_REMATCH[2]}"
    fi
  done
}
|
|
|
|
# Extract channel IDs from listing lines on stdin, one ID per output line.
# A line qualifies when, after optional whitespace and an optional "*"
# marker, it carries a 16-22 digit ID followed by " | ".
parse_channel_listing() {
  local line
  local row_re='^[[:space:]]*\*?[[:space:]]*([0-9]{16,22})[[:space:]]+\|[[:space:]]+'

  # The `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $row_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done
}
|
|
|
|
# Make sure a JSON object file exists, creating parent directories as needed.
# A missing OR zero-byte file is initialised to "{}": jq emits nothing for
# empty input, so an empty map could otherwise never receive entries.
# Arguments: $1 - file path
ensure_json_file() {
  local file_path=$1

  mkdir -p "$(dirname "$file_path")"

  if [[ ! -s "$file_path" ]]; then
    printf '{}\n' >"$file_path"
  fi
}
|
|
|
|
# Record "channel ID -> destination path" in a channel-map JSON file.
# The new content is written to a temp file in the same directory and then
# renamed over the map.
# Arguments: $1 - map file; $2 - channel ID; $3 - destination path
update_channel_map() {
  local map_file=$1
  local channel_id=$2
  local destination_path=$3
  local map_dir temp_file

  map_dir=$(dirname "$map_file")
  mkdir -p "$map_dir"
  temp_file=$(mktemp "$map_dir/channel-map.XXXXXX")

  # This function is reached from inside command substitutions, where bash
  # clears errexit. Without an explicit check, a failing jq would let the
  # empty temp file be renamed over the existing map.
  if ! jq --arg channel_id "$channel_id" --arg destination_path "$destination_path" \
    '.[$channel_id] = $destination_path' \
    "$map_file" >"$temp_file"; then
    rm -f "$temp_file"
    die "Failed to update channel map '$map_file' for channel $channel_id."
  fi

  mv "$temp_file" "$map_file"
}
|
|
|
|
# Print the channel-map path of a target directory (no trailing newline).
# Arguments: $1 - target output directory
get_channel_map_path() {
  local output_dir=$1

  printf '%s/.dce-meta/channel-map.json' "$output_dir"
}
|
|
|
|
# Derive the default archive path for a fresh export from its own metadata:
#   <output_dir>/<guild> - <category> - <channel> [<channel id>].json
# Empty name parts are omitted; "channel" is used when all of them are empty.
# Aborts when the export carries no channel.id.
# Arguments: $1 - target output directory; $2 - export JSON file
default_destination_path_from_export() {
  local output_dir=$1
  local export_path=$2
  local guild_name category_name channel_name channel_id
  local escaped_guild_name escaped_category_name escaped_channel_name base_name

  # Exports can be large, so read all four fields in a single jq pass.
  # Line breaks inside a value are turned into spaces here (the file-name
  # escaping would do the same) so that each field occupies exactly one line.
  {
    IFS= read -r guild_name
    IFS= read -r category_name
    IFS= read -r channel_name
    IFS= read -r channel_id
  } < <(
    jq -r '
      [.guild.name, .channel.category, .channel.name, .channel.id][]
      | (. // "") | tostring
      | split("\n") | join(" ") | split("\r") | join(" ")
    ' "$export_path"
  ) || true

  [[ -n "$channel_id" ]] || die "Export '$export_path' is missing channel.id metadata."

  escaped_guild_name=$(escape_file_name_component "$guild_name")
  escaped_category_name=$(escape_file_name_component "$category_name")
  escaped_channel_name=$(escape_file_name_component "$channel_name")

  base_name=$(join_name_parts "$escaped_guild_name" "$escaped_category_name" "$escaped_channel_name")
  [[ -n "$base_name" ]] || base_name="channel"

  printf '%s/%s [%s].json\n' "$output_dir" "$base_name" "$channel_id"
}
|
|
|
|
# Resolve the archive file a channel should be merged into, in this order:
#   1. the channel map entry, if present (must stay inside the target root);
#   2. a single existing "... [<id>].json" archive, which is then mapped;
#   3. a path derived from a fresh export, when one is supplied.
# Prints the path, or nothing when no destination is known yet and no export
# was given. Aborts on ambiguity or when a path escapes the target root.
# Arguments: $1 - target output directory
#            $2 - channel ID
#            $3 - (optional) fresh export used to derive a new destination
resolve_destination_path() {
  local output_dir=$1
  local channel_id=$2
  local export_path=${3:-}
  local map_file mapped_path candidate
  local -a existing_candidates=()

  mkdir -p "$output_dir/.dce-meta"
  map_file=$(get_channel_map_path "$output_dir")
  ensure_json_file "$map_file"

  mapped_path=$(jq -r --arg channel_id "$channel_id" '.[$channel_id] // empty' "$map_file")
  if [[ -n "$mapped_path" ]]; then
    path_is_within_root "$output_dir" "$mapped_path" \
      || die "Mapped destination '$mapped_path' for channel $channel_id is outside target root '$output_dir'."
    printf '%s\n' "$mapped_path"
    return 0
  fi

  # Match the "[<id>].json" suffix against the FILE NAME only, as a literal
  # string. A substring match on the whole path would also pick up unrelated
  # files stored below a directory whose name contains the suffix.
  while IFS= read -r -d '' candidate; do
    if [[ "${candidate##*/}" == *"[$channel_id].json" ]]; then
      existing_candidates+=("$candidate")
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0)

  if (( ${#existing_candidates[@]} > 1 )); then
    die "Found multiple existing JSON exports for channel $channel_id under $output_dir; add an explicit mapping in $(get_channel_map_path "$output_dir")."
  fi

  if (( ${#existing_candidates[@]} == 1 )); then
    jq empty "${existing_candidates[0]}" >/dev/null 2>&1 \
      || die "Existing export is not valid JSON: ${existing_candidates[0]}"
    assert_export_channel_identity "${existing_candidates[0]}" "$channel_id"
    update_channel_map "$map_file" "$channel_id" "${existing_candidates[0]}"
    printf '%s\n' "${existing_candidates[0]}"
    return 0
  fi

  [[ -n "$export_path" ]] || return 0

  mapped_path=$(default_destination_path_from_export "$output_dir" "$export_path")
  path_is_within_root "$output_dir" "$mapped_path" \
    || die "Derived destination '$mapped_path' for channel $channel_id is outside target root '$output_dir'."
  update_channel_map "$map_file" "$channel_id" "$mapped_path"
  printf '%s\n' "$mapped_path"
}
|
|
|
|
# Print the channel.id stored in an export file (nothing when absent).
# Arguments: $1 - export JSON file
channel_id_from_export() {
  local export_path=$1

  jq -r '.channel.id // empty' "$export_path"
}
|
|
|
|
# Abort unless an export file belongs to the expected channel.
# Arguments: $1 - export JSON file; $2 - expected channel ID
assert_export_channel_identity() {
  local export_path=$1
  local expected_channel_id=$2
  local actual_channel_id

  actual_channel_id=$(channel_id_from_export "$export_path")

  if [[ -z "$actual_channel_id" ]]; then
    die "Export '$export_path' is missing channel.id metadata."
  fi

  if [[ "$actual_channel_id" != "$expected_channel_id" ]]; then
    die "Export '$export_path' belongs to channel '$actual_channel_id', expected '$expected_channel_id'."
  fi
}
|
|
|
|
# Print the highest message ID stored in an export file.
# Prints nothing when the file does not exist or holds no messages.
# Arguments: $1 - export JSON file (may be empty or missing)
last_message_id() {
  local export_path=$1

  [[ -f "$export_path" ]] || return 0

  # IDs are compared by (digit count, text) rather than as plain strings:
  # a lexicographic comparison ranks an 18-digit ID starting with "9" above
  # a newer 19-digit ID starting with "1". Numeric conversion is avoided
  # because these values exceed what a double represents exactly.
  jq -r '
    (.messages // [])
    | if length == 0 then empty
      else (max_by(.id | tostring | [length, .]) | .id)
      end
  ' "$export_path"
}
|
|
|
|
# Print the number of entries in the messages array of an export file
# (0 when the key is absent).
# Arguments: $1 - export JSON file
message_count() {
  local export_path=$1

  jq -r '(.messages | length) // 0' "$export_path"
}
|
|
|
|
# Replace an archive file with a merged export, refusing unsafe replacements.
# The merge is rejected when it holds fewer messages than the archive, is not
# valid JSON, or belongs to a different channel. The replacement is staged
# next to the destination and renamed into place; the staged copy is removed
# again whenever a check fails.
# Arguments: $1 - existing archive file; $2 - merged export file
commit_merged_export() {
  local destination_path=$1
  local merged_path=$2
  local before_count after_count atomic_path expected_channel_id

  before_count=$(message_count "$destination_path")
  after_count=$(message_count "$merged_path")
  if (( after_count < before_count )); then
    die "Merge would shrink archive '$destination_path' ($before_count -> $after_count messages). Existing file was not modified."
  fi

  expected_channel_id=$(channel_id_from_export "$destination_path")

  # NOTE: `mktemp -p DIR TEMPLATE` is the GNU coreutils form.
  atomic_path=$(mktemp -p "$(dirname "$destination_path")" ".$(basename "$destination_path").dce-replace.XXXXXX")

  if ! cp "$merged_path" "$atomic_path"; then
    rm -f "$atomic_path"
    die "Failed to stage merged export next to '$destination_path'."
  fi

  if ! jq empty "$atomic_path" >/dev/null 2>&1; then
    rm -f "$atomic_path"
    die "Merged export is not valid JSON: $merged_path"
  fi

  # The identity check aborts via die; run it in a subshell so the staged
  # copy can still be cleaned up before this shell exits.
  if ! (assert_export_channel_identity "$atomic_path" "$expected_channel_id"); then
    rm -f "$atomic_path"
    die "Refusing to replace '$destination_path': staged merge failed the channel identity check."
  fi

  mv -f "$atomic_path" "$destination_path"
}
|
|
|
|
# Merge an incremental export into an existing one and write the result.
# Top-level keys of the incremental export override the existing ones;
# messages are de-duplicated by ID (the incremental copy wins) and ordered by
# (timestamp, id). dateRange prefers the existing bounds, exportedAt prefers
# the incremental value, and messageCount is recomputed when either input
# carries it.
# Arguments: $1 - existing export; $2 - incremental export; $3 - output file
merge_exports() {
  local existing_path=$1
  local incremental_path=$2
  local merged_path=$3

  # `tostring` keeps the de-duplication working should an ID be stored as a
  # JSON number, because object keys must be strings.
  jq -s '
    .[0] as $existing
    | .[1] as $incremental
    | ($existing + $incremental)
    | .messages = (
        reduce (($existing.messages // []) + ($incremental.messages // []))[] as $message
          ({};
            .[$message.id | tostring] = $message
          )
        | to_entries
        | map(.value)
        | sort_by(.timestamp, .id)
      )
    | .dateRange = {
        after: ($existing.dateRange.after // $incremental.dateRange.after),
        before: ($existing.dateRange.before // $incremental.dateRange.before)
      }
    | .exportedAt = ($incremental.exportedAt // $existing.exportedAt)
    | if ($existing | has("messageCount")) or ($incremental | has("messageCount"))
      then .messageCount = (.messages | length)
      else .
      end
  ' "$existing_path" "$incremental_path" >"$merged_path"
}
|
|
|
|
# Print the guild listing as "<id>\t<name>" rows, querying the CLI only once
# per run and caching the parsed result in CACHE_ROOT/guilds.tsv.
# Aborts when guild discovery fails.
load_guild_cache() {
  local output
  local cache_file="$CACHE_ROOT/guilds.tsv"

  if [[ ! -f "$cache_file" ]]; then
    if ! output=$("$CLI_BIN" guilds 2>&1); then
      die "Guild discovery failed. If you are using a bot token, configure explicit guild_ids/channel_ids for each non-DM target or switch to a user token. CLI output: $output"
    fi

    printf '%s\n' "$output" | parse_two_column_listing >"$cache_file"
  fi

  cat "$cache_file"
}
|
|
|
|
# Print the direct-message channel IDs, one per line, querying the CLI only
# once per run and caching the parsed result in CACHE_ROOT/dms.txt.
# Aborts when DM discovery fails.
load_dm_channel_cache() {
  local output
  local cache_file="$CACHE_ROOT/dms.txt"

  if [[ ! -f "$cache_file" ]]; then
    if ! output=$("$CLI_BIN" dm 2>&1); then
      die "DM discovery failed. Bot tokens cannot read direct messages; disable the DM target or switch to a user token. CLI output: $output"
    fi

    printf '%s\n' "$output" | parse_channel_listing >"$cache_file"
  fi

  cat "$cache_file"
}
|
|
|
|
# Print the channel IDs of one guild, one per line, querying the CLI only
# once per run for each (guild, voice, threads) combination and caching the
# parsed result under CACHE_ROOT. Aborts when channel discovery fails.
# Arguments: $1 - guild ID
#            $2 - value passed to the CLI's --include-vc
#            $3 - value passed to the CLI's --include-threads
load_guild_channel_cache() {
  local guild_id=$1
  local include_voice=$2
  local include_threads=$3
  local cache_file="$CACHE_ROOT/channels_${guild_id}_${include_voice}_${include_threads}.txt"
  local output

  if [[ ! -f "$cache_file" ]]; then
    # Capture first and write the cache only on success. Piping the CLI
    # straight into the cache file would leave an empty cache behind after a
    # failure, which later lookups would treat as "guild has no channels".
    if ! output=$(
      "$CLI_BIN" channels \
        --guild "$guild_id" \
        --include-vc "$include_voice" \
        --include-threads "$include_threads"
    ); then
      die "Channel discovery failed for guild $guild_id."
    fi

    printf '%s\n' "$output" | parse_channel_listing >"$cache_file"
  fi

  cat "$cache_file"
}
|
|
|
|
# Print the IDs of all discovered guilds whose normalized name equals,
# contains, or is contained in any of the normalized patterns.
# Arguments: name patterns (no patterns yields no output)
resolve_guild_ids_from_patterns() {
  local patterns=("$@")
  local guild_id guild_name normalized_guild normalized_pattern pattern
  local -a normalized_patterns=()

  (( ${#patterns[@]} > 0 )) || return 0

  # Normalize each pattern once instead of once per guild.
  for pattern in "${patterns[@]}"; do
    normalized_pattern=$(normalize_name "$pattern")
    if [[ -n "$normalized_pattern" ]]; then
      normalized_patterns+=("$normalized_pattern")
    fi
  done

  (( ${#normalized_patterns[@]} > 0 )) || return 0

  while IFS=$'\t' read -r guild_id guild_name; do
    normalized_guild=$(normalize_name "$guild_name")

    # A name without any alphanumerics normalizes to "", and the
    # "pattern contains guild" test below would then match every pattern.
    [[ -n "$normalized_guild" ]] || continue

    for normalized_pattern in "${normalized_patterns[@]}"; do
      if [[ "$normalized_guild" == *"$normalized_pattern"* || "$normalized_pattern" == *"$normalized_guild"* ]]; then
        printf '%s\n' "$guild_id"
        break
      fi
    done
  done < <(load_guild_cache)
}
|
|
|
|
# Print the guild IDs a target is allowed to touch, one per line.
# Explicit guild_ids win; otherwise guild_name_patterns are resolved against
# the discovered guilds and must match at most one guild.
# Arguments: $1 - target definition as a JSON string
# NOTE(review): IDs are converted with jq `tostring`; IDs stored as JSON
# numbers may lose precision depending on the jq version - confirm the config
# stores them as strings.
resolve_configured_guilds() {
  local target_json=$1
  local -a configured_guild_ids name_patterns resolved_guild_ids

  mapfile -t configured_guild_ids < <(jq -r '.guild_ids[]? | tostring' <<<"$target_json")
  mapfile -t name_patterns < <(jq -r '.guild_name_patterns[]?' <<<"$target_json")

  if (( ${#configured_guild_ids[@]} > 0 )); then
    printf '%s\n' "${configured_guild_ids[@]}" | sort -u
    return 0
  fi

  mapfile -t resolved_guild_ids < <(resolve_guild_ids_from_patterns "${name_patterns[@]}" | sort -u)

  case ${#resolved_guild_ids[@]} in
    0)
      return 0
      ;;
    1)
      printf '%s\n' "${resolved_guild_ids[@]}"
      ;;
    *)
      # A loose name match hitting several guilds is too risky to act on.
      die "Target '$(jq -r '.name' <<<"$target_json")' matched multiple guilds (${resolved_guild_ids[*]}). Configure explicit guild_ids to make it safe."
      ;;
  esac
}
|
|
|
|
# Print the final channel selection for a target: the --channel overrides
# when given (after checking they lie inside the allowed set), otherwise the
# allowed set itself. Prints nothing for an empty allowed set; a bare
# `printf '%s\n'` would emit a blank line that callers read as one channel.
# Arguments: $1 - NAME of the array holding the allowed channel IDs
_emit_channel_selection() {
  local allowed_name=$1
  local -n allowed_selection_ref=$1

  if (( ${#OVERRIDE_CHANNELS[@]} > 0 )); then
    assert_subset "Channel override" OVERRIDE_CHANNELS "$allowed_name"
    printf '%s\n' "${OVERRIDE_CHANNELS[@]}" | sort -u
  elif (( ${#allowed_selection_ref[@]} > 0 )); then
    printf '%s\n' "${allowed_selection_ref[@]}"
  fi
}

# Print the channel IDs to process for a target, one per line.
# The allowed set comes from the first source that applies:
#   1. kind "dms": the discovered DM channels;
#   2. explicit channel_ids in the target;
#   3. channel IDs seeded from existing archive file names;
#   4. channel discovery in the configured (or --guild selected) guilds.
# --guild is only accepted in case 4; --channel narrows any case.
# Arguments: $1 - target definition as a JSON string
#            $2 - config defaults as a JSON string
resolve_target_channels() {
  local target_json=$1
  local defaults_json=$2
  local kind include_voice include_threads
  local target_name output_dir guild_id
  local -a configured_channel_ids configured_guild_ids seeded_channel_ids allowed_channels selected_guilds

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  kind=$(jq -r '.kind // "guild"' <<<"$target_json")
  include_voice=$(jq -r --argjson defaults "$defaults_json" '(.include_voice_channels // $defaults.include_voice_channels // false) | tostring' <<<"$target_json")
  include_threads=$(jq -r --argjson defaults "$defaults_json" '.include_threads // $defaults.include_threads // "all"' <<<"$target_json")

  mapfile -t configured_channel_ids < <(jq -r '.channel_ids[]? | tostring' <<<"$target_json")
  mapfile -t seeded_channel_ids < <(load_archive_seed_channel_ids "$output_dir" | sort -u)

  if [[ "$kind" == "dms" ]]; then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "DM targets do not support --guild overrides."
    mapfile -t allowed_channels < <(load_dm_channel_cache | sort -u)
    _emit_channel_selection allowed_channels
    return 0
  fi

  if (( ${#configured_channel_ids[@]} > 0 )); then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "Channel-scoped targets do not support --guild overrides."
    mapfile -t allowed_channels < <(printf '%s\n' "${configured_channel_ids[@]}" | sort -u)
    _emit_channel_selection allowed_channels
    return 0
  fi

  if (( ${#seeded_channel_ids[@]} > 0 )); then
    (( ${#OVERRIDE_GUILDS[@]} == 0 )) || die "Archive-seeded target '$target_name' does not support --guild overrides."
    mapfile -t allowed_channels < <(printf '%s\n' "${seeded_channel_ids[@]}" | sort -u)
    _emit_channel_selection allowed_channels
    return 0
  fi

  mapfile -t configured_guild_ids < <(resolve_configured_guilds "$target_json")
  if (( ${#configured_guild_ids[@]} == 0 )); then
    return 0
  fi

  if (( ${#OVERRIDE_GUILDS[@]} > 0 )); then
    assert_subset "Guild override" OVERRIDE_GUILDS configured_guild_ids
    selected_guilds=("${OVERRIDE_GUILDS[@]}")
  else
    selected_guilds=("${configured_guild_ids[@]}")
  fi

  mapfile -t allowed_channels < <(
    for guild_id in "${selected_guilds[@]}"; do
      load_guild_channel_cache "$guild_id" "$include_voice" "$include_threads"
    done | sort -u
  )

  _emit_channel_selection allowed_channels
}
|
|
|
|
# Verify that a target can be scraped without touching its archives:
# bootstrap the channel map, resolve the target's channels, and run one
# probe export (restricted to before 1970-01-01) of the first channel into a
# throw-away directory. Aborts when no channel resolves or the probe fails.
# Arguments: $1 - target definition as a JSON string
#            $2 - config defaults as a JSON string
preflight_target() {
  local target_json=$1
  local defaults_json=$2
  local target_name output_dir
  local probe_channel_id probe_dir probe_output probe_status=0
  local -a channel_ids

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  bootstrap_channel_map_from_archives "$output_dir"

  mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json")
  if (( ${#channel_ids[@]} == 0 )); then
    die "Target '$target_name' resolved no channels during preflight."
  fi

  probe_channel_id="${channel_ids[0]}"
  probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/dce-preflight.${probe_channel_id}.XXXXXX")
  probe_output="$probe_dir/probe.json"

  "$CLI_BIN" export --channel "$probe_channel_id" --format Json --output "$probe_output" --before "1970-01-01" \
    || probe_status=$?

  # The probe output is never kept, whatever the outcome.
  rm -rf "$probe_dir"

  if (( probe_status != 0 )); then
    die "Target '$target_name' failed authenticated preflight on channel '$probe_channel_id'."
  fi

  log "Preflight ok for target '$target_name': ${#channel_ids[@]} channel(s) resolved for $output_dir."
}
|
|
|
|
# Incrementally export every channel of a target into its archive directory.
# Per channel: resolve the destination archive, export only messages after
# the newest archived one, validate the export, then either move it into
# place (first export) or merge it into the existing archive.
# Aborts on the first channel that fails.
# Arguments: $1 - target definition as a JSON string
#            $2 - config defaults as a JSON string
scrape_target() {
  local target_json=$1
  local defaults_json=$2
  local target_name output_dir destination_path after_id temp_dir temp_export temp_merged
  local latest_batch_count channel_id
  local -a channel_ids export_command

  target_name=$(jq -r '.name' <<<"$target_json")
  output_dir=$(jq -r '.output_dir' <<<"$target_json")
  mkdir -p "$output_dir"
  bootstrap_channel_map_from_archives "$output_dir"

  mapfile -t channel_ids < <(resolve_target_channels "$target_json" "$defaults_json")
  if (( ${#channel_ids[@]} == 0 )); then
    die "Target '$target_name' resolved no channels."
  fi

  log "Target '$target_name': processing ${#channel_ids[@]} channel(s) into $output_dir."

  for channel_id in "${channel_ids[@]}"; do
    # Empty when the channel has neither a map entry nor an archive yet.
    destination_path=$(resolve_destination_path "$output_dir" "$channel_id")
    if [[ -n "$destination_path" ]]; then
      mkdir -p "$(dirname "$destination_path")"
    fi

    if [[ -n "$destination_path" && -f "$destination_path" ]]; then
      jq empty "$destination_path" >/dev/null 2>&1 || die "Existing export is not valid JSON: $destination_path"
      assert_export_channel_identity "$destination_path" "$channel_id"
    fi

    after_id=$(last_message_id "$destination_path")
    mkdir -p "$output_dir/.dce-temp"
    temp_dir=$(mktemp -d "$output_dir/.dce-temp/export.${channel_id}.XXXXXX")
    temp_export="$temp_dir/export.json"
    temp_merged="$temp_dir/merged.json"

    export_command=("$CLI_BIN" export --channel "$channel_id" --format Json --output "$temp_export")
    if [[ -n "$after_id" ]]; then
      export_command+=(--after "$after_id")
    fi

    log "Exporting channel $channel_id for target '$target_name'${after_id:+ after message $after_id}."

    if ! "${export_command[@]}"; then
      rm -rf "$temp_dir"
      die "Channel $channel_id failed for target '$target_name'."
    fi

    # On the validation failures below the temp directory is left in place;
    # the error messages name files inside it.
    jq empty "$temp_export" >/dev/null 2>&1 || die "Incremental export is not valid JSON: $temp_export"
    assert_export_channel_identity "$temp_export" "$channel_id"

    if [[ -z "$destination_path" ]]; then
      # First sighting of this channel: derive the archive name from the export.
      destination_path=$(resolve_destination_path "$output_dir" "$channel_id" "$temp_export")
      mkdir -p "$(dirname "$destination_path")"
    fi

    latest_batch_count=$(message_count "$temp_export")
    if [[ ! -f "$destination_path" ]]; then
      mv "$temp_export" "$destination_path"
      rm -rf "$temp_dir"
      continue
    fi

    # Nothing new: leave the archive untouched.
    if (( latest_batch_count == 0 )); then
      rm -rf "$temp_dir"
      continue
    fi

    merge_exports "$destination_path" "$temp_export" "$temp_merged"
    [[ -s "$temp_merged" ]] || die "Merged export is empty for channel $channel_id."
    jq empty "$temp_merged" >/dev/null 2>&1 || die "Merged export is not valid JSON: $temp_merged"
    assert_export_channel_identity "$temp_merged" "$channel_id"
    commit_merged_export "$destination_path" "$temp_merged"
    rm -rf "$temp_dir"
  done

  log "Target '$target_name': scrape completed successfully."
}
|
|
|
|
# Validate a config and print one "<name>\t<kind>\t<output_dir>" row per
# target (kind defaults to "guild").
# Arguments: $1 - config file path
list_targets() {
  local config_path=$1

  validate_config_contract "$config_path"
  jq -r '.targets[] | [.name, (.kind // "guild"), .output_dir] | @tsv' "$config_path"
}
|
|
|
|
# Print the selected target definitions as compact JSON, one per line.
# With explicit names, every target carrying one of those names is selected
# (regardless of its "enabled" flag); without names, every target that is
# not disabled is selected.
# Arguments: $1 - config file path; remaining arguments - requested names
load_selected_targets() {
  local config_path=$1
  shift
  local -a requested_targets=("$@")
  local target_names_json

  if (( ${#requested_targets[@]} == 0 )); then
    jq -c '.targets[] | select(.enabled != false)' "$config_path"
    return
  fi

  target_names_json=$(json_array_from_args "${requested_targets[@]}")
  jq -c --argjson selected_target_names "$target_names_json" \
    '.targets[] | select(.name as $name | $selected_target_names | index($name))' \
    "$config_path"
}
|
|
|
|
# Parse the options shared by the scrape and preflight subcommands.
# --guild / --channel values are appended to the OVERRIDE_GUILDS and
# OVERRIDE_CHANNELS globals; --help prints the usage text and exits 0.
# Aborts on an unknown option or a missing option value.
# Arguments: $1 - mode name (used in error messages)
#            $2 - NAME of the variable receiving the --config path
#            $3 - NAME of the array receiving the --target names
#            remaining arguments - the command line to parse
parse_target_options() {
  local mode=$1
  shift
  local -n config_path_ref=$1
  local -n requested_targets_ref=$2
  shift 2

  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        config_path_ref=$2
        shift 2
        ;;
      --target)
        [[ $# -ge 2 ]] || die "Missing value for --target."
        requested_targets_ref+=("$2")
        shift 2
        ;;
      --guild)
        [[ $# -ge 2 ]] || die "Missing value for --guild."
        OVERRIDE_GUILDS+=("$2")
        shift 2
        ;;
      --channel)
        [[ $# -ge 2 ]] || die "Missing value for --channel."
        OVERRIDE_CHANNELS+=("$2")
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        die "Unknown $mode option: $1"
        ;;
    esac
  done
}
|
|
|
|
# Run the scrape or preflight subcommand: parse options, validate the config
# and token, select the targets, and process each selected target.
# A per-run cache directory is created and removed again on exit.
# Arguments: $1 - mode ("preflight" runs preflight_target, anything else
#                 runs scrape_target); remaining arguments - command line
run_target_mode() {
  local mode=$1
  local config_path defaults_json requested target_json
  local -a requested_targets=() unique_targets=() selected_targets=()
  shift

  config_path=$(default_config_path)
  parse_target_options "$mode" config_path requested_targets "$@"

  # Collapse repeated --target names; otherwise the count comparison below
  # reports a name as missing although it exists in the config.
  for requested in "${requested_targets[@]}"; do
    contains_value "$requested" "${unique_targets[@]}" || unique_targets+=("$requested")
  done
  requested_targets=("${unique_targets[@]}")

  require_command jq
  validate_config_contract "$config_path"
  [[ -n "${DISCORD_TOKEN:-}" ]] || die "DISCORD_TOKEN is not set."

  defaults_json=$(jq -c '.defaults // {}' "$config_path")
  mapfile -t selected_targets < <(load_selected_targets "$config_path" "${requested_targets[@]}")

  if (( ${#requested_targets[@]} > 0 && ${#selected_targets[@]} != ${#requested_targets[@]} )); then
    die "One or more requested --target names are not present in $config_path."
  fi

  if (( ${#selected_targets[@]} == 0 )); then
    die "No enabled targets are available in $config_path."
  fi

  if (( (${#OVERRIDE_GUILDS[@]} > 0 || ${#OVERRIDE_CHANNELS[@]} > 0) && ${#selected_targets[@]} != 1 )); then
    die "When using --guild or --channel overrides, select exactly one --target."
  fi

  CACHE_ROOT=$(mktemp -d "${TMPDIR:-/tmp}/dce-scrape.XXXXXX")
  trap 'rm -rf "$CACHE_ROOT"' EXIT

  for target_json in "${selected_targets[@]}"; do
    if [[ "$mode" == "preflight" ]]; then
      preflight_target "$target_json" "$defaults_json"
    else
      scrape_target "$target_json" "$defaults_json"
    fi
  done
}
|
|
|
|
# Entry point: dispatch on the first argument.
# help/-h/--help (or no argument) prints usage; list-targets, preflight and
# scrape run the matching wrapper mode; anything else replaces this process
# with the raw CLI, passing all arguments through.
main() {
  local subcommand=${1:-help}
  local config_path
  shift || true

  case "$subcommand" in
    help | -h | --help)
      usage
      ;;
    list-targets)
      config_path=$(default_config_path)
      while (($#)); do
        case "$1" in
          --config)
            [[ $# -ge 2 ]] || die "Missing value for --config."
            config_path=$2
            shift 2
            ;;
          *)
            die "Unknown list-targets option: $1"
            ;;
        esac
      done
      list_targets "$config_path"
      ;;
    preflight)
      run_target_mode preflight "$@"
      ;;
    scrape)
      run_target_mode scrape "$@"
      ;;
    *)
      exec "$CLI_BIN" "$subcommand" "$@"
      ;;
  esac
}
|
|
|
|
# Run main only when the file is executed directly, so that it can also be
# sourced (for example by tests) without side effects.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main "$@"
fi
|