mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
feat(scrape): add Documents verify and auth bootstrap helpers
Add scripts to confirm enabled targets map to seeded ~/Documents archives and to create scrape.env from exported DISCORD_TOKEN credentials. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
8c14dbbf45
commit
9e55378c84
|
|
@ -59,6 +59,23 @@ export DISCORD_TOKEN="your-token-here"
|
|||
# optional: export DISCORD_TOKEN_FILE=/path/to/token/file
|
||||
```
|
||||
|
||||
To generate `scrape.env` from credentials already exported in your shell (the file is written with mode `600`; no manual editing required):
|
||||
|
||||
```bash
|
||||
export DISCORD_TOKEN="your-token-here"
|
||||
./scripts/setup-scrape-auth.sh
|
||||
```
|
||||
|
||||
### 2b. Verify existing ~/Documents archives
|
||||
|
||||
Before the first incremental run, confirm each enabled target points at the correct on-disk server folder and already has seeded channel JSON exports (the scraper appends in place and bootstraps `.dce-meta/channel-map.json` from these files):
|
||||
|
||||
```bash
|
||||
./scripts/verify-documents-archives.sh --config config/scrape-targets.json
|
||||
```
|
||||
|
||||
Each enabled target should report non-zero **JSON** and **SEEDED** counts for its archive folder under `~/Documents/<server>/` (for example `/home/brunner56/Documents/<server>/`).
|
||||
|
||||
### 3. Run Preflight Validation
|
||||
|
||||
Before installing cron, validate your setup:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,28 @@
|
|||
---
|
||||
title: fix: Verify Documents targets and bootstrap scrape auth
|
||||
type: fix
|
||||
status: completed
|
||||
date: 2026-05-28
|
||||
origin: LFG repeat — confirm ~/Documents/** append paths and unblock auth setup
|
||||
depends_on: docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md
|
||||
---
|
||||
|
||||
# fix: Verify Documents targets and bootstrap scrape auth
|
||||
|
||||
## Summary
|
||||
|
||||
Plan 006 landed append-safe scraping. This pass adds operator tooling so you can (1) verify every enabled target maps to an on-disk `~/Documents/<server>/` tree with seeded channel archives, and (2) create `scrape.env` without manual file editing when a token is already exported.
|
||||
|
||||
## Requirements
|
||||
|
||||
| ID | Requirement | Files |
|
||||
|----|-------------|-------|
|
||||
| V1 | `verify-documents-archives.sh` reports per-target output_dir, JSON count, seeded channel IDs, channel-map coverage | `scripts/verify-documents-archives.sh`, smoke test |
|
||||
| V2 | `setup-scrape-auth.sh` writes `scrape.env` from `DISCORD_TOKEN` or `DISCORD_TOKEN_FILE` (chmod 600), idempotent | `scripts/setup-scrape-auth.sh`, smoke test |
|
||||
| V3 | Document verify + auth bootstrap in setup guide | `.docs/Recurring-Scrape-Setup.md` |
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- Verify script runs against real `config/scrape-targets.json` and exits 0 when every enabled target has an archive dir containing JSON exports with seeded channel IDs
|
||||
- Auth bootstrap creates scrape.env when token env vars are set
|
||||
- Smoke tests pass
|
||||
106
scripts/setup-scrape-auth.sh
Executable file
106
scripts/setup-scrape-auth.sh
Executable file
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env bash
#
# setup-scrape-auth.sh: create scrape.env from credentials that are already
# exported in the environment.
#
# Environment:
#   DCE_REPO_ROOT  repository root (default: parent of this script's directory)
#   DCE_ENV_FILE   env file to write (default: $REPO_ROOT/scrape.env)

set -Eeuo pipefail

# Physical directory that holds this script.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(cd "$SCRIPT_DIR" && pwd -P)

# A non-empty DCE_REPO_ROOT wins; otherwise the repo root is one level up.
if [[ -n "${DCE_REPO_ROOT:-}" ]]; then
  REPO_ROOT=$DCE_REPO_ROOT
else
  REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd -P)
fi

# ENV_FILE may be replaced later by the --env-file option.
ENV_FILE=${DCE_ENV_FILE:-"$REPO_ROOT/scrape.env"}
EXAMPLE_FILE=$REPO_ROOT/scrape.env.example
|
||||
|
||||
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  local script_name
  script_name=$(basename "$0")

  printf '%s\n' \
    'Usage:' \
    "$script_name [--env-file PATH] [--force]" \
    '' \
    'Create scrape.env for recurring Discord scrapes.' \
    '' \
    'Reads credentials from the environment (never prompts for a token in the terminal):' \
    'DISCORD_TOKEN Write directly into scrape.env' \
    'DISCORD_TOKEN_FILE Write a pointer to an existing token file' \
    '' \
    'If scrape.env already exists and --force is not set, the script exits without changes.' \
    '' \
    'Examples:' \
    'export DISCORD_TOKEN="your-token"' \
    "$script_name" \
    '' \
    'export DISCORD_TOKEN_FILE="$HOME/.config/discord-token"' \
    "$script_name"
}
|
||||
|
||||
#######################################
# Report a fatal error and terminate the script.
# Arguments: message words (joined into one line)
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  local message="$*"
  {
    printf 'ERROR: %s\n' "$message"
  } 1>&2
  exit 1
}
|
||||
|
||||
#######################################
# Render an env template to stdout with exported credentials filled in.
# A `DISCORD_TOKEN=` / `DISCORD_TOKEN_FILE=` line in the template is replaced
# when the matching variable is set and non-empty; every other line is copied
# through. A credential that is set but has no matching template line is
# appended, so it is never silently dropped.
# Globals:   DISCORD_TOKEN, DISCORD_TOKEN_FILE (read)
# Arguments: $1 - path to the template file
# Outputs:   rendered env file on stdout
# Returns:   non-zero when the template cannot be read
#######################################
_render_scrape_env() {
  local template=$1
  local line
  local wrote_token=0 wrote_token_file=0

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    case "$line" in
      DISCORD_TOKEN=*)
        if [[ -n "${DISCORD_TOKEN:-}" ]]; then
          printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
          wrote_token=1
        else
          printf '%s\n' "$line"
        fi
        ;;
      DISCORD_TOKEN_FILE=*)
        if [[ -n "${DISCORD_TOKEN_FILE:-}" ]]; then
          printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
          wrote_token_file=1
        else
          printf '%s\n' "$line"
        fi
        ;;
      *)
        printf '%s\n' "$line"
        ;;
    esac
  done <"$template" || return 1

  if [[ -n "${DISCORD_TOKEN:-}" && "$wrote_token" -eq 0 ]]; then
    printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
  fi
  if [[ -n "${DISCORD_TOKEN_FILE:-}" && "$wrote_token_file" -eq 0 ]]; then
    printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
  fi
  return 0
}

#######################################
# Parse CLI options and write the env file from the example template.
# Globals:   ENV_FILE (read; overwritten by --env-file), EXAMPLE_FILE (read),
#            DISCORD_TOKEN / DISCORD_TOKEN_FILE (read from the environment)
# Arguments: [--env-file PATH] [--force] [--help|-h]
# Outputs:   status messages on stdout, errors on stderr
# Returns:   exits 0 when the env file already exists and --force is absent;
#            exits 1 (via die) on bad options, missing inputs or write errors
#######################################
main() {
  local force=0

  while (($#)); do
    case "$1" in
      --env-file)
        [[ $# -ge 2 && -n "${2:-}" ]] || die "Missing value for --env-file."
        ENV_FILE=$2
        shift 2
        ;;
      --force)
        force=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  [[ -f "$EXAMPLE_FILE" ]] || die "Missing example env file: $EXAMPLE_FILE"

  if [[ -f "$ENV_FILE" && "$force" -eq 0 ]]; then
    printf 'scrape.env already exists at %s (use --force to overwrite).\n' "$ENV_FILE"
    exit 0
  fi

  if [[ -z "${DISCORD_TOKEN:-}" && -z "${DISCORD_TOKEN_FILE:-}" ]]; then
    die "Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in the environment, then rerun this script."
  fi

  if [[ -n "${DISCORD_TOKEN_FILE:-}" && ! -f "$DISCORD_TOKEN_FILE" ]]; then
    die "DISCORD_TOKEN_FILE does not exist: $DISCORD_TOKEN_FILE"
  fi

  # Stage the file next to its destination so the final mv is a same-filesystem
  # rename rather than a copy from $TMPDIR.
  local env_dir tmp_file
  env_dir=$(dirname -- "$ENV_FILE")
  [[ -d "$env_dir" ]] || die "Directory for env file does not exist: $env_dir"
  tmp_file=$(mktemp "$env_dir/.scrape.env.XXXXXX") \
    || die "Could not create a temporary file in $env_dir"

  # The staged file contains the token: remove it on every exit path.
  # The path is expanded now because tmp_file is local and is out of scope
  # by the time an EXIT trap runs.
  # shellcheck disable=SC2064
  trap "rm -f -- $(printf '%q' "$tmp_file")" EXIT

  _render_scrape_env "$EXAMPLE_FILE" >"$tmp_file" \
    || die "Could not render $EXAMPLE_FILE"
  # Restrict permissions before the file appears under its final name.
  chmod 600 "$tmp_file" || die "Could not set mode 600 on $tmp_file"
  mv -f -- "$tmp_file" "$ENV_FILE" || die "Could not write $ENV_FILE"
  trap - EXIT

  printf 'Created %s (mode 600).\n' "$ENV_FILE"
  printf 'Next: ./scripts/verify-documents-archives.sh && ./scripts/run-discord-scrape-host.sh preflight\n'
}
|
||||
|
||||
# Script entry point: forward every command-line argument to main unchanged.
main "$@"
|
||||
80
scripts/tests/verify-documents-auth-smoke.sh
Executable file
80
scripts/tests/verify-documents-auth-smoke.sh
Executable file
|
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env bash
#
# Smoke test for scripts/verify-documents-archives.sh and
# scripts/setup-scrape-auth.sh.
#
# Builds a throwaway archive tree and config in a temp directory, then checks:
#   1. verify fails while an enabled target's output_dir is missing,
#   2. verify passes once every enabled target has a seeded JSON export,
#   3. setup-scrape-auth writes DISCORD_TOKEN with file mode 600,
#   4. setup-scrape-auth --force writes DISCORD_TOKEN_FILE.
# setup-scrape-auth reads the repository's scrape.env.example as its template.

set -Eeuo pipefail

REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-verify-auth-smoke.XXXXXX")
VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
SETUP_SCRIPT="$REPO_ROOT/scripts/setup-scrape-auth.sh"

cleanup() {
  rm -rf -- "$TMP_DIR"
}
trap cleanup EXIT

# Print a failure message on stderr and abort the smoke test.
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the octal permission bits of $1; tries GNU stat, then BSD stat.
file_mode() {
  stat -c '%a' "$1" 2>/dev/null || stat -f '%Lp' "$1"
}

mkdir -p "$TMP_DIR/good-server" "$TMP_DIR/bad-server"
printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/good-server/Guild - general [111111111111111111].json"

# good-server: enabled and seeded. bad-server: disabled, so its empty folder
# must not count as a failure. missing-server: enabled, folder not created yet.
cat >"$TMP_DIR/config.json" <<JSON
{
  "archive_root": "$TMP_DIR",
  "targets": [
    {
      "name": "good-server",
      "kind": "guild",
      "output_dir": "$TMP_DIR/good-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    },
    {
      "name": "bad-server",
      "enabled": false,
      "kind": "guild",
      "output_dir": "$TMP_DIR/bad-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    },
    {
      "name": "missing-server",
      "kind": "guild",
      "output_dir": "$TMP_DIR/missing-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    }
  ]
}
JSON

# Expected failure: silence both streams so the anticipated ERROR line does
# not look like a real problem in the test log.
if "$VERIFY_SCRIPT" --config "$TMP_DIR/config.json" >/dev/null 2>&1; then
  fail "expected verify to fail when enabled target dir is missing"
fi

mkdir -p "$TMP_DIR/missing-server"
printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/missing-server/Guild - general [222222222222222222].json"
"$VERIFY_SCRIPT" --config "$TMP_DIR/config.json" >/dev/null

ENV_OUT="$TMP_DIR/scrape.env"

# Each run happens in a subshell with the other credential unset, so values
# exported in the caller's environment cannot change the outcome.
(
  unset DISCORD_TOKEN_FILE
  DISCORD_TOKEN=smoke-token "$SETUP_SCRIPT" --env-file "$ENV_OUT"
)

# -F -x: compare the whole line literally instead of as a regex.
grep -Fxq 'DISCORD_TOKEN=smoke-token' "$ENV_OUT" \
  || fail "expected setup-scrape-auth to write DISCORD_TOKEN"
[[ "$(file_mode "$ENV_OUT")" == "600" ]] \
  || fail "expected scrape.env mode 600"

token_file="$TMP_DIR/token.txt"
printf 'file-token\n' >"$token_file"
(
  unset DISCORD_TOKEN
  DISCORD_TOKEN_FILE="$token_file" "$SETUP_SCRIPT" --env-file "$ENV_OUT" --force
)

# The temp path contains '.', which a regex match would treat as a wildcard.
grep -Fxq "DISCORD_TOKEN_FILE=$token_file" "$ENV_OUT" \
  || fail "expected setup-scrape-auth to write DISCORD_TOKEN_FILE"

echo "verify-documents-auth-smoke: ok"
|
||||
129
scripts/verify-documents-archives.sh
Executable file
129
scripts/verify-documents-archives.sh
Executable file
|
|
@ -0,0 +1,129 @@
|
|||
#!/usr/bin/env bash
#
# verify-documents-archives.sh: check that the targets in the scrape config
# map to archive folders on disk.
#
# Environment:
#   DCE_REPO_ROOT       repository root (default: parent of this script's directory)
#   DCE_PRIMARY_CONFIG  config file (default: $REPO_ROOT/config/scrape-targets.json)

set -Eeuo pipefail

# Physical directory that holds this script.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(cd "$SCRIPT_DIR" && pwd -P)

# A non-empty DCE_REPO_ROOT wins; otherwise the repo root is one level up.
if [[ -n "${DCE_REPO_ROOT:-}" ]]; then
  REPO_ROOT=$DCE_REPO_ROOT
else
  REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd -P)
fi

# CONFIG_PATH may be replaced later by the --config option.
CONFIG_PATH=${DCE_PRIMARY_CONFIG:-"$REPO_ROOT/config/scrape-targets.json"}
|
||||
|
||||
#######################################
# Print the command-line help text.
# The exit-status sentence lists every condition that main counts as a
# failure, including exports without a channel ID in the file name.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  cat <<EOF
Usage:
$(basename "$0") [--config PATH]

Verify enabled scrape targets against on-disk ~/Documents archive folders.
Reports JSON export counts, archive-seeded channel IDs, and channel-map coverage.
Exits non-zero when an enabled target's output_dir is missing, has zero JSON exports,
or has no exports whose file name ends in a [channel-id].json suffix.
EOF
}
|
||||
|
||||
#######################################
# Report a fatal error and terminate the script.
# Arguments: message words (joined into one line)
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
die() {
  local message="$*"
  {
    printf 'ERROR: %s\n' "$message"
  } 1>&2
  exit 1
}
|
||||
|
||||
#######################################
# Abort unless the named command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Outputs:   error message on stderr (via die) when the command is absent
# Returns:   0 when found; otherwise exits through die
#######################################
require_command() {
  local tool=$1

  if ! command -v "$tool" >/dev/null 2>&1; then
    die "Required command '$tool' is missing."
  fi
}
|
||||
|
||||
#######################################
# Count the *.json files under a directory, ignoring anything inside a
# .dce-meta folder.
# Entries are read NUL-delimited so a file name containing a newline is
# counted once. find diagnostics are discarded and find's exit status is not
# propagated, so an unreadable or missing directory yields a count instead of
# aborting the caller.
# Arguments: $1 - directory to scan
# Outputs:   the count as a decimal number on stdout
# Returns:   0
#######################################
count_archive_json() {
  local output_dir=$1
  local total=0 _entry

  while IFS= read -r -d '' _entry; do
    total=$((total + 1))
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null)

  printf '%s\n' "$total"
}
|
||||
|
||||
#######################################
# Count the distinct channel IDs found in export file names under a
# directory. A file contributes an ID when its name ends in
# "[<16-22 digits>].json"; files inside a .dce-meta folder are ignored.
# Arguments: $1 - directory to scan
# Outputs:   the count as a decimal number on stdout; "0" when the directory
#            does not exist, so callers can always use the value in arithmetic
# Returns:   0 when the directory is missing; otherwise the pipeline status
#######################################
count_seeded_channel_ids() {
  local output_dir=$1
  local file_path file_name
  # Kept in a variable so the brackets and backslashes reach =~ unchanged.
  local -r id_suffix_re='\[([0-9]{16,22})\]\.json$'

  if [[ ! -d "$output_dir" ]]; then
    printf '0\n'
    return 0
  fi

  while IFS= read -r -d '' file_path; do
    file_name=${file_path##*/}
    if [[ "$file_name" =~ $id_suffix_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null) \
    | LC_ALL=C sort -u | wc -l | tr -d ' '
}
|
||||
|
||||
#######################################
# Report how many top-level keys a channel-map JSON file has.
# Arguments: $1 - path to the map file
# Outputs:   "0" (no trailing newline) when the file does not exist,
#            otherwise the key count printed by jq
# Returns:   0 when the file is missing; otherwise jq's exit status
#######################################
count_channel_map_entries() {
  local map_file=$1

  if [[ ! -f "$map_file" ]]; then
    printf '0'
    return 0
  fi

  jq -r 'keys | length' "$map_file"
}
|
||||
|
||||
# Print one row of the report table; takes the six column values in order.
_print_target_row() {
  printf '%-28s %-40s %8s %8s %8s %s\n' "$@"
}

#######################################
# Parse CLI options, then print one table row per configured target and fail
# when any enabled target lacks a usable archive.
# The target list is read with an explicit status check before the loop, so a
# config without a targets array is reported as an error instead of passing
# with zero rows.
# Globals:   CONFIG_PATH (read; overwritten by --config)
# Arguments: [--config PATH] [--help|-h]
# Outputs:   report table on stdout; errors and warnings on stderr
# Returns:   0 when no enabled target failed; exits 1 (via die) on bad
#            options, an unreadable config, or any failed enabled target
#######################################
main() {
  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  require_command jq
  [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"

  local archive_root targets_json
  local failures=0 checked=0
  archive_root=$(jq -r '.archive_root // empty' "$CONFIG_PATH") \
    || die "Could not parse config: $CONFIG_PATH"
  [[ -n "$archive_root" ]] || die "Config is missing archive_root."

  # Validate and load the target list up front: a jq failure inside a process
  # substitution would go unnoticed and leave the loop with nothing to check.
  jq -e '.targets | type == "array"' "$CONFIG_PATH" >/dev/null 2>&1 \
    || die "Config is missing a targets array: $CONFIG_PATH"
  targets_json=$(jq -c '.targets[]' "$CONFIG_PATH") \
    || die "Could not read targets from config: $CONFIG_PATH"

  printf 'Archive root: %s\n\n' "$archive_root"
  _print_target_row "TARGET" "OUTPUT_DIR" "JSON" "SEEDED" "MAP" "STATUS"
  _print_target_row "------" "----------" "----" "------" "-----" "------"

  local target_json name output_dir enabled
  local json_count seeded_count map_count map_file status
  while IFS= read -r target_json; do
    # An empty targets array reaches the loop as a single empty line.
    [[ -n "$target_json" ]] || continue

    name=$(jq -r '.name' <<<"$target_json") \
      || die "Malformed target entry: $target_json"
    # A missing or null output_dir becomes empty rather than the text "null".
    output_dir=$(jq -r '.output_dir // empty' <<<"$target_json") \
      || die "Malformed target entry: $target_json"
    enabled=$(jq -r 'if has("enabled") then .enabled else true end' <<<"$target_json") \
      || die "Malformed target entry: $target_json"

    if [[ "$enabled" == "false" ]]; then
      _print_target_row "$name" "${output_dir:--}" "-" "-" "-" "disabled"
      continue
    fi

    checked=$((checked + 1))
    json_count=0
    seeded_count=0
    map_count=0
    status="ok"

    if [[ -z "$output_dir" ]]; then
      status="output_dir not set"
      failures=$((failures + 1))
    elif [[ ! -d "$output_dir" ]]; then
      status="missing output_dir"
      failures=$((failures + 1))
    else
      json_count=$(count_archive_json "$output_dir")
      seeded_count=$(count_seeded_channel_ids "$output_dir")
      map_file="$output_dir/.dce-meta/channel-map.json"
      map_count=$(count_channel_map_entries "$map_file")

      # Guard the arithmetic below against a helper that printed nothing.
      json_count=${json_count:-0}
      seeded_count=${seeded_count:-0}
      map_count=${map_count:-0}

      if ((json_count == 0)); then
        status="no json archives"
        failures=$((failures + 1))
      elif ((seeded_count == 0)); then
        status="no seeded channel ids"
        failures=$((failures + 1))
      elif ((map_count == 0)); then
        status="ok (map will bootstrap on first run)"
      fi
    fi

    _print_target_row "$name" "${output_dir:--}" "$json_count" "$seeded_count" "$map_count" "$status"
  done <<<"$targets_json"

  printf '\n'
  if ((failures > 0)); then
    die "$failures enabled target(s) failed archive verification."
  fi

  if ((checked == 0)); then
    # Nothing was checked, so do not claim that every target passed.
    printf 'WARNING: config has no enabled targets; nothing was verified.\n' >&2
    return 0
  fi

  printf 'All enabled targets have archive directories with seeded channel exports.\n'
}
|
||||
|
||||
# Script entry point: forward every command-line argument to main unchanged.
main "$@"
|
||||
Loading…
Reference in a new issue