mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
feat(scrape): add Documents verify and auth bootstrap helpers
Add scripts to confirm enabled targets map to seeded ~/Documents archives and to create scrape.env from exported DISCORD_TOKEN credentials. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
parent
8c14dbbf45
commit
9e55378c84
|
|
@ -59,6 +59,23 @@ export DISCORD_TOKEN="your-token-here"
|
||||||
# optional: export DISCORD_TOKEN_FILE=/path/to/token/file
|
# optional: export DISCORD_TOKEN_FILE=/path/to/token/file
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To materialize `scrape.env` from exported credentials (mode `600`, no manual editing):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export DISCORD_TOKEN="your-token-here"
|
||||||
|
./scripts/setup-scrape-auth.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2b. Verify existing ~/Documents archives
|
||||||
|
|
||||||
|
Before the first incremental run, confirm each enabled target points at the correct on-disk server folder and already has seeded channel JSON exports (the scraper appends in place and bootstraps `.dce-meta/channel-map.json` from these files):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/verify-documents-archives.sh --config config/scrape-targets.json
|
||||||
|
```
|
||||||
|
|
||||||
|
Each enabled target should show a non-zero **JSON** count and a non-zero **SEEDED** channel-ID count for its `output_dir` (for example `~/Documents/<server>/`).
|
||||||
|
|
||||||
### 3. Run Preflight Validation
|
### 3. Run Preflight Validation
|
||||||
|
|
||||||
Before installing cron, validate your setup:
|
Before installing cron, validate your setup:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
---
|
||||||
|
title: fix: Verify Documents targets and bootstrap scrape auth
|
||||||
|
type: fix
|
||||||
|
status: completed
|
||||||
|
date: 2026-05-28
|
||||||
|
origin: LFG repeat — confirm ~/Documents/** append paths and unblock auth setup
|
||||||
|
depends_on: docs/plans/2026-05-28-006-fix-documents-append-auth-plan.md
|
||||||
|
---
|
||||||
|
|
||||||
|
# fix: Verify Documents targets and bootstrap scrape auth
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Plan 006 landed append-safe scraping. This pass adds operator tooling so you can (1) verify every enabled target maps to an on-disk `~/Documents/<server>/` tree with seeded channel archives, and (2) create `scrape.env` without manual file editing when a token is already exported.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement | Files |
|
||||||
|
|----|-------------|-------|
|
||||||
|
| V1 | `verify-documents-archives.sh` reports per-target output_dir, JSON count, seeded channel IDs, channel-map coverage | `scripts/verify-documents-archives.sh`, smoke test |
|
||||||
|
| V2 | `setup-scrape-auth.sh` writes `scrape.env` from `DISCORD_TOKEN` or `DISCORD_TOKEN_FILE` (chmod 600), idempotent | `scripts/setup-scrape-auth.sh`, smoke test |
|
||||||
|
| V3 | Document verify + auth bootstrap in setup guide | `.docs/Recurring-Scrape-Setup.md` |
|
||||||
|
|
||||||
|
## Success Criteria
|
||||||
|
|
||||||
|
- Verify script runs against real `config/scrape-targets.json` and exits 0 when enabled targets have archive dirs
|
||||||
|
- Auth bootstrap creates scrape.env when token env vars are set
|
||||||
|
- Smoke tests pass
|
||||||
106
scripts/setup-scrape-auth.sh
Executable file
106
scripts/setup-scrape-auth.sh
Executable file
|
|
@ -0,0 +1,106 @@
|
||||||
|
#!/usr/bin/env bash
#
# setup-scrape-auth.sh: create scrape.env from credentials that are already
# exported in the environment (DISCORD_TOKEN and/or DISCORD_TOKEN_FILE).
#
# Environment overrides:
#   DCE_REPO_ROOT  repository root (default: parent of this script's directory)
#   DCE_ENV_FILE   env file to write (default: $REPO_ROOT/scrape.env)

set -Eeuo pipefail

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
ENV_FILE="${DCE_ENV_FILE:-$REPO_ROOT/scrape.env}"
# Template that is copied line by line into the generated env file.
EXAMPLE_FILE="$REPO_ROOT/scrape.env.example"
|
||||||
|
|
||||||
|
# Print the command-line help for this script to stdout.
usage() {
  cat <<EOF
Usage:
  $(basename "$0") [--env-file PATH] [--force]

Create scrape.env for recurring Discord scrapes.

Reads credentials from the environment (never prompts for a token in the terminal):
  DISCORD_TOKEN        Write directly into scrape.env
  DISCORD_TOKEN_FILE   Write a pointer to an existing token file

If scrape.env already exists and --force is not set, the script exits without changes.

Examples:
  export DISCORD_TOKEN="your-token"
  $(basename "$0")

  export DISCORD_TOKEN_FILE="\$HOME/.config/discord-token"
  $(basename "$0")
EOF
}
|
||||||
|
|
||||||
|
# Print "ERROR: <message>" to stderr and terminate the script with status 1.
# Arguments: the message words (joined with spaces).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# Render the env file body on stdout.
# Copies the template ($1) line by line, replacing the value of any
# DISCORD_TOKEN= / DISCORD_TOKEN_FILE= line with the exported credential.
# If a credential is set but the template has no matching line, the
# assignment is appended so the credential is never silently dropped.
# Returns non-zero when the template cannot be read.
_render_scrape_env() {
  local template=$1
  local line
  local wrote_token=0 wrote_token_file=0

  [[ -r "$template" ]] || return 1

  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    case "$line" in
      DISCORD_TOKEN=*)
        if [[ -n "${DISCORD_TOKEN:-}" ]]; then
          printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
          wrote_token=1
        else
          printf '%s\n' "$line"
        fi
        ;;
      DISCORD_TOKEN_FILE=*)
        if [[ -n "${DISCORD_TOKEN_FILE:-}" ]]; then
          printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
          wrote_token_file=1
        else
          printf '%s\n' "$line"
        fi
        ;;
      *)
        printf '%s\n' "$line"
        ;;
    esac
  done <"$template"

  if [[ -n "${DISCORD_TOKEN:-}" && "$wrote_token" -eq 0 ]]; then
    printf 'DISCORD_TOKEN=%s\n' "$DISCORD_TOKEN"
  fi
  if [[ -n "${DISCORD_TOKEN_FILE:-}" && "$wrote_token_file" -eq 0 ]]; then
    printf 'DISCORD_TOKEN_FILE=%s\n' "$DISCORD_TOKEN_FILE"
  fi
  return 0
}

# Parse options, validate the credentials found in the environment, and write
# the env file (mode 600).
# Options:  --env-file PATH  override the output path (ENV_FILE)
#           --force          overwrite an existing env file
#           --help | -h      print usage and exit 0
# Globals:  ENV_FILE (read, may be overwritten), EXAMPLE_FILE (read),
#           DISCORD_TOKEN / DISCORD_TOKEN_FILE (read)
# Exits 0 without changes when the env file exists and --force is not given;
# exits 1 (via die) on any validation or write failure.
main() {
  local force=0

  while (($#)); do
    case "$1" in
      --env-file)
        [[ $# -ge 2 ]] || die "Missing value for --env-file."
        ENV_FILE=$2
        shift 2
        ;;
      --force)
        force=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  [[ -f "$EXAMPLE_FILE" ]] || die "Missing example env file: $EXAMPLE_FILE"

  if [[ -f "$ENV_FILE" && "$force" -eq 0 ]]; then
    printf 'scrape.env already exists at %s (use --force to overwrite).\n' "$ENV_FILE"
    exit 0
  fi

  if [[ -z "${DISCORD_TOKEN:-}" && -z "${DISCORD_TOKEN_FILE:-}" ]]; then
    die "Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in the environment, then rerun this script."
  fi

  if [[ -n "${DISCORD_TOKEN_FILE:-}" && ! -f "$DISCORD_TOKEN_FILE" ]]; then
    die "DISCORD_TOKEN_FILE does not exist: $DISCORD_TOKEN_FILE"
  fi

  # Stage the file next to its destination so the final mv is a same-filesystem
  # rename and the secret never sits in a shared temp directory.
  local env_dir tmp_file
  case "$ENV_FILE" in
    */*) env_dir=${ENV_FILE%/*} ;;
    *) env_dir=. ;;
  esac
  [[ -n "$env_dir" ]] || env_dir=/

  tmp_file=$(mktemp "$env_dir/.scrape.env.XXXXXX") \
    || die "Cannot create a temporary file in $env_dir"

  # Permissions are tightened before the rename; on any failure the staged
  # copy (which may contain the token) is removed.
  if ! {
    _render_scrape_env "$EXAMPLE_FILE" >"$tmp_file" &&
      chmod 600 -- "$tmp_file" &&
      mv -f -- "$tmp_file" "$ENV_FILE"
  }; then
    rm -f -- "$tmp_file"
    die "Failed to write $ENV_FILE"
  fi

  printf 'Created %s (mode 600).\n' "$ENV_FILE"
  printf 'Next: ./scripts/verify-documents-archives.sh && ./scripts/run-discord-scrape-host.sh preflight\n'
}
|
||||||
|
|
||||||
|
# Script entry point: forward all command-line arguments to main.
main "$@"
|
||||||
80
scripts/tests/verify-documents-auth-smoke.sh
Executable file
80
scripts/tests/verify-documents-auth-smoke.sh
Executable file
|
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env bash
#
# Smoke test for scripts/verify-documents-archives.sh and
# scripts/setup-scrape-auth.sh. Builds a throwaway archive tree and config in a
# temp directory, then checks:
#   1. verify fails while an enabled target's output_dir is missing,
#   2. verify passes once every enabled target has a seeded JSON export,
#   3. setup-scrape-auth writes DISCORD_TOKEN / DISCORD_TOKEN_FILE with mode 600.

set -Eeuo pipefail

REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P)
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/dce-verify-auth-smoke.XXXXXX")

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf -- "$TMP_DIR"
}
trap cleanup EXIT

# Report a failed expectation on stderr and abort the test.
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the octal permission bits of $1 (GNU stat first, BSD/macOS stat second).
file_mode() {
  stat -c '%a' "$1" 2>/dev/null || stat -f '%Lp' "$1"
}

# Credentials inherited from the caller's shell must not influence the test.
unset DISCORD_TOKEN DISCORD_TOKEN_FILE

mkdir -p "$TMP_DIR/good-server" "$TMP_DIR/bad-server"
printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/good-server/Guild - general [111111111111111111].json"

# good-server: enabled and seeded; bad-server: disabled (must be ignored);
# missing-server: enabled but its directory does not exist yet.
cat >"$TMP_DIR/config.json" <<JSON
{
  "archive_root": "$TMP_DIR",
  "targets": [
    {
      "name": "good-server",
      "kind": "guild",
      "output_dir": "$TMP_DIR/good-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    },
    {
      "name": "bad-server",
      "enabled": false,
      "kind": "guild",
      "output_dir": "$TMP_DIR/bad-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    },
    {
      "name": "missing-server",
      "kind": "guild",
      "output_dir": "$TMP_DIR/missing-server",
      "channel_ids": [],
      "guild_ids": [],
      "guild_name_patterns": []
    }
  ]
}
JSON

# Expected failure: stderr is silenced because the error message is the point.
if "$REPO_ROOT/scripts/verify-documents-archives.sh" --config "$TMP_DIR/config.json" >/dev/null 2>&1; then
  fail "expected verify to fail when enabled target dir is missing"
fi

mkdir -p "$TMP_DIR/missing-server"
printf '{"messages":[{"id":"1"}]}\n' >"$TMP_DIR/missing-server/Guild - general [222222222222222222].json"
"$REPO_ROOT/scripts/verify-documents-archives.sh" --config "$TMP_DIR/config.json" >/dev/null

ENV_OUT="$TMP_DIR/scrape.env"
DISCORD_TOKEN=smoke-token \
  "$REPO_ROOT/scripts/setup-scrape-auth.sh" --env-file "$ENV_OUT"

# Fixed-string, whole-line matches: the values are data, not regexes.
grep -qFx -- 'DISCORD_TOKEN=smoke-token' "$ENV_OUT" \
  || fail "expected setup-scrape-auth to write DISCORD_TOKEN"
[[ "$(file_mode "$ENV_OUT")" == "600" ]] \
  || fail "expected scrape.env mode 600"

DISCORD_TOKEN_FILE="$TMP_DIR/token.txt"
printf 'file-token\n' >"$DISCORD_TOKEN_FILE"
DISCORD_TOKEN_FILE="$DISCORD_TOKEN_FILE" \
  "$REPO_ROOT/scripts/setup-scrape-auth.sh" --env-file "$ENV_OUT" --force
grep -qFx -- "DISCORD_TOKEN_FILE=$DISCORD_TOKEN_FILE" "$ENV_OUT" \
  || fail "expected setup-scrape-auth to write DISCORD_TOKEN_FILE"

echo "verify-documents-auth-smoke: ok"
|
||||||
129
scripts/verify-documents-archives.sh
Executable file
129
scripts/verify-documents-archives.sh
Executable file
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/usr/bin/env bash
#
# verify-documents-archives.sh: check every enabled scrape target in the
# config against its on-disk archive directory.
#
# Environment overrides:
#   DCE_REPO_ROOT       repository root (default: parent of this script's directory)
#   DCE_PRIMARY_CONFIG  config file (default: $REPO_ROOT/config/scrape-targets.json)

set -Eeuo pipefail

# Resolve paths relative to this script so it works from any working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
REPO_ROOT="${DCE_REPO_ROOT:-$(cd "$SCRIPT_DIR/.." && pwd -P)}"
CONFIG_PATH="${DCE_PRIMARY_CONFIG:-$REPO_ROOT/config/scrape-targets.json}"
|
||||||
|
|
||||||
|
# Print the command-line help for this script to stdout.
usage() {
  cat <<EOF
Usage:
  $(basename "$0") [--config PATH]

Verify enabled scrape targets against on-disk ~/Documents archive folders.
Reports JSON export counts, archive-seeded channel IDs, and channel-map coverage.
Exits non-zero when an enabled target's output_dir is missing or has zero JSON exports.
EOF
}
|
||||||
|
|
||||||
|
# Print "ERROR: <message>" to stderr and terminate the script with status 1.
# Arguments: the message words (joined with spaces).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# Abort via die unless the command named by $1 can be resolved by the shell.
require_command() {
  command -v "$1" >/dev/null 2>&1 || die "Required command '$1' is missing."
}
|
||||||
|
|
||||||
|
# Print the number of *.json files under directory $1, excluding anything
# inside a .dce-meta/ directory. Prints 0 when the directory cannot be read.
count_archive_json() {
  local output_dir=$1
  # Count NUL terminators rather than lines so a file name containing a
  # newline is still counted exactly once. find errors are deliberately
  # silenced: an unreadable tree simply counts as zero exports.
  find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null \
    | tr -cd '\000' | wc -c | tr -d ' '
}
|
||||||
|
|
||||||
|
# Print the number of distinct channel IDs found in export file names under
# directory $1. An ID is a run of 16-22 digits in square brackets immediately
# before the ".json" suffix; files inside .dce-meta/ are ignored.
# Prints 0 when the directory does not exist, matching the other counters.
count_seeded_channel_ids() {
  local output_dir=$1
  local file_path file_name

  [[ -d "$output_dir" ]] || { printf '0\n'; return 0; }

  while IFS= read -r -d '' file_path; do
    # Strip the directory part without forking basename for every file.
    file_name=${file_path##*/}
    if [[ "$file_name" =~ \[([0-9]{16,22})\]\.json$ ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
    fi
  done < <(find "$output_dir" -type f -name '*.json' ! -path '*/.dce-meta/*' -print0 2>/dev/null) \
    | sort -u | wc -l | tr -d ' '
}
|
||||||
|
|
||||||
|
# Print the number of top-level keys in the JSON file $1.
# Prints 0 (without a trailing newline) when the file does not exist.
# Requires jq; a jq failure (e.g. malformed JSON) propagates as the
# function's exit status.
count_channel_map_entries() {
  local map_file=$1
  [[ -f "$map_file" ]] || { printf '0'; return 0; }
  jq -r 'keys | length' "$map_file"
}
|
||||||
|
|
||||||
|
# Parse options, then print one table row per target in the config and fail
# if any enabled target is not ready for an incremental scrape.
# Options:  --config PATH  override CONFIG_PATH
#           --help | -h    print usage and exit 0
# Globals:  CONFIG_PATH (read, may be overwritten)
# A target counts as failed when its output_dir is missing, holds no JSON
# exports, or holds no export whose file name carries a channel ID.
# Exits 1 (via die) on any failed target or unreadable config.
main() {
  while (($#)); do
    case "$1" in
      --config)
        [[ $# -ge 2 ]] || die "Missing value for --config."
        CONFIG_PATH=$2
        shift 2
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  require_command jq
  [[ -f "$CONFIG_PATH" ]] || die "Missing config: $CONFIG_PATH"

  local archive_root failures=0
  archive_root=$(jq -r '.archive_root // empty' "$CONFIG_PATH")
  [[ -n "$archive_root" ]] || die "Config is missing archive_root."

  # Read the targets up front with a checked command substitution. Feeding the
  # loop from a process substitution would hide a jq failure (for example a
  # missing or non-array "targets"), and the script would then report success
  # without having verified anything.
  local targets_json
  targets_json=$(jq -c '.targets[]' "$CONFIG_PATH") \
    || die "Config has no readable targets array: $CONFIG_PATH"

  printf 'Archive root: %s\n\n' "$archive_root"
  printf '%-28s %-40s %8s %8s %8s %s\n' "TARGET" "OUTPUT_DIR" "JSON" "SEEDED" "MAP" "STATUS"
  printf '%-28s %-40s %8s %8s %8s %s\n' "------" "----------" "----" "------" "-----" "------"

  local target_json name output_dir enabled json_count seeded_count map_count map_file status
  while IFS= read -r target_json; do
    # A here-string always yields at least one (possibly empty) line.
    [[ -n "$target_json" ]] || continue

    name=$(jq -r '.name' <<<"$target_json")
    output_dir=$(jq -r '.output_dir' <<<"$target_json")
    # Targets without an "enabled" key are treated as enabled.
    enabled=$(jq -r 'if has("enabled") then .enabled else true end' <<<"$target_json")

    if [[ "$enabled" == "false" ]]; then
      printf '%-28s %-40s %8s %8s %8s %s\n' "$name" "$output_dir" "-" "-" "-" "disabled"
      continue
    fi

    json_count=0
    seeded_count=0
    map_count=0
    status="ok"

    if [[ ! -d "$output_dir" ]]; then
      status="missing output_dir"
      failures=$((failures + 1))
    else
      json_count=$(count_archive_json "$output_dir")
      seeded_count=$(count_seeded_channel_ids "$output_dir")
      map_file="$output_dir/.dce-meta/channel-map.json"
      map_count=$(count_channel_map_entries "$map_file")

      if (( json_count == 0 )); then
        status="no json archives"
        failures=$((failures + 1))
      elif (( seeded_count == 0 )); then
        status="no seeded channel ids"
        failures=$((failures + 1))
      elif (( map_count == 0 )); then
        # Not a failure: an empty map is only noted in the status column.
        status="ok (map will bootstrap on first run)"
      fi
    fi

    printf '%-28s %-40s %8s %8s %8s %s\n' "$name" "$output_dir" "$json_count" "$seeded_count" "$map_count" "$status"
  done <<<"$targets_json"

  printf '\n'
  if (( failures > 0 )); then
    die "$failures enabled target(s) failed archive verification."
  fi

  printf 'All enabled targets have archive directories with seeded channel exports.\n'
}
|
||||||
|
|
||||||
|
# Script entry point: forward all command-line arguments to main.
main "$@"
|
||||||
Loading…
Reference in a new issue