DiscordChatExporter/scripts/tests/documents-scrape-smoke.sh
Copilot 8468e34e37 feat(scrape): add salvage-only mode for stale temp exports
Expose run-discord-scrape.sh salvage and run-documents-scrape.sh
--salvage-only so operators can merge quiescent .dce-temp partials without
re-downloading from Discord after stopping a long or OOM-aborted run.
2026-06-03 06:48:39 -05:00

136 lines
4.2 KiB
Bash
Executable file

#!/usr/bin/env bash
# Smoke test for the documents scrape scripts: strict-mode setup, path
# discovery, and a scratch directory that is removed on exit.
set -o errtrace -o errexit -o nounset -o pipefail

# Physical path of the directory two levels above this script's directory.
REPO_ROOT="$(
  cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd -P
)"

# Per-run scratch directory (under $TMPDIR when set, otherwise /tmp).
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/dce-documents-scrape-smoke.XXXXXX")"

# Delete the scratch directory; registered below to run on every exit path.
cleanup() {
  rm -rf "$TMP_DIR"
}

trap cleanup EXIT
# --- Host wrapper run inside a throwaway repo ---------------------------------
# Copy only the host wrapper and its run-plan library into a fake repo root so
# the wrapper runs against files under $TMP_DIR.
FAKE_REPO="$TMP_DIR/fake-repo"
mkdir -p "$FAKE_REPO/scripts/lib"
cp "$REPO_ROOT/scripts/run-discord-scrape-host.sh" "$FAKE_REPO/scripts/"
cp "$REPO_ROOT/scripts/lib/scrape-run-plan.sh" "$FAKE_REPO/scripts/lib/"
chmod +x "$FAKE_REPO/scripts/run-discord-scrape-host.sh"

COMPOSE_FILE="$TMP_DIR/docker-compose.yml"
FAKE_DOCKER="$TMP_DIR/docker"
# Passed to the wrapper as FAKE_DOCKER_CALL_COUNT; the stub below never writes it.
CALL_COUNT="$TMP_DIR/call-count"

# Minimal compose fixture. The service must be nested under `services` for the
# document to describe a compose service at all.
cat >"$COMPOSE_FILE" <<'EOF'
services:
  discord-scraper:
    image: fake
EOF

# Stand-in for the docker binary: ignores its arguments and reports success.
cat >"$FAKE_DOCKER" <<'EOF'
#!/usr/bin/env bash
printf 'run succeeded\n'
EOF
chmod +x "$FAKE_DOCKER"

# NOTE(review): presumably the wrapper falls back to this token file when the
# env file is absent — confirm against run-discord-scrape-host.sh.
printf 'discovered-token\n' >"$FAKE_REPO/.discord-token"

# The env file handed to the wrapper must not exist; say so explicitly instead
# of letting `set -e` abort without a message.
MISSING_ENV="$TMP_DIR/missing-scrape.env"
[[ ! -e "$MISSING_ENV" ]] || {
  echo "expected $MISSING_ENV to be absent before the host wrapper run" >&2
  exit 1
}

# The wrapper must exit 0; `set -e` aborts the smoke test otherwise.
DCE_REPO_ROOT="$FAKE_REPO" \
DCE_SKIP_SCRAPE_LOCK=1 \
DCE_DOCKER_BIN="$FAKE_DOCKER" \
DCE_ENV_FILE="$MISSING_ENV" \
DCE_COMPOSE_FILE="$COMPOSE_FILE" \
FAKE_DOCKER_CALL_COUNT="$CALL_COUNT" \
  "$FAKE_REPO/scripts/run-discord-scrape-host.sh" scrape --target demo >/dev/null
# --- Fixture: one exported channel file plus a config that points at it -------
ARCHIVE="$TMP_DIR/server"
mkdir -p "$ARCHIVE"

# Single-message export for channel 111111111111111111, written as one line.
printf '%s\n' \
  '{"guild":{"id":"1","name":"Guild"},"channel":{"id":"111111111111111111","name":"general"},"messages":[{"id":"1","timestamp":"2020-01-01T00:00:00"}]}' \
  >"$ARCHIVE/Guild - general [111111111111111111].json"

# Unquoted heredoc delimiter: $TMP_DIR and $ARCHIVE are expanded into the JSON.
cat <<JSON >"$TMP_DIR/config.json"
{
"archive_root": "$TMP_DIR",
"targets": [
{
"name": "demo",
"kind": "guild",
"output_dir": "$ARCHIVE",
"channel_ids": ["111111111111111111"],
"guild_ids": [],
"guild_name_patterns": []
}
]
}
JSON
# --- prove-incremental-append against the stub docker -------------------------
PROVE="$REPO_ROOT/scripts/prove-incremental-append.sh"

# The docker stub only prints a line and writes no files, so this invocation is
# required to exit 0.
# NOTE(review): the earlier comment here said the prove script "should fail
# when host would shrink archives"; that failure path is not exercised by this
# call — confirm whether a separate negative case is wanted.
DCE_REPO_ROOT="$REPO_ROOT" \
DCE_SKIP_SCRAPE_LOCK=1 \
DCE_DOCKER_BIN="$FAKE_DOCKER" \
DCE_ENV_FILE="$MISSING_ENV" \
DCE_COMPOSE_FILE="$COMPOSE_FILE" \
DISCORD_TOKEN=dummy \
  "$PROVE" --config "$TMP_DIR/config.json" --target demo >/dev/null || {
  # stdout is discarded above; name the failing step so the abort is traceable.
  echo "prove-incremental-append failed against the stub docker" >&2
  exit 1
}
# --- run-documents-scrape.sh --dry-run ----------------------------------------
# Output is captured to a log under $TMP_DIR, which the EXIT trap deletes, so
# every failure path dumps the log to stderr before exiting.

# Dry run over the whole config must print the run plan banner.
DOC_OUT="$TMP_DIR/documents-dry-run.log"
"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" >"$DOC_OUT" 2>&1 || {
  echo "documents scrape dry-run failed" >&2
  cat "$DOC_OUT" >&2
  exit 1
}
grep -q 'Documents scrape run plan' "$DOC_OUT" || {
  echo "expected Documents scrape run plan in dry-run output" >&2
  cat "$DOC_OUT" >&2
  exit 1
}

# Dry run with --target and --channel must be accepted and print the same banner.
CHANNEL_DRY="$TMP_DIR/channel-dry-run.log"
"$REPO_ROOT/scripts/run-documents-scrape.sh" --dry-run --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >"$CHANNEL_DRY" 2>&1 || {
  echo "documents scrape dry-run with --channel failed" >&2
  cat "$CHANNEL_DRY" >&2
  exit 1
}
grep -q 'Documents scrape run plan' "$CHANNEL_DRY" || {
  echo "expected dry-run to accept --channel passthrough" >&2
  cat "$CHANNEL_DRY" >&2
  exit 1
}
# --- --channel must show up in the arguments handed to docker -----------------
ARGS_LOG="$TMP_DIR/compose-args.log"

# Replace the docker stub with one that appends its full argument string to
# $FAKE_DOCKER_ARGS_LOG (required to be set) before reporting success.
cat <<'EOF' >"$FAKE_DOCKER"
#!/usr/bin/env bash
printf '%s\n' "$*" >>"${FAKE_DOCKER_ARGS_LOG:?}"
printf 'run succeeded\n'
EOF
chmod +x "$FAKE_DOCKER"

# Env file carrying a placeholder token for the non-dry run.
printf '%s\n' 'DISCORD_TOKEN=dummy-token' >"$TMP_DIR/scrape.env"

DCE_ENV_FILE="$TMP_DIR/scrape.env" \
FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \
DCE_DOCKER_BIN="$FAKE_DOCKER" \
DCE_SKIP_SCRAPE_LOCK=1 \
DCE_MIN_FREE_MB=0 \
  "$REPO_ROOT/scripts/run-documents-scrape.sh" --config "$TMP_DIR/config.json" --target demo --channel 111111111111111111 >/dev/null

# The channel id has to appear somewhere in the recorded docker arguments.
if ! grep -q '111111111111111111' "$ARGS_LOG"; then
  echo "expected --channel to reach container compose invocation" >&2
  cat "$ARGS_LOG" >&2
  exit 1
fi
# --- run-documents-scrape.sh --salvage-only -----------------------------------
# Copy run-discord-scrape.sh into the fake repo's scripts directory and make it
# executable.
cp "$REPO_ROOT/scripts/run-discord-scrape.sh" "$FAKE_REPO/scripts/"
chmod +x "$FAKE_REPO/scripts/run-discord-scrape.sh"

SALVAGE_DOC_LOG="$TMP_DIR/salvage-documents.log"

# The salvage-only run must exit 0; on failure, show everything it printed.
if ! DCE_MIN_FREE_MB=0 DCE_SKIP_SCRAPE_LOCK=1 \
  "$REPO_ROOT/scripts/run-documents-scrape.sh" --salvage-only --config "$TMP_DIR/config.json" --target demo >"$SALVAGE_DOC_LOG" 2>&1; then
  echo "salvage-only documents scrape failed" >&2
  cat "$SALVAGE_DOC_LOG" >&2
  exit 1
fi

# It must also report that the salvage step completed.
if ! grep -q 'salvage completed' "$SALVAGE_DOC_LOG"; then
  echo "expected --salvage-only to run local salvage" >&2
  cat "$SALVAGE_DOC_LOG" >&2
  exit 1
fi
# --- verify-operator-ready.sh --disk-only -------------------------------------
# Capture stdout to a file and grep the file afterwards. Piping straight into
# `grep -q` under `set -o pipefail` can fail spuriously: grep exits on the first
# match, and a producer that is still writing then dies of SIGPIPE, which
# pipefail reports as a pipeline failure.
VERIFY_LOG="$TMP_DIR/verify-disk-only.log"
DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \
  "$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \
  >"$VERIFY_LOG" || {
  echo "verify-operator-ready --disk-only failed" >&2
  cat "$VERIFY_LOG" >&2
  exit 1
}
grep -q 'disk-only: ok' "$VERIFY_LOG" || {
  echo "expected 'disk-only: ok' in verify-operator-ready output" >&2
  cat "$VERIFY_LOG" >&2
  exit 1
}

# Reached only when every check above passed.
echo "documents-scrape-smoke: ok"