mirror of
https://github.com/Tyrrrz/DiscordChatExporter.git
synced 2026-06-10 00:02:37 -06:00
feat(scrape): lock gate and salvage-before on documents scrape
Add archive-root lock preflight and --salvage-before-scrape to run-documents-scrape.sh so direct operator invocations match validation safety and KotOR catch-up workflow.
This commit is contained in:
parent
363749231d
commit
b883943e3a
|
|
@ -0,0 +1,50 @@
|
||||||
|
---
|
||||||
|
title: "feat: Documents scrape lock gate and salvage-before"
|
||||||
|
type: feat
|
||||||
|
status: active
|
||||||
|
date: 2026-06-04
|
||||||
|
origin: /lfg — validation/handoff expose salvage; direct run-documents-scrape.sh still lacks lock gate and --salvage-before-scrape
|
||||||
|
---
|
||||||
|
|
||||||
|
# feat: Documents scrape lock gate and salvage-before
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Add a scrape-lock preflight and a `--salvage-before-scrape` option to `run-documents-scrape.sh`, so that direct document scrapes get the same lock safety as operator validation and support the KotOR catch-up workflow in a single command.
|
||||||
|
|
||||||
|
## Problem Frame
|
||||||
|
|
||||||
|
Operators often invoke documents scrape directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./scripts/run-documents-scrape.sh --target KotOR_discord_msgs --channel 221726893064454144
|
||||||
|
```
|
||||||
|
|
||||||
|
This bypasses the lock gate in `run-operator-validation.sh`. Running salvage before a scrape also takes two separate commands today.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| ID | Requirement |
|
||||||
|
|----|-------------|
|
||||||
|
| R1 | `run-documents-scrape.sh` checks archive-root lock before salvage or Discord scrape |
|
||||||
|
| R2 | Lock gate skipped when `DCE_SKIP_SCRAPE_LOCK=1` |
|
||||||
|
| R3 | `--salvage-before-scrape` runs salvage then preflight/scrape |
|
||||||
|
| R4 | `--salvage-only`, `--salvage-before-scrape`, and `--dry-run` are mutually exclusive |
|
||||||
|
| R5 | Smokes cover lock block and salvage-before; `run-all-smokes.sh` passes |
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. Documents scrape lock + salvage-before
|
||||||
|
|
||||||
|
**Files:** `scripts/run-documents-scrape.sh`
|
||||||
|
|
||||||
|
### U2. Smoke coverage
|
||||||
|
|
||||||
|
**Files:** `scripts/tests/documents-scrape-smoke.sh`
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### Deferred
|
||||||
|
|
||||||
|
- Operator checklist doc refresh
|
||||||
|
- Live KotOR catch-up on host
|
||||||
|
|
@ -11,13 +11,14 @@ DISCOVER_TOKEN="$REPO_ROOT/scripts/discover-discord-token.sh"
|
||||||
VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
|
VERIFY_SCRIPT="$REPO_ROOT/scripts/verify-documents-archives.sh"
|
||||||
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
VERIFY_READY="$REPO_ROOT/scripts/verify-operator-ready.sh"
|
||||||
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
|
SETUP_AUTH="$REPO_ROOT/scripts/setup-scrape-auth.sh"
|
||||||
|
LOCK_STATUS="$REPO_ROOT/scripts/scrape-lock-status.sh"
|
||||||
# shellcheck source=lib/scrape-run-plan.sh
|
# shellcheck source=lib/scrape-run-plan.sh
|
||||||
source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
|
source "$SCRIPT_DIR/lib/scrape-run-plan.sh"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
Usage:
|
Usage:
|
||||||
$(basename "$0") [--dry-run] [--salvage-only] [--target NAME] [--config PATH]
|
$(basename "$0") [--dry-run] [--salvage-only] [--salvage-before-scrape] [--target NAME] [--config PATH]
|
||||||
|
|
||||||
End-to-end Documents scrape workflow:
|
End-to-end Documents scrape workflow:
|
||||||
1. Verify enabled targets have seeded archives under ~/Documents/<server>/
|
1. Verify enabled targets have seeded archives under ~/Documents/<server>/
|
||||||
|
|
@ -28,6 +29,7 @@ End-to-end Documents scrape workflow:
|
||||||
Options:
|
Options:
|
||||||
--dry-run Verify archives only; do not call Discord
|
--dry-run Verify archives only; do not call Discord
|
||||||
--salvage-only Merge quiescent stale .dce-temp exports only (no Discord export)
|
--salvage-only Merge quiescent stale .dce-temp exports only (no Discord export)
|
||||||
|
--salvage-before-scrape Run salvage-only pass before preflight and incremental scrape
|
||||||
--target NAME Limit preflight/scrape to one configured target
|
--target NAME Limit preflight/scrape to one configured target
|
||||||
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
|
--channel ID With exactly one --target, limit scrape to channel ID (repeatable)
|
||||||
--config PATH Scrape target config (default: config/scrape-targets.json)
|
--config PATH Scrape target config (default: config/scrape-targets.json)
|
||||||
|
|
@ -39,9 +41,37 @@ die() {
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Gate the run on the scrape lock reported by the lock-status helper.
# Globals:   DCE_SKIP_SCRAPE_LOCK (read) - "1" bypasses the gate entirely
#            LOCK_STATUS (read)          - path of the lock-status helper
#            CONFIG_PATH (read)          - forwarded to the helper as --config
# Outputs:   a warning on stderr when the helper is missing or not executable
# Returns:   0 when the gate is bypassed, the helper is unavailable, or the
#            helper exits 0; otherwise terminates the script through die.
#######################################
ensure_scrape_lock_available() {
  if [[ "${DCE_SKIP_SCRAPE_LOCK:-0}" == "1" ]]; then
    return 0
  fi

  if [[ ! -x "$LOCK_STATUS" ]]; then
    # Still fail open (callers rely on that), but never skip a safety gate
    # silently: the operator should know no lock check took place.
    printf 'warning: %s is missing or not executable; scrape lock check skipped.\n' \
      "$LOCK_STATUS" >&2
    return 0
  fi

  # Any non-zero exit from the helper is reported as a held lock.
  if ! "$LOCK_STATUS" --config "$CONFIG_PATH"; then
    die "Scrape lock is held; another scrape may be running. Inspect: $LOCK_STATUS --config $CONFIG_PATH"
  fi
}
|
||||||
|
|
||||||
|
#######################################
# Run the host runner's "salvage" subcommand with the resolved config.
# Any config option present in the forwarded arguments is dropped so that
# exactly one --config (the resolved CONFIG_PATH) reaches the runner.
# Globals:   CONFIG_PATH (read), HOST_RUNNER (read)
# Arguments: "$@" - options to forward to the salvage run
# Returns:   the exit status of "$HOST_RUNNER" salvage
#######################################
run_local_salvage() {
  local -a salvage_args=(--config "$CONFIG_PATH")
  local skip_next=0 arg

  for arg in "$@"; do
    if (( skip_next )); then
      # This word is the value that followed a bare --config.
      skip_next=0
      continue
    fi
    case "$arg" in
      --config)
        skip_next=1
        ;;
      --config=*)
        # Single-word spelling: drop it too, otherwise the runner would
        # receive two competing config options.
        ;;
      *)
        salvage_args+=("$arg")
        ;;
    esac
  done

  "$HOST_RUNNER" salvage "${salvage_args[@]}"
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
local dry_run=0
|
local dry_run=0
|
||||||
local salvage_only=0
|
local salvage_only=0
|
||||||
|
local salvage_before=0
|
||||||
local target=""
|
local target=""
|
||||||
local -a passthrough=()
|
local -a passthrough=()
|
||||||
|
|
||||||
|
|
@ -55,6 +85,10 @@ main() {
|
||||||
salvage_only=1
|
salvage_only=1
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--salvage-before-scrape)
|
||||||
|
salvage_before=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
--target)
|
--target)
|
||||||
[[ $# -ge 2 ]] || die "Missing value for --target."
|
[[ $# -ge 2 ]] || die "Missing value for --target."
|
||||||
target=$2
|
target=$2
|
||||||
|
|
@ -82,6 +116,14 @@ main() {
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
local exclusive=0
|
||||||
|
(( dry_run == 1 )) && exclusive=$((exclusive + 1))
|
||||||
|
(( salvage_only == 1 )) && exclusive=$((exclusive + 1))
|
||||||
|
(( salvage_before == 1 )) && exclusive=$((exclusive + 1))
|
||||||
|
if (( exclusive > 1 )); then
|
||||||
|
die "Use only one of --dry-run, --salvage-only, or --salvage-before-scrape."
|
||||||
|
fi
|
||||||
|
|
||||||
"$VERIFY_SCRIPT" --config "$CONFIG_PATH"
|
"$VERIFY_SCRIPT" --config "$CONFIG_PATH"
|
||||||
|
|
||||||
local -a plan_targets=()
|
local -a plan_targets=()
|
||||||
|
|
@ -97,24 +139,17 @@ main() {
|
||||||
|
|
||||||
"$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
|
"$VERIFY_READY" --disk-only --config "$CONFIG_PATH"
|
||||||
|
|
||||||
|
ensure_scrape_lock_available
|
||||||
|
|
||||||
if (( salvage_only == 1 )); then
|
if (( salvage_only == 1 )); then
|
||||||
local -a salvage_args=(--config "$CONFIG_PATH")
|
run_local_salvage "${passthrough[@]}"
|
||||||
local skip_next=0 arg
|
|
||||||
for arg in "${passthrough[@]}"; do
|
|
||||||
if (( skip_next )); then
|
|
||||||
skip_next=0
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
if [[ "$arg" == "--config" ]]; then
|
|
||||||
skip_next=1
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
salvage_args+=("$arg")
|
|
||||||
done
|
|
||||||
"$HOST_RUNNER" salvage "${salvage_args[@]}"
|
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if (( salvage_before == 1 )); then
|
||||||
|
run_local_salvage "${passthrough[@]}"
|
||||||
|
fi
|
||||||
|
|
||||||
local -a container_args=("${passthrough[@]}")
|
local -a container_args=("${passthrough[@]}")
|
||||||
local has_config=0 idx=0
|
local has_config=0 idx=0
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,60 @@ grep -q 'salvage completed' "$SALVAGE_DOC_LOG" || {
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Smoke: --salvage-before-scrape must run the local salvage pass and then
# carry on into the container scrape. The run uses $FAKE_DOCKER as the docker
# binary and points FAKE_DOCKER_ARGS_LOG at $ARGS_LOG, which is emptied first
# so the 'compose' check below only sees this invocation.
SALVAGE_BEFORE_LOG="$TMP_DIR/salvage-before.log"
: >"$ARGS_LOG"

if ! DCE_MIN_FREE_MB=0 \
  DCE_SKIP_SCRAPE_LOCK=1 \
  DCE_DOCKER_BIN="$FAKE_DOCKER" \
  FAKE_DOCKER_ARGS_LOG="$ARGS_LOG" \
  DCE_ENV_FILE="$TMP_DIR/scrape.env" \
  "$REPO_ROOT/scripts/run-documents-scrape.sh" \
  --salvage-before-scrape --config "$TMP_DIR/config.json" --target demo >"$SALVAGE_BEFORE_LOG" 2>&1; then
  echo "salvage-before-scrape documents scrape failed" >&2
  cat "$SALVAGE_BEFORE_LOG" >&2
  exit 1
fi

# The salvage pass has to show up in the captured output ...
if ! grep -q 'salvage completed' "$SALVAGE_BEFORE_LOG"; then
  echo "expected --salvage-before-scrape to run local salvage first" >&2
  cat "$SALVAGE_BEFORE_LOG" >&2
  exit 1
fi

# ... and the docker argument log must record a compose call afterwards.
if ! grep -q 'compose' "$ARGS_LOG"; then
  echo "expected --salvage-before-scrape to continue into container scrape" >&2
  cat "$ARGS_LOG" >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# Smoke: with the archive lock held by another process, a direct documents
# scrape must refuse to run. Skipped when flock(1) is not installed.
if command -v flock >/dev/null 2>&1; then
  LOCK_FILE="$TMP_DIR/.dce-scrape.lock"
  HOLDER_PID=""
  (
    exec {lock_fd}>>"$LOCK_FILE"
    flock -n "$lock_fd" || exit 1
    # exec, so that $HOLDER_PID is the process owning the lock descriptor.
    # With a plain `sleep 120`, killing the subshell would orphan the sleep,
    # which inherits the descriptor and keeps the lock for up to two minutes.
    exec sleep 120
  ) &
  HOLDER_PID=$!
  # Give the background holder a moment to take the lock.
  sleep 0.2

  # The run is expected to fail; capture its status instead of aborting.
  set +e
  blocked_output=$(
    DCE_MIN_FREE_MB=0 \
      "$REPO_ROOT/scripts/run-documents-scrape.sh" \
      --salvage-only --config "$TMP_DIR/config.json" --target demo 2>&1
  )
  blocked_status=$?
  set -e

  # Release the lock; errors are ignored because the holder may already be gone.
  kill "$HOLDER_PID" 2>/dev/null || true
  wait "$HOLDER_PID" 2>/dev/null || true

  if [[ "$blocked_status" -eq 0 ]] || ! grep -q 'Scrape lock is held' <<<"$blocked_output"; then
    echo "expected documents scrape to fail when archive lock held" >&2
    printf '%s\n' "$blocked_output" >&2
    exit 1
  fi
fi
|
||||||
|
|
||||||
DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \
|
DCE_MIN_FREE_MB=0 DCE_CONFIG_FILE="$TMP_DIR/config.json" \
|
||||||
"$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \
|
"$REPO_ROOT/scripts/verify-operator-ready.sh" --disk-only --config "$TMP_DIR/config.json" \
|
||||||
| grep -q 'disk-only: ok'
|
| grep -q 'disk-only: ok'
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue