fix(scrape): address residual review findings R1–R3

Use padded sort for last_message_id cursor, surface guild channel
discovery errors, and document DCE_ARCHIVE_ROOT in compose.
This commit is contained in:
Boden 2026-05-29 13:56:29 -05:00
parent c713ee5e64
commit 25e1a7e600
3 changed files with 65 additions and 4 deletions

View file

@ -17,5 +17,6 @@ services:
volumes: volumes:
- ./config:/config:ro,z - ./config:/config:ro,z
- ./scripts/run-discord-scrape.sh:/opt/dce-scheduler/run-discord-scrape.sh:ro,z - ./scripts/run-discord-scrape.sh:/opt/dce-scheduler/run-discord-scrape.sh:ro,z
# Host path must match archive_root in config/scrape-targets.json; set DCE_ARCHIVE_ROOT to override the default on other machines.
- ${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:z - ${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:${DCE_ARCHIVE_ROOT:-/home/brunner56/Documents}:z
command: ["help"] command: ["help"]

View file

@ -0,0 +1,47 @@
---
title: "fix: Close recurring scrape residual review findings"
type: fix
status: completed
date: 2026-05-29
origin: LFG — residual review findings on feat/recurring-cli-scrape (R1–R3)
---
# fix: Close recurring scrape residual review findings
## Summary
Address manual review residuals from plan 011: correct incremental cursor selection, improve guild channel discovery errors, and document portable archive mount configuration.
## Requirements
| ID | Requirement |
|----|-------------|
| R1 | `last_message_id` picks highest snowflake reliably across mixed digit lengths |
| R2 | `load_guild_channel_cache` surfaces CLI failure output like `load_guild_cache` |
| R3 | `docker-compose.yml` documents required `DCE_ARCHIVE_ROOT` override |
| R4 | Existing smoke tests pass after changes |
## Implementation Units
### U1. Fix message cursor (`last_message_id`)
**Files:** `scripts/run-discord-scrape.sh`
**Approach:** Replace `max_by(.id)` with a `sort_by` on the ID converted to a string and left-padded with zeros to 22 characters, followed by `last | .id`. Padding makes string comparison agree with numeric order even when snowflake IDs differ in digit count; an unpadded string comparison is lexicographic and would rank `"999"` above `"1000"`.
### U2. Guild channel cache diagnostics
**Files:** `scripts/run-discord-scrape.sh`
**Approach:** Capture `channels` CLI stderr/stdout; `die` with context on failure.
### U3. Compose portability note
**Files:** `docker-compose.yml`
**Approach:** Comment above `DCE_ARCHIVE_ROOT` volume line.
## Verification
- `scripts/tests/run-discord-scrape-smoke.sh`
- `bash -n scripts/run-discord-scrape.sh`

View file

@ -374,7 +374,16 @@ last_message_id() {
[[ -f "$export_path" ]] || return 0 [[ -f "$export_path" ]] || return 0
jq -r ' jq -r '
(.messages // []) (.messages // [])
| if length == 0 then empty else (max_by(.id) | .id) end | if length == 0 then empty else (
sort_by(
.id as $id
| ($id | tostring) as $s
| (22 - ($s | length)) as $pad
| if $pad > 0 then ("0" * $pad) + $s else $s end
)
| last
| .id
) end
' "$export_path" ' "$export_path"
} }
@ -508,11 +517,15 @@ load_guild_channel_cache() {
local cache_file="$CACHE_ROOT/channels_${guild_id}_${include_voice}_${include_threads}.txt" local cache_file="$CACHE_ROOT/channels_${guild_id}_${include_voice}_${include_threads}.txt"
if [[ ! -f "$cache_file" ]]; then if [[ ! -f "$cache_file" ]]; then
"$CLI_BIN" channels \ local output
if ! output=$("$CLI_BIN" channels \
--guild "$guild_id" \ --guild "$guild_id" \
--include-vc "$include_voice" \ --include-vc "$include_voice" \
--include-threads "$include_threads" \ --include-threads "$include_threads" 2>&1); then
| parse_channel_listing >"$cache_file" die "Channel discovery failed for guild $guild_id. CLI output: $output"
fi
printf '%s\n' "$output" | parse_channel_listing >"$cache_file"
fi fi
cat "$cache_file" cat "$cache_file"