From 8ca55f299b5f4fd273f0a177821416199c852356 Mon Sep 17 00:00:00 2001 From: Copilot Date: Wed, 3 Jun 2026 09:55:33 -0500 Subject: [PATCH] feat(scrape): per-target container_memory in scrape config Single --target runs apply optional container_memory from scrape-targets.json when global DCE_CONTAINER_MEMORY is unset. KotOR_discord_msgs defaults to 8g; scrape.env still overrides. --- config/scrape-targets.json | 1 + ...7-feat-per-target-container-memory-plan.md | 54 +++++++++++++++++++ docs/recurring-scrape-merge-readiness.md | 2 + docs/recurring-scrape-operator-checklist.md | 5 +- scrape.env.example | 3 ++ scripts/lib/scrape-run-plan.sh | 38 ++++++++++++- scripts/run-discord-scrape-host.sh | 37 +++++++++---- .../tests/run-discord-scrape-host-smoke.sh | 50 +++++++++++++++++ 8 files changed, 178 insertions(+), 12 deletions(-) create mode 100644 docs/plans/2026-06-04-067-feat-per-target-container-memory-plan.md diff --git a/config/scrape-targets.json b/config/scrape-targets.json index 444e75a4..dcfeb525 100644 --- a/config/scrape-targets.json +++ b/config/scrape-targets.json @@ -81,6 +81,7 @@ "name": "KotOR_discord_msgs", "kind": "guild", "output_dir": "/home/brunner56/Documents/KotOR_discord_msgs", + "container_memory": "8g", "guild_ids": [], "channel_ids": [], "guild_name_patterns": ["KotOR_discord_msgs", "KotOR"] diff --git a/docs/plans/2026-06-04-067-feat-per-target-container-memory-plan.md b/docs/plans/2026-06-04-067-feat-per-target-container-memory-plan.md new file mode 100644 index 00000000..06a883e9 --- /dev/null +++ b/docs/plans/2026-06-04-067-feat-per-target-container-memory-plan.md @@ -0,0 +1,54 @@ +--- +title: "feat: Per-target container_memory in scrape-targets.json" +type: feat +status: complete +date: 2026-06-04 +origin: /lfg — plan 063 deferred per-target memory; KotOR yes_general catch-up should auto-apply 8g when scraping KotOR_discord_msgs alone without global scrape.env change +--- + +# feat: Per-target container_memory in scrape-targets.json + +## 
Summary + +Optional `container_memory` on each target in `scrape-targets.json`. When `run-discord-scrape-host.sh` runs with exactly one `--target`, apply that target's memory cap to compose unless `DCE_CONTAINER_MEMORY` is already set non-zero in the environment or `scrape.env`. + +## Problem + +Operators must set global `DCE_CONTAINER_MEMORY=8g` in `scrape.env` for KotOR `yes_general` catch-up, affecting every scrape including lightweight targets. A per-target knob keeps default runs unlimited while auto-raising memory for KotOR-only runs. + +## Requirements + +| ID | Requirement | +|----|-------------| +| R1 | Targets may include optional `container_memory` (e.g. `8g`) | +| R2 | Host runner applies it when exactly one `--target` is selected and global `DCE_CONTAINER_MEMORY` is unset or `0` | +| R3 | Explicit `DCE_CONTAINER_MEMORY` in shell or `scrape.env` wins over config | +| R4 | Run plan banner shows per-target memory when applied | +| R5 | `KotOR_discord_msgs` configured with `container_memory: "8g"` | +| R6 | Host smoke asserts config-driven memory when env omits global cap | +| R7 | `DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh` → 21/21 | + +## Implementation Units + +### U1. Config field + host apply + +**Files:** `scripts/lib/scrape-run-plan.sh`, `scripts/run-discord-scrape-host.sh`, `config/scrape-targets.json` + +### U2. 
Smoke + docs + +**Files:** `scripts/tests/run-discord-scrape-host-smoke.sh`, `docs/recurring-scrape-merge-readiness.md`, `docs/recurring-scrape-operator-checklist.md` + +## Verification + +```bash +./scripts/tests/run-discord-scrape-host-smoke.sh +DCE_MIN_FREE_MB=0 ./scripts/run-all-smokes.sh +``` + +## Scope Boundaries + +### Deferred + +- Live KotOR catch-up execution inside LFG +- Numeric max across multiple `--target` flags +- Structured JSON run logs diff --git a/docs/recurring-scrape-merge-readiness.md b/docs/recurring-scrape-merge-readiness.md index ec3cd915..58c94b8d 100644 --- a/docs/recurring-scrape-merge-readiness.md +++ b/docs/recurring-scrape-merge-readiness.md @@ -160,6 +160,8 @@ DCE_MIN_FREE_MB=0 ./scripts/run-operator-validation.sh \ **Plan 066 (2026-06-04):** `prove-incremental-append --channel` filters snapshots and grow-only comparison to selected channels. +**Plan 067 (2026-06-04):** Optional per-target `container_memory` in `scrape-targets.json` (single `--target` runs); `KotOR_discord_msgs` defaults to `8g`. + **Disk:** ~65 GiB free on `/home` (2026-05-30); large channel merges still need headroom. ## CI note (fork PRs) diff --git a/docs/recurring-scrape-operator-checklist.md b/docs/recurring-scrape-operator-checklist.md index 3869a565..b14954b6 100644 --- a/docs/recurring-scrape-operator-checklist.md +++ b/docs/recurring-scrape-operator-checklist.md @@ -58,12 +58,15 @@ Salvage then incremental scrape: ./scripts/run-operator-proof.sh --salvage-before-scrape --sync-gui --target NAME ``` -**KotOR yes_general** (`221726893064454144`): first catch-up after a 2021 archive cursor can take hours and may OOM; salvage preserved partials before retrying. Stop duplicate validation processes (MyBook vs Downloads checkouts share the same lock). 
For large catch-up, set `DCE_CONTAINER_MEMORY=8g` in `scrape.env` (or export before the run), then: +**KotOR yes_general** (`221726893064454144`): first catch-up after a 2021 archive cursor can take hours and may OOM; salvage preserved partials before retrying. Stop duplicate validation processes (MyBook vs Downloads checkouts share the same lock). `KotOR_discord_msgs` sets `container_memory: "8g"` in `scrape-targets.json` for single-target runs; override globally with `DCE_CONTAINER_MEMORY` in `scrape.env` if needed. Channel-scoped proof: ```bash ./scripts/run-operator-validation.sh --salvage-before-scrape \ --target KotOR_discord_msgs --channel 221726893064454144 \ --log-file logs/kotor-yes-general.log + +./scripts/prove-incremental-append.sh \ + --target KotOR_discord_msgs --channel 221726893064454144 ``` ## GUI zip only diff --git a/scrape.env.example b/scrape.env.example index bb6a3451..1dbefc2d 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -21,4 +21,7 @@ DCE_USERNS_MODE= # Optional: raise scrape container memory for multi-year channel catch-up (yes_general, etc.). # Examples: 8g, 8192m. Default 0 = no compose memory cap. +# DCE_CONTAINER_MEMORY sets the compose mem_limit for the scrape container. +# Per-target: set container_memory on a target in config/scrape-targets.json (single --target runs). +# Global override (wins over config): uncomment below. 
# DCE_CONTAINER_MEMORY=8g diff --git a/scripts/lib/scrape-run-plan.sh b/scripts/lib/scrape-run-plan.sh index 7f3cc732..05750b17 100644 --- a/scripts/lib/scrape-run-plan.sh +++ b/scripts/lib/scrape-run-plan.sh @@ -18,7 +18,12 @@ print_scrape_config_plan() { printf 'Targets (%s selected):\n' "${#requested_targets[@]}" for name in "${requested_targets[@]}"; do output_dir=$(jq -r --arg name "$name" '.targets[] | select(.name == $name) | .output_dir' "$config_path") - printf ' - %s → %s\n' "$name" "$output_dir" + mem=$(target_container_memory "$config_path" "$name" 2>/dev/null || true) + if [[ -n "$mem" && "$mem" != "null" ]]; then + printf ' - %s → %s (container_memory: %s)\n' "$name" "$output_dir" "$mem" + else + printf ' - %s → %s\n' "$name" "$output_dir" + fi done return 0 fi @@ -35,3 +40,34 @@ enabled_target_names() { local config_path=$1 jq -r '.targets[] | select(.enabled != false) | .name' "$config_path" } + +target_container_memory() { + local config_path=$1 + local target_name=$2 + + if [[ ! -f "$config_path" ]]; then + printf '\n' + return 0 + fi + jq -r --arg name "$target_name" ' + .targets[] + | select(.name == $name) + | .container_memory // empty + ' "$config_path" +} + +apply_single_target_container_memory() { + local config_path=$1 + local target_name=$2 + local mem="" + + [[ -n "$target_name" ]] || return 0 + if [[ -n "${DCE_CONTAINER_MEMORY:-}" && "${DCE_CONTAINER_MEMORY:-0}" != "0" ]]; then + return 0 + fi + + mem=$(target_container_memory "$config_path" "$target_name") + [[ -n "$mem" && "$mem" != "null" ]] || return 0 + + export DCE_CONTAINER_MEMORY="$mem" +} diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 0829f337..a3992371 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -43,6 +43,8 @@ Environment: DCE_COMPOSE_TTY When zero, compose run passes -T (no pseudo-TTY). Default omits -T so compose backends allocate a TTY for line-buffered progress logs. 
DCE_CONTAINER_MEMORY Optional container memory cap (e.g. 8g, 8192m). Default 0 = unlimited. + Targets may set container_memory in scrape-targets.json (used when + exactly one --target is selected and this env var is unset or 0). Notes: When $ENV_FILE is missing, exported DISCORD_TOKEN or DISCORD_TOKEN_FILE is used instead. @@ -507,6 +509,23 @@ run_subcommand_with_retry() { die "Container run failed for '$subcommand' after one auth refresh retry." } +collect_passthrough_targets() { + local -n _targets_out=$1 + shift + local -a args=("$@") + local idx=0 + + _targets_out=() + while (( idx < ${#args[@]} )); do + if [[ "${args[idx]}" == "--target" ]]; then + _targets_out+=("${args[idx + 1]:-}") + idx=$((idx + 2)) + continue + fi + idx=$((idx + 1)) + done +} + main() { local -a passthrough_args=() local subcommand="" @@ -562,22 +581,20 @@ main() { fi [[ -f "$COMPOSE_FILE" ]] || die "Missing compose file: $COMPOSE_FILE" + + local host_config host_targets=() + host_config=$(resolve_host_config_path "${passthrough_args[@]}") + collect_passthrough_targets host_targets "${passthrough_args[@]}" + if ((${#host_targets[@]} == 1)); then + apply_single_target_container_memory "$host_config" "${host_targets[0]}" + fi + if [[ "$subcommand" != "salvage" ]]; then prepare_compose_env fi REAUTH_COMMAND="${DCE_REAUTH_COMMAND:-}" run_disk_preflight_if_enabled "${passthrough_args[@]}" - local host_config host_targets=() arg_idx=0 - host_config=$(resolve_host_config_path "${passthrough_args[@]}") - while (( arg_idx < ${#passthrough_args[@]} )); do - if [[ "${passthrough_args[arg_idx]}" == "--target" ]]; then - host_targets+=("${passthrough_args[arg_idx + 1]:-}") - arg_idx=$((arg_idx + 2)) - continue - fi - arg_idx=$((arg_idx + 1)) - done print_scrape_config_plan "$host_config" "Host $subcommand" "${host_targets[@]}" case "$subcommand" in diff --git a/scripts/tests/run-discord-scrape-host-smoke.sh b/scripts/tests/run-discord-scrape-host-smoke.sh index bd120d24..5b2cd512 100755 --- 
a/scripts/tests/run-discord-scrape-host-smoke.sh +++ b/scripts/tests/run-discord-scrape-host-smoke.sh @@ -254,4 +254,54 @@ grep -q 'env:DCE_CONTAINER_MEMORY=8g' "$COMPOSE_MEM_LOG" || { exit 1 } +TARGET_MEM_CONFIG="$TMP_DIR/target-mem-config.json" +mkdir -p "$TMP_DIR/archive/demo" +cat >"$TARGET_MEM_CONFIG" <<JSON +{ + "targets": [ + { + "name": "demo", + "output_dir": "$TMP_DIR/archive/demo", + "container_memory": "4g" + } + ] +} +JSON + +ENV_NO_MEM="$TMP_DIR/no-mem.env" +printf 'DISCORD_TOKEN=dummy\n' >"$ENV_NO_MEM" +COMPOSE_TARGET_MEM_LOG="$TMP_DIR/compose-target-mem.log" +env -u DCE_CONTAINER_MEMORY \ + DCE_SKIP_SCRAPE_LOCK=1 \ + DCE_COMPOSE_BIN="$FAKE_COMPOSE" \ + DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_ENV_FILE="$ENV_NO_MEM" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + FAKE_COMPOSE_ARGS_LOG="$COMPOSE_TARGET_MEM_LOG" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape \ + --config "$TARGET_MEM_CONFIG" --target demo >/dev/null +grep -q 'env:DCE_CONTAINER_MEMORY=4g' "$COMPOSE_TARGET_MEM_LOG" || { + echo "expected target container_memory=4g in compose env when global unset" >&2 + cat "$COMPOSE_TARGET_MEM_LOG" >&2 + exit 1 +} + +ENV_OVERRIDE="$TMP_DIR/override-mem.env" +printf 'DISCORD_TOKEN=dummy\nDCE_CONTAINER_MEMORY=2g\n' >"$ENV_OVERRIDE" +COMPOSE_OVERRIDE_LOG="$TMP_DIR/compose-override-mem.log" +env -u DCE_CONTAINER_MEMORY \ + DCE_SKIP_SCRAPE_LOCK=1 \ + DCE_COMPOSE_BIN="$FAKE_COMPOSE" \ + DCE_REPO_ROOT="$REPO_ROOT" \ + DCE_ENV_FILE="$ENV_OVERRIDE" \ + DCE_COMPOSE_FILE="$COMPOSE_FILE" \ + FAKE_COMPOSE_ARGS_LOG="$COMPOSE_OVERRIDE_LOG" \ + "$REPO_ROOT/scripts/run-discord-scrape-host.sh" scrape \ + --config "$TARGET_MEM_CONFIG" --target demo >/dev/null +grep -q 'env:DCE_CONTAINER_MEMORY=2g' "$COMPOSE_OVERRIDE_LOG" || { + echo "expected scrape.env DCE_CONTAINER_MEMORY to override target config" >&2 + cat "$COMPOSE_OVERRIDE_LOG" >&2 + exit 1 +} + echo "run-discord-scrape-host smoke test passed"