From df499568d90763d07b7267350256737600c499eb Mon Sep 17 00:00:00 2001 From: Boden Date: Thu, 28 May 2026 00:08:22 -0500 Subject: [PATCH] fix: harden recurring scrape scripts from review residuals Use max message ID for incremental exports, validate custom cron expressions, drop eval from host/preflight paths, restrict reauth to executable repo scripts, and run smoke tests in CI. Co-authored-by: Cursor --- .github/workflows/main.yml | 24 +++++++ ...ecurring-scrape-residual-hardening-plan.md | 69 +++++++++++++++++++ scrape.env.example | 3 +- scripts/run-discord-scrape-host.sh | 53 ++++++++++---- scripts/run-discord-scrape.sh | 5 +- scripts/setup-cron.sh | 53 ++++++++++---- scripts/tests/run-discord-scrape-smoke.sh | 27 +++++++- scripts/tests/setup-cron-smoke.sh | 8 +++ .../test-fixtures/append-after-high-id.json | 23 +++++++ .../append-unordered-cursor.json | 33 +++++++++ 10 files changed, 266 insertions(+), 32 deletions(-) create mode 100644 docs/plans/2026-05-27-004-fix-recurring-scrape-residual-hardening-plan.md create mode 100644 scripts/tests/test-fixtures/append-after-high-id.json create mode 100644 scripts/tests/test-fixtures/append-unordered-cursor.json diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f857a9af..280b368d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,6 +50,30 @@ jobs: if: ${{ failure() && steps.verify.outcome == 'failure' }} run: echo "::error title=Bad formatting::Formatting issues detected. Please build the solution locally to fix them." + recurring-scrape-smoke: + runs-on: ubuntu-latest + timeout-minutes: 10 + + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install jq + run: sudo apt-get update && sudo apt-get install -y jq + + - name: Run recurring scrape smoke tests + run: | + chmod +x scripts/*.sh scripts/tests/*.sh + ./scripts/tests/run-discord-scrape-smoke.sh + ./scripts/tests/error-path-smoke.sh + ./scripts/tests/cron-idempotency-smoke.sh + ./scripts/tests/end-to-end-preflight-smoke.sh + ./scripts/tests/setup-cron-smoke.sh + ./scripts/tests/run-discord-scrape-host-smoke.sh + test: # Tests need access to secrets, so we can't run them against PRs because of limited trust if: ${{ github.event_name != 'pull_request' }} diff --git a/docs/plans/2026-05-27-004-fix-recurring-scrape-residual-hardening-plan.md b/docs/plans/2026-05-27-004-fix-recurring-scrape-residual-hardening-plan.md new file mode 100644 index 00000000..d35bcb21 --- /dev/null +++ b/docs/plans/2026-05-27-004-fix-recurring-scrape-residual-hardening-plan.md @@ -0,0 +1,69 @@ +--- +title: fix: Harden recurring scrape scripts from code review residuals +type: fix +status: completed +date: 2026-05-27 +origin: PR #1538 residual review findings +--- + +# fix: Harden recurring scrape scripts from code review residuals + +## Summary + +Address actionable security and correctness findings deferred from the finalization validation pass on `feat/recurring-cli-scrape`. Scope is limited to script-layer hardening and CI wiring—no C# exporter changes. + +## Problem Frame + +The recurring scrape automation is validated by smoke tests but still has review-flagged risks: arbitrary command execution via `eval`/`bash -lc`, unvalidated custom cron expressions, incorrect incremental cursor selection, and smoke tests not running in CI. + +## Requirements + +| ID | Requirement | Files | +|----|-------------|-------| +| U1 | Incremental exports must use the highest message snowflake ID, not the last array element | `scripts/run-discord-scrape.sh`, smoke fixture + assertion | +| U2 | `--cron` expressions must be validated (five fields, safe charset) before crontab install | `scripts/setup-cron.sh`, smoke test | +| U3 | Host runner and setup preflight must invoke compose without `eval` | `scripts/run-discord-scrape-host.sh`, `scripts/setup-cron.sh` | +| U4 | `DCE_REAUTH_COMMAND` must be an executable script under the repo root (no `bash -lc` arbitrary strings) | `scripts/run-discord-scrape-host.sh` | +| U5 | Recurring scrape smoke suite runs in GitHub Actions on PR/push | `.github/workflows/main.yml` | + +## Implementation Units + +### U1 — max message ID cursor + +- Change `last_message_id` to `max_by(.id)` with empty-array guard. +- Add fixture where max ID is not the last message; extend fake CLI to log/assert `--after` value. + +### U2 — cron validation + +- Add `validate_cron_expression` called when `CRON_EXPRESSION` is set. +- Reject invalid field counts and shell metacharacters. + +### U3 — remove eval + +- Refactor `compose_run_command` to populate a bash array via nameref; execute with `"${args[@]}"`. +- Refactor `build_target_args` / `run_preflight` similarly. + +### U4 — reauth script allowlist + +- Resolve and require absolute path under `$REPO_ROOT`, executable file. +- Execute directly instead of `bash -lc`. + +### U5 — CI smoke job + +- Add `recurring-scrape-smoke` job: install `jq`, run all `scripts/tests/*-smoke.sh` scripts. + +## Test Scenarios + +- Smoke: unordered archive still passes `--after` with max ID. +- Smoke: invalid `--cron` exits non-zero before crontab mutation. +- CI: new job passes on clean branch. + +## Out of Scope + +- Changing upstream PR approval for fork workflows. +- Replacing `%q` cron job line construction (values are operator-controlled paths). + +## Success Criteria + +- All existing smoke scripts pass locally. +- Residual P0/P1 script findings from review are resolved or explicitly superseded by this plan. diff --git a/scrape.env.example b/scrape.env.example index bac07444..b07686f9 100644 --- a/scrape.env.example +++ b/scrape.env.example @@ -3,7 +3,8 @@ DISCORD_TOKEN= # Optional: file whose first line contains DISCORD_TOKEN. Useful for token rotation without editing this env file. DISCORD_TOKEN_FILE= # Optional (manual runs only): command to refresh Discord auth/session before one retry. -# Example: DCE_REAUTH_COMMAND="agent-browser --headed open https://discord.com/channels/@me" +# Optional absolute path to an executable reauth script under the repository root. +# Example: DCE_REAUTH_COMMAND="/path/to/repo/scripts/reauth-discord.sh" DCE_REAUTH_COMMAND= TZ=UTC diff --git a/scripts/run-discord-scrape-host.sh b/scripts/run-discord-scrape-host.sh index 94eab4da..6f443011 100755 --- a/scripts/run-discord-scrape-host.sh +++ b/scripts/run-discord-scrape-host.sh @@ -29,7 +29,7 @@ Options: Environment: DISCORD_TOKEN Direct token value (highest precedence after refresh). DISCORD_TOKEN_FILE Optional path to a file containing the Discord token. - DCE_REAUTH_COMMAND Optional command to run for interactive/manual reauth when auth fails. + DCE_REAUTH_COMMAND Optional absolute path to an executable reauth script under the repo root. EOF } @@ -92,13 +92,14 @@ ensure_token_present() { [[ -n "${DISCORD_TOKEN:-}" ]] || die "DISCORD_TOKEN is not set. Set DISCORD_TOKEN or DISCORD_TOKEN_FILE in $ENV_FILE." } -compose_run_command() { - local subcommand=$1 - shift - local -a command_parts +compose_run_args() { + local -n _out=$1 + local subcommand=$2 + shift 2 + _out=() if [[ -n "$COMPOSE_BIN" ]]; then - command_parts=( + _out=( "$COMPOSE_BIN" --env-file "$ENV_FILE" -f "$COMPOSE_FILE" @@ -109,7 +110,7 @@ compose_run_command() { "$subcommand" ) elif (( DOCKER_BIN_OVERRIDDEN == 0 )) && command -v docker-compose >/dev/null 2>&1; then - command_parts=( + _out=( docker-compose --env-file "$ENV_FILE" -f "$COMPOSE_FILE" @@ -120,7 +121,7 @@ compose_run_command() { "$subcommand" ) else - command_parts=( + _out=( "$DOCKER_BIN" compose --env-file "$ENV_FILE" @@ -133,8 +134,26 @@ compose_run_command() { ) fi - command_parts+=("$@") - printf '%q ' "${command_parts[@]}" + _out+=("$@") +} + +resolve_reauth_command() { + local candidate=$1 + local resolved_dir resolved_path + + [[ -n "$candidate" ]] || return 1 + [[ "$candidate" == /* ]] || die "DCE_REAUTH_COMMAND must be an absolute path to an executable script under the repository." + + resolved_dir=$(cd "$(dirname "$candidate")" && pwd -P) + resolved_path="$resolved_dir/$(basename "$candidate")" + [[ -f "$resolved_path" ]] || die "DCE_REAUTH_COMMAND does not exist: $candidate" + [[ -x "$resolved_path" ]] || die "DCE_REAUTH_COMMAND is not executable: $candidate" + case "$resolved_path" in + "$REPO_ROOT"/*) ;; + *) die "DCE_REAUTH_COMMAND must be a script inside the repository root." ;; + esac + + printf '%s\n' "$resolved_path" } is_discord_auth_failure() { @@ -145,22 +164,26 @@ is_discord_auth_failure() { } try_interactive_reauth() { + local reauth_script + [[ -n "$REAUTH_COMMAND" ]] || return 1 [[ -t 0 && -t 1 ]] || return 1 + reauth_script=$(resolve_reauth_command "$REAUTH_COMMAND") printf 'Auth failed; running DCE_REAUTH_COMMAND...\n' >&2 - bash -lc "$REAUTH_COMMAND" + "$reauth_script" } run_subcommand_with_retry() { local subcommand=$1 shift - local run_command output_file + local -a run_args=() + local output_file ensure_token_present output_file=$(mktemp "${TMPDIR:-/tmp}/dce-host-run.XXXXXX.log") - run_command=$(compose_run_command "$subcommand" "$@") - if eval "$run_command" >"$output_file" 2>&1; then + compose_run_args run_args "$subcommand" "$@" + if "${run_args[@]}" >"$output_file" 2>&1; then cat "$output_file" rm -f "$output_file" return 0 @@ -178,7 +201,7 @@ run_subcommand_with_retry() { try_interactive_reauth || true ensure_token_present - if eval "$run_command" >"$output_file" 2>&1; then + if "${run_args[@]}" >"$output_file" 2>&1; then cat "$output_file" rm -f "$output_file" return 0 diff --git a/scripts/run-discord-scrape.sh b/scripts/run-discord-scrape.sh index c113bdf9..976c4b29 100755 --- a/scripts/run-discord-scrape.sh +++ b/scripts/run-discord-scrape.sh @@ -331,7 +331,10 @@ last_message_id() { local export_path=$1 [[ -f "$export_path" ]] || return 0 - jq -r '(.messages | last | .id) // empty' "$export_path" + jq -r ' + (.messages // []) + | if length == 0 then empty else (max_by(.id) | .id) end + ' "$export_path" } message_count() { diff --git a/scripts/setup-cron.sh b/scripts/setup-cron.sh index e6e601a1..069bc868 100755 --- a/scripts/setup-cron.sh +++ b/scripts/setup-cron.sh @@ -97,25 +97,37 @@ strip_existing_job() { ' <<<"$existing_crontab" } -build_target_args() { - local -a command_parts +validate_cron_expression() { + local expr=$1 + local -a fields=() + local field + + read -r -a fields <<<"$expr" + ((${#fields[@]} == 5)) || die "--cron must contain exactly five fields (minute hour day month weekday)." + + for field in "${fields[@]}"; do + [[ -n "$field" ]] || die "Empty field in --cron expression." + [[ "$field" =~ ^[0-9*,/-]+$ ]] || die "Invalid cron field '$field' in --cron expression." + done +} + +append_target_args() { + local -n _out=$1 local target for target in "${TARGETS[@]}"; do - command_parts+=(--target "$target") + _out+=(--target "$target") done local guild_id for guild_id in "${GUILDS[@]}"; do - command_parts+=(--guild "$guild_id") + _out+=(--guild "$guild_id") done local channel_id for channel_id in "${CHANNELS[@]}"; do - command_parts+=(--channel "$channel_id") + _out+=(--channel "$channel_id") done - - printf '%q ' "${command_parts[@]}" } ensure_target_directories() { @@ -162,12 +174,17 @@ validate_targets() { } run_preflight() { - local preflight_command target_args + local -a preflight_args=() [[ -f "$ENV_FILE" ]] || die "Missing env file: $ENV_FILE" - target_args=$(build_target_args) - preflight_command="$(printf '%q ' "$HOST_RUNNER") --env-file $(printf '%q' "$ENV_FILE") --compose-file $(printf '%q' "$COMPOSE_FILE") preflight ${target_args}" - eval "$preflight_command" + preflight_args=( + "$HOST_RUNNER" + --env-file "$ENV_FILE" + --compose-file "$COMPOSE_FILE" + preflight + ) + append_target_args preflight_args + "${preflight_args[@]}" } main() { @@ -263,6 +280,7 @@ main() { local cron_line if [[ -n "$CRON_EXPRESSION" ]]; then + validate_cron_expression "$CRON_EXPRESSION" cron_line=$CRON_EXPRESSION else cron_line=$(cron_from_schedule "$INTERVAL" "$RUN_AT") @@ -270,7 +288,8 @@ main() { local begin_marker="# BEGIN ${JOB_NAME}" local end_marker="# END ${JOB_NAME}" - local current_crontab cleaned_crontab scrape_command target_args job_line lock_prefix + local current_crontab cleaned_crontab scrape_command job_line lock_prefix + local -a scrape_args=() current_crontab=$("$CRONTAB_BIN" -l 2>/dev/null || true) cleaned_crontab=$(strip_existing_job "$current_crontab" "$begin_marker" "$end_marker") @@ -291,8 +310,14 @@ main() { run_preflight fi - target_args=$(build_target_args) - scrape_command="$(printf '%q ' "$HOST_RUNNER") --env-file $(printf '%q' "$ENV_FILE") --compose-file $(printf '%q' "$COMPOSE_FILE") scrape ${target_args}" + scrape_args=( + "$HOST_RUNNER" + --env-file "$ENV_FILE" + --compose-file "$COMPOSE_FILE" + scrape + ) + append_target_args scrape_args + scrape_command=$(printf '%q ' "${scrape_args[@]}") if command -v flock >/dev/null 2>&1; then lock_prefix=$(printf '%q ' "$(command -v flock)" "-n" "/tmp/${JOB_NAME}.lock") else diff --git a/scripts/tests/run-discord-scrape-smoke.sh b/scripts/tests/run-discord-scrape-smoke.sh index ee8cc382..8b6f9fb5 100755 --- a/scripts/tests/run-discord-scrape-smoke.sh +++ b/scripts/tests/run-discord-scrape-smoke.sh @@ -94,6 +94,14 @@ cat >"$CONFIG_PATH" <&2 + exit 1 + fi + shift 2 + ;; + --channel|--format) shift 2 ;; *) @@ -129,6 +146,7 @@ case "$subcommand" in case "$mode" in initial) cp "$fixture_dir/append-existing.json" "$output" ;; append) cp "$fixture_dir/append-incremental.json" "$output" ;; + append-after-high-id) cp "$fixture_dir/append-after-high-id.json" "$output" ;; partial-write) cp "$fixture_dir/append-partial-write.json" "$output" ;; concurrent-conflict) cp "$fixture_dir/append-concurrent-conflict.json" "$output" ;; wrong-channel) cp "$fixture_dir/wrong-channel.json" "$output" ;; @@ -153,6 +171,7 @@ run_wrapper() { DCE_FALLBACK_CONFIG="$CONFIG_PATH" \ FAKE_DCE_FIXTURE_DIR="$FIXTURE_DIR" \ FAKE_DCE_MODE="$mode" \ + FAKE_DCE_EXPECT_AFTER="${FAKE_DCE_EXPECT_AFTER:-}" \ "$REPO_ROOT/scripts/run-discord-scrape.sh" scrape --target "$target_name" } @@ -262,5 +281,11 @@ IDEMPOTENT_CHECKSUM_2=$(sha256sum "$IDEMPOTENT_DEST" | awk '{print $1}') [[ "$(jq -r '.channel.id' "$DEST")" == "111" ]] || { echo "expected channel id to be preserved after merge" >&2; exit 1; } [[ "$(jq -r '.messages[0] | has("id") and has("timestamp") and has("content")' "$DEST")" == "true" ]] || { echo "expected message structure to be complete after merge" >&2; exit 1; } +mkdir -p "$ARCHIVE_ROOT/cursor-max-id" +cp "$FIXTURE_DIR/append-unordered-cursor.json" "$ARCHIVE_ROOT/cursor-max-id/$DEFAULT_FILE_NAME" +FAKE_DCE_EXPECT_AFTER=999 run_wrapper cursor-max-id append-after-high-id +CURSOR_DEST="$ARCHIVE_ROOT/cursor-max-id/$DEFAULT_FILE_NAME" +[[ "$(jq -r '.messages | length' "$CURSOR_DEST")" == "4" ]] || { echo "expected cursor-max-id archive to contain four messages" >&2; exit 1; } + echo "U1: append-only merge test coverage passed" diff --git a/scripts/tests/setup-cron-smoke.sh b/scripts/tests/setup-cron-smoke.sh index 8028cdf7..9d44b621 100755 --- a/scripts/tests/setup-cron-smoke.sh +++ b/scripts/tests/setup-cron-smoke.sh @@ -129,4 +129,12 @@ cmp -s "$CRONTAB_FILE" "$TMP_DIR/crontab-before-preflight-fail.txt" || { exit 1 } +if run_setup --cron "0 2 * * *; touch /tmp/evil" --skip-preflight 2>/dev/null; then + echo "expected invalid --cron expression to fail validation" >&2 + exit 1 +fi + +preview_custom_cron=$(run_setup --cron "15 03 * * 0" --skip-preflight --dry-run) +grep -q '^15 03 \* \* 0 ' <<<"$preview_custom_cron" || { echo "expected validated custom cron in dry-run output" >&2; exit 1; } + echo "setup-cron smoke test passed" diff --git a/scripts/tests/test-fixtures/append-after-high-id.json b/scripts/tests/test-fixtures/append-after-high-id.json new file mode 100644 index 00000000..8a1b7564 --- /dev/null +++ b/scripts/tests/test-fixtures/append-after-high-id.json @@ -0,0 +1,23 @@ +{ + "guild": { + "id": "222", + "name": "Fixture Guild" + }, + "channel": { + "id": "111", + "name": "fixture-room", + "category": "Testing Grounds" + }, + "messages": [ + { + "id": "1000", + "timestamp": "2026-01-03T00:00:00Z", + "content": "after-high-id" + } + ], + "dateRange": { + "after": "999", + "before": null + }, + "exportedAt": "2026-01-03T00:00:00Z" +} diff --git a/scripts/tests/test-fixtures/append-unordered-cursor.json b/scripts/tests/test-fixtures/append-unordered-cursor.json new file mode 100644 index 00000000..4d031de0 --- /dev/null +++ b/scripts/tests/test-fixtures/append-unordered-cursor.json @@ -0,0 +1,33 @@ +{ + "guild": { + "id": "222", + "name": "Fixture Guild" + }, + "channel": { + "id": "111", + "name": "fixture-room", + "category": "Testing Grounds" + }, + "messages": [ + { + "id": "1", + "timestamp": "2026-01-01T00:00:00Z", + "content": "first" + }, + { + "id": "999", + "timestamp": "2026-01-01T12:00:00Z", + "content": "high-id-not-last" + }, + { + "id": "2", + "timestamp": "2026-01-02T00:00:00Z", + "content": "second" + } + ], + "dateRange": { + "after": null, + "before": null + }, + "exportedAt": "2026-01-02T00:00:00Z" +}