screentinker/server/lib/content-ack-limiter.js
ScreenTinker dbac699854 fix(#143): content-ack flood control — per-device rate budget + loop-lag valve
#142's content-ack dedup is insufficient: a device cycling 2-4 content IDs makes
every ack look unique so dedup never fires, while aggregate volume from ~30 devices
saturates the event loop (the #142 reconnect throttle kept the server responsive,
which is how this was even observable).

Folded ONE control on the content-ack path (no competing limiters; reconnect-
throttle.js untouched) in lib/content-ack-limiter.js:
- Step 1 — per-device RATE budget: caps TOTAL non-duplicate acks per device per
  window regardless of differing content_id (the case dedup misses). Over budget =
  DROP silently (the per-ack log+emit is the cost); log ONCE per device per window
  when shedding starts. Keeps the #142 dedup (dedup'd repeats don't consume budget).
  Per-device, in-memory, resets on restart (modeled on lastPlayLogAt; does NOT reuse
  reconnect-throttle's ban-semantics bucket).
  Env (TUNING GUESSES, validate vs Bold's fleet): CONTENT_ACK_MAX_PER_WINDOW=20,
  CONTENT_ACK_RATE_WINDOW_MS=10000 (=2/s, above legit ~<=1/s, below the flood).
- Step 2 — global pressure valve: reuses the #142 loop-lag band (+ its hysteresis,
  no second control loop). Under CRITICAL band, shed content-acks even for an
  in-budget device; reconnects + dashboard/HTTP are ALWAYS processed; a healthy
  device in a non-critical band is never touched by the valve. Valve open/close
  logged once at the band edge in services/loop-lag.js (not per shed message).

Tests (unique ports 3985/3986, not the 3982/3983/3984 set):
- unit: the #143 regression (cycling ids evading dedup IS rate-limited), under/over
  budget, dedup still works + doesn't consume budget, valve sheds in-budget under
  critical while normal is untouched, rate precedence, window reset, per-device
  isolation.
- integration: socket flood is capped to budget with a single shed-start log;
  under-budget passes every ack; valve OPEN sheds content-acks while a reconnect +
  /api/status still succeed.
Full suite green serial AND parallel (208 tests).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 22:21:57 -05:00

65 lines
3.2 KiB
JavaScript

// #143 — content-ack flood control (the single control on the content-ack path).
//
// Folds three concerns into ONE per-device limiter so there are no competing
// limiters on this path (reconnect-throttle.js is left untouched):
// 1. #142 dedup — drop an exact (content_id, status) repeat within the dedup
// window. Legit repeat suppression; does NOT consume rate budget.
// 2. #143 per-device RATE budget — cap TOTAL non-duplicate acks per device per
// window regardless of differing content_id. This is what dedup misses: a
// device cycling 2-4 ids makes each ack look unique, so dedup never fires,
// but aggregate volume still floods the loop. Over budget -> shed silently.
// 3. #143 global pressure valve — when loop-lag (services/loop-lag.js) reports
// the CRITICAL band, shed non-essential acks even for a device within its own
// budget. Reuses the existing band + hysteresis; never fires below critical.
//
// Per-device, in-memory, resets on restart (like lastPlayLogAt / pair-lockout).
// Fixed window (counter reset per window) — simple and makes "log once per window"
// natural. `band` is injected so this is testable without the loop-lag monitor.
const config = require('../config');
// Per-device limiter state, keyed by deviceId. Module-level and in-memory only.
// deviceId -> { winStart, count, shedNotified, dup: Map(content|status -> ts) }
//   winStart     - timestamp (ms) at which the current fixed rate window began
//   count        - non-duplicate acks counted so far in the current window
//   shedNotified - true once the "shedding started" signal was returned this window
//   dup          - `${contentId}|${status}` -> timestamp (ms) of the last non-dedup'd ack
const state = new Map();
/**
 * Decide what to do with a single content-ack from a device.
 *
 * Checks run in this order: window roll, dedup, per-device rate budget, global valve.
 *
 * @param {string} deviceId - key under which per-device limiter state is kept.
 * @param {*} contentId - content identifier; combined with `status` to form the dedup key.
 * @param {*} status - ack status; combined with `contentId` to form the dedup key.
 * @param {string} [band='normal'] - injected loop-lag band; only 'critical' opens the valve.
 * @param {number} [now=Date.now()] - injected clock in milliseconds (any non-negative value).
 * @returns {{action: 'pass'}
 *   | {action: 'dedup'}
 *   | {action: 'shed-rate', logStart: boolean, observed: number, budget: number}
 *   | {action: 'shed-valve'}}
 *   - 'pass'       -> caller logs + emits
 *   - 'dedup'      -> drop (exact repeat within the dedup window)
 *   - 'shed-rate'  -> drop (over per-device budget); `logStart` is true only on the
 *                     first shed of the current window
 *   - 'shed-valve' -> drop (global critical-lag valve)
 */
function check(deviceId, contentId, status, band = 'normal', now = Date.now()) {
  let s = state.get(deviceId);
  if (!s) {
    s = { winStart: now, count: 0, shedNotified: false, dup: new Map() };
    state.set(deviceId, s);
  }

  // Roll the fixed rate window.
  if (now - s.winStart >= config.contentAckRateWindowMs) {
    s.winStart = now;
    s.count = 0;
    s.shedNotified = false;
    // Bound the dedup map: drop entries older than the dedup window.
    for (const [k, t] of s.dup) {
      if (now - t >= config.contentAckDedupMs) s.dup.delete(k);
    }
  }

  // 1) Dedup — exact (content, status) repeat within the dedup window. Does NOT
  //    consume rate budget (it's a legit repeat we simply suppress).
  //    Presence is tested explicitly: a `|| 0` fallback would treat "never seen" as
  //    "seen at time 0" and misreport a first ack as a duplicate whenever
  //    `now < contentAckDedupMs` (e.g. small injected clocks in tests).
  const key = `${contentId}|${status}`;
  const lastSeen = s.dup.get(key);
  if (lastSeen !== undefined && now - lastSeen < config.contentAckDedupMs) {
    return { action: 'dedup' };
  }
  // Recorded before the rate/valve checks, so an ack that ends up shed still
  // suppresses an identical repeat within the dedup window.
  s.dup.set(key, now);

  // 2) Per-device rate budget — always applies, counts all non-duplicate acks.
  s.count++;
  if (s.count > config.contentAckMaxPerWindow) {
    const logStart = !s.shedNotified; // log ONCE per device per window when shedding starts
    s.shedNotified = true;
    return { action: 'shed-rate', logStart, observed: s.count, budget: config.contentAckMaxPerWindow };
  }

  // 3) Global valve — extra shedding only under critical lag; a within-budget device
  //    in a non-critical band is never touched here.
  if (band === 'critical') return { action: 'shed-valve' };

  return { action: 'pass' };
}
/**
 * Test hook: discard all per-device limiter state so the next `check` call for
 * any device starts from a fresh window, zero count and empty dedup map.
 */
function reset() {
  state.clear();
}
// Public API: `check` classifies one content-ack; `reset` clears all limiter state (tests).
module.exports = { check, reset };