screentinker/server/services/loop-lag.js
ScreenTinker dbac699854 fix(#143): content-ack flood control — per-device rate budget + loop-lag valve
#142's content-ack dedup is insufficient: a device cycling 2-4 content IDs makes
every ack look unique so dedup never fires, while aggregate volume from ~30 devices
saturates the event loop (the #142 reconnect throttle kept the server responsive,
which is how this was even observable).

Folded ONE control on the content-ack path (no competing limiters; reconnect-
throttle.js untouched) in lib/content-ack-limiter.js:
- Step 1 — per-device RATE budget: caps TOTAL non-duplicate acks per device per
  window regardless of differing content_id (the case dedup misses). Over budget =
  DROP silently (the per-ack log+emit is the cost); log ONCE per device per window
  when shedding starts. Keeps the #142 dedup (dedup'd repeats don't consume budget).
  Per-device, in-memory, resets on restart (modeled on lastPlayLogAt; does NOT reuse
  reconnect-throttle's ban-semantics bucket).
  Env (TUNING GUESSES, validate vs Bold's fleet): CONTENT_ACK_MAX_PER_WINDOW=20,
  CONTENT_ACK_RATE_WINDOW_MS=10000 (=2/s, above legit ~<=1/s, below the flood).
- Step 2 — global pressure valve: reuses the #142 loop-lag band (+ its hysteresis,
  no second control loop). Under CRITICAL band, shed content-acks even for an
  in-budget device; reconnects + dashboard/HTTP are ALWAYS processed; a healthy
  device in a non-critical band is never touched by the valve. Valve open/close
  logged once at the band edge in services/loop-lag.js (not per shed message).

Tests (unique ports 3985/3986, not the 3982/3983/3984 set):
- unit: the #143 regression (cycling ids evading dedup IS rate-limited), under/over
  budget, dedup still works + doesn't consume budget, valve sheds in-budget under
  critical while normal is untouched, rate precedence, window reset, per-device
  isolation.
- integration: socket flood is capped to budget with a single shed-start log;
  under-budget passes every ack; valve OPEN sheds content-acks while a reconnect +
  /api/status still succeed.
Full suite green serial AND parallel (208 tests).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 22:21:57 -05:00

116 lines
5.3 KiB
JavaScript

// #142 — Event-loop lag telemetry (the data subsystem; ships before the throttle).
//
// Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
// (a C++-backed histogram — cheap). Each window we read mean/p50/p99/max, persist
// a row to the bounded `event_loop_lag` table, and recompute a coarse load BAND
// (normal | elevated | critical) from the window p99.
//
// The band is consumed by the reconnect throttle (#142 step 3), but this module
// has standalone value: getLag() is surfaced on /api/status and band changes are
// logged, so site connectivity/lag is diagnosable independent of any throttling.
//
// Band transitions are deliberately asymmetric (see nextBand): jump UP immediately
// when an up-threshold is crossed (tighten fast), step DOWN only one level at a
// time after lagReleaseSamples consecutive calm samples below a deadband (release
// slow). This avoids band flap from transient blips.
const { monitorEventLoopDelay } = require('perf_hooks');
const { db } = require('../db/database');
const config = require('../config');
const NS_PER_MS = 1e6;
// A band releases only once p99 falls below this fraction of the band's entry
// threshold — the deadband that stops small fluctuations from flapping the band.
const DEADBAND = 0.5;
const LEVEL = { normal: 0, elevated: 1, critical: 2 };
let histogram = null;
let band = 'normal';
let calmSamples = 0;
let current = { mean_ms: 0, p50_ms: 0, p99_ms: 0, max_ms: 0, band: 'normal', sampled_at: 0 };
// Pure band-transition function (exported for deterministic unit tests). Given the
// current band, the window p99 (ms), and the running calm-sample count, returns the
// next [band, calmSamples]. Up is immediate (may skip a level); down is one step
// per release window, gated by a deadband.
function nextBand(cur, p99, calm) {
const level = LEVEL[cur] ?? 0;
// UP — immediate, tighten fast (normal can jump straight to critical).
if (p99 >= config.lagCriticalMs && level < LEVEL.critical) return ['critical', 0];
if (p99 >= config.lagElevatedMs && level < LEVEL.elevated) return ['elevated', 0];
// DOWN — slow, one step, only below the current band's deadband.
if (level === LEVEL.critical && p99 <= config.lagCriticalMs * DEADBAND) {
const c = calm + 1;
return c >= config.lagReleaseSamples ? ['elevated', 0] : ['critical', c];
}
if (level === LEVEL.elevated && p99 <= config.lagElevatedMs * DEADBAND) {
const c = calm + 1;
return c >= config.lagReleaseSamples ? ['normal', 0] : ['elevated', c];
}
// Hold (inside deadband, or already normal): reset the calm counter.
return [cur, 0];
}
const round2 = (x) => Math.round(x * 100) / 100;
function sample() {
const p99 = histogram.percentile(99) / NS_PER_MS;
const snap = {
mean_ms: round2(histogram.mean / NS_PER_MS),
p50_ms: round2(histogram.percentile(50) / NS_PER_MS),
p99_ms: round2(p99),
max_ms: round2(histogram.max / NS_PER_MS),
};
histogram.reset();
const prev = band;
[band, calmSamples] = nextBand(band, snap.p99_ms, calmSamples);
current = { ...snap, band, sampled_at: Math.floor(Date.now() / 1000) };
try {
db.prepare(
'INSERT INTO event_loop_lag (sampled_at, mean_ms, p50_ms, p99_ms, max_ms, band) VALUES (?, ?, ?, ?, ?, ?)'
).run(current.sampled_at, snap.mean_ms, snap.p50_ms, snap.p99_ms, snap.max_ms, band);
} catch (_) { /* table may not exist on a partially-migrated DB */ }
// Observable: log whenever we're loaded or when the band changes (incl. back to
// normal). Healthy steady state stays quiet.
if (band !== 'normal' || prev !== 'normal') {
const tag = band !== prev ? ` (was ${prev})` : '';
console.log(`[loop-lag] band=${band}${tag} mean=${snap.mean_ms}ms p99=${snap.p99_ms}ms max=${snap.max_ms}ms`);
}
// #143 global pressure valve — log ONLY the band edge (open/close), not per shed
// message. When critical, deviceSocket sheds non-essential acks (it reads getBand()).
if (band === 'critical' && prev !== 'critical') {
console.warn(`[shed] global valve OPEN — loop-lag critical (p99=${snap.p99_ms}ms); shedding non-essential device messages (content-acks). reconnects + dashboard still processed.`);
} else if (prev === 'critical' && band !== 'critical') {
console.log(`[shed] global valve CLOSED — loop-lag recovered (band=${band}, p99=${snap.p99_ms}ms)`);
}
}
function pruneLag() {
try {
const cutoff = Math.floor(Date.now() / 1000) - Math.round(config.lagTelemetryRetentionDays * 86400);
const n = db.prepare('DELETE FROM event_loop_lag WHERE sampled_at < ?').run(cutoff).changes;
if (n > 0) console.log(`[loop-lag] pruned ${n} sample(s) older than ${config.lagTelemetryRetentionDays}d`);
} catch (_) { /* ignore */ }
}
function startLoopLagMonitor() {
if (histogram) return; // idempotent
histogram = monitorEventLoopDelay({ resolution: config.lagResolutionMs });
histogram.enable();
const t1 = setInterval(sample, config.lagSampleIntervalMs);
pruneLag(); // sweep stale rows on boot
const t2 = setInterval(pruneLag, config.lagPruneIntervalMs);
// Don't keep the process alive on these timers (matters for tests / clean exit).
if (t1.unref) t1.unref();
if (t2.unref) t2.unref();
}
function getBand() { return band; }
function getLag() { return { ...current }; }
module.exports = { startLoopLagMonitor, getBand, getLag, nextBand };