screentinker/server/lib/reconnect-throttle.js
ScreenTinker 101f086204 fix(#142): load-aware per-device reconnect throttle (the outage fix)
Gates genuine reconnects PER DEVICE before the heavy register work (DB writes +
playlist build) runs, so a single flapping device can no longer saturate the
event loop and take down the server.

- Actuator is per-device, keyed on device_id (modeled on lastPlayLogAt). A device
  is flagged only when it exceeds reconnectBaseMax genuine reconnects per window.
  Same-socket playlist refreshes (isPlaylistRefresh) are exempt.
- Load-awareness is BANDED (normal/elevated/critical from the step-2 lag signal),
  not a continuous controller. The band only MULTIPLIES an already-flagged
  device's backoff; global lag never gates a healthy device.
- Hysteresis: escalate immediately while storming (tighten fast); decay one level
  per reconnectReleaseMs of calm (release slow).
- HARD CEILING per device, independent of band and warm-up — a slow-ramp attacker
  can't train through it.
- COLD START: for reconnectWarmupMs after boot, force the normal band and apply
  only the hard ceiling, so a full-fleet reconnect after a deploy doesn't throttle
  healthy screens. State is in-memory, resets on restart.
- Observability: every throttle engagement logs device, band, observed vs allowed
  rate, and backoff. Throttled device gets device:throttled + a deferred disconnect.

Tests (api.test.js style):
- unit: healthy-never-throttled, storm-throttled-with-growing-backoff, band
  multiplies backoff, hard-ceiling-even-in-warmup, warm-up leniency, neighbor
  isolation, slow release.
- integration GATE (the required one): full-fleet reconnect right after restart
  throttles NO healthy device; a single device storming IS throttled; a neighbor
  stays unaffected while another storms.
- also fixes pre-existing test PORT collisions (my new integration files clashed
  with totp.test.js:3979 and totp-keyrotation.test.js:3980 -> moved to 3982/3983);
  full suite now green serially AND in parallel.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 19:18:00 -05:00

99 lines
4.3 KiB
JavaScript

// #142 step 3 — load-aware per-device reconnect throttle (the outage fix).
//
// A single device stuck in a tight websocket reconnect loop can flood the server
// with full register cycles (DB writes + playlist build) and saturate the event
// loop. This module gates genuine reconnects PER DEVICE, before that heavy work
// runs in deviceSocket.js.
//
// Design (mirrors the issue's suggested mitigation + the lastPlayLogAt pattern):
// - WHO is always per-device: a device is "flagged" only when it exceeds
// reconnectBaseMax genuine reconnects within reconnectWindowMs. Global lag
// NEVER flags a healthy device.
// - Load-awareness is BANDED (normal/elevated/critical from services/loop-lag),
// not a continuous controller — deterministic and testable. The band only
// MULTIPLIES the backoff applied to an ALREADY-flagged device.
// - Hysteresis: escalate immediately while storming (tighten fast); decay the
// escalation level one step per reconnectReleaseMs of calm (release slow).
// - HARD CEILING: independent of band and of warm-up, no device may exceed
// reconnectHardCeiling/window — a slow-ramp attacker can't train through it.
// - COLD START: for reconnectWarmupMs after process start, force the 'normal'
// band and apply only the hard ceiling, so a full-fleet reconnect right after
// a deploy doesn't throttle healthy screens.
// - State is in-memory (resets on restart), like pair-lockout / totp-lockout.
const config = require('../config');
const loopLag = require('../services/loop-lag');
// Per-device throttle bookkeeping, keyed by device id:
// deviceId -> { hits: number[], level: number, blockedUntil: ms, lastThrottleAt: ms }
//   hits           timestamps of counted reconnects, trimmed to the sliding window by check()
//   level          current escalation level (0 = not escalated)
//   blockedUntil   reconnects arriving before this time are rejected
//   lastThrottleAt time of the most recent rejection or escalation-level decay
const state = new Map();
// Warm-up origin: check() treats the first reconnectWarmupMs after this instant
// as cold start. Declared with `let` so __resetForTest can rewind it.
let startedAt = Date.now();
// Map a load band to the factor applied to a flagged device's backoff.
// 'critical' and 'elevated' read their factor from config; every other band
// (including 'normal') leaves the backoff unscaled.
function bandMultiplier(band) {
  switch (band) {
    case 'critical':
      return config.reconnectBandCriticalMult;
    case 'elevated':
      return config.reconnectBandElevatedMult;
    default:
      return 1;
  }
}
// Record a rejection on the device entry `s` and build the "denied" verdict.
// Each rejection bumps the escalation level (capped at reconnectMaxLevel); the
// backoff doubles per level, is scaled by the band multiplier, and is capped at
// reconnectMaxBackoffMs. `reason`, `observed` and `allowed` are passed through
// unchanged for the caller to report.
function reject(s, now, band, reason, observed, allowed) {
  const { reconnectMaxLevel, reconnectBaseBackoffMs, reconnectMaxBackoffMs } = config;
  const level = Math.min(s.level + 1, reconnectMaxLevel);
  const uncapped = reconnectBaseBackoffMs * 2 ** (level - 1) * bandMultiplier(band);
  const retryAfterMs = Math.min(uncapped, reconnectMaxBackoffMs);
  // Start the enforced backoff window from this rejection.
  Object.assign(s, { level, blockedUntil: now + retryAfterMs, lastThrottleAt: now });
  return { allow: false, retryAfterMs, reason, observed, allowed, band, level };
}
// Time of the last idle-entry sweep of `state` (see pruneIdleDevices).
let lastSweepAt = 0;

// Drop per-device entries that are indistinguishable from a freshly created one
// (no escalation, no active backoff, every recorded hit outside the window), so
// `state` does not grow by one entry for every device id ever seen. Runs at most
// once per reconnectWindowMs; `keepId` is the device currently being checked.
function pruneIdleDevices(now, keepId) {
  if (now - lastSweepAt < config.reconnectWindowMs) return;
  lastSweepAt = now;
  for (const [id, entry] of state) {
    if (id === keepId) continue;
    const isIdle = entry.level === 0
      && now >= entry.blockedUntil
      && entry.hits.every((t) => now - t >= config.reconnectWindowMs);
    if (isIdle) state.delete(id);
  }
}

// Decide whether to allow a genuine reconnect for `deviceId`.
// `now` and `bandOverride` are injectable for deterministic tests; production
// passes only deviceId.
//
// Returns the verdict object produced by allow() or reject(). Decision order:
//   1. inside an enforced backoff window -> reject ('in-backoff') and escalate
//   2. over reconnectHardCeiling hits in the window -> reject ('hard-ceiling')
//   3. during warm-up -> allow (only the hard ceiling applies)
//   4. at or under reconnectBaseMax hits in the window -> allow
//   5. otherwise -> reject ('rate')
function check(deviceId, now = Date.now(), bandOverride = null) {
  const warmup = (now - startedAt) < config.reconnectWarmupMs;
  // Warm-up forces the 'normal' band unless a test injects one explicitly.
  const band = bandOverride !== null ? bandOverride : (warmup ? 'normal' : loopLag.getBand());

  pruneIdleDevices(now, deviceId);

  let s = state.get(deviceId);
  if (!s) {
    s = { hits: [], level: 0, blockedUntil: 0, lastThrottleAt: 0 };
    state.set(deviceId, s);
  }

  // Expire the sliding window first so every path, including the in-backoff
  // rejection below, reports the in-window count rather than a stale one.
  s.hits = s.hits.filter((t) => now - t < config.reconnectWindowMs);

  // Already inside an enforced backoff window: reject and escalate (tighten fast).
  // The attempt itself is not added to the window.
  if (now < s.blockedUntil) {
    return reject(s, now, band, 'in-backoff', s.hits.length, config.reconnectBaseMax);
  }

  // Count this genuine reconnect.
  s.hits.push(now);
  const observed = s.hits.length;

  // Hard ceiling — always enforced, regardless of band or warm-up.
  if (observed > config.reconnectHardCeiling) {
    return reject(s, now, band, 'hard-ceiling', observed, config.reconnectHardCeiling);
  }
  // Cold start: only the hard ceiling applies; never rate-throttle during warm-up.
  if (warmup) return allow(s, now, band);
  // Healthy device: under the per-device threshold -> always allowed.
  if (observed <= config.reconnectBaseMax) return allow(s, now, band);
  // Flagged: storming beyond the per-device threshold -> throttle (band-scaled).
  return reject(s, now, band, 'rate', observed, config.reconnectBaseMax);
}
// Build the "allowed" verdict for device entry `s`, first releasing escalation.
// Release slow: the level drops one step for every full reconnectReleaseMs that
// has passed since the last rejection or decay. The decay is computed from the
// elapsed calm time rather than applied once per call, so a device that was
// throttled long ago and rarely reconnects does not stay escalated.
// Returns { allow: true, band, level }.
function allow(s, now, band) {
  const releaseMs = config.reconnectReleaseMs;
  const calmMs = now - s.lastThrottleAt;
  if (s.level > 0 && calmMs > releaseMs) {
    // A non-positive release period cannot be divided into steps; fall back to
    // a single step per call.
    const steps = releaseMs > 0 ? Math.floor(calmMs / releaseMs) : 1;
    s.level = Math.max(0, s.level - steps);
    // Restart the calm timer from this decay.
    s.lastThrottleAt = now;
  }
  return { allow: true, band, level: s.level };
}
// Test-only hook: forget every device's throttle state and, when
// `opts.startedAt` is supplied, move the warm-up origin to that timestamp.
function __resetForTest(opts = {}) {
  state.clear();
  const { startedAt: rewoundOrigin } = opts;
  if (rewoundOrigin === undefined) return;
  startedAt = rewoundOrigin;
}
// Public surface: check() makes the per-device throttle decision;
// __resetForTest is the test-only state reset.
module.exports = { check, __resetForTest };