mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
Gates genuine reconnects PER DEVICE before the heavy register work (DB writes + playlist build) runs, so a single flapping device can no longer saturate the event loop and take down the server. - Actuator is per-device, keyed on device_id (modeled on lastPlayLogAt). A device is flagged only when it exceeds reconnectBaseMax genuine reconnects per window. Same-socket playlist refreshes (isPlaylistRefresh) are exempt. - Load-awareness is BANDED (normal/elevated/critical from the step-2 lag signal), not a continuous controller. The band only MULTIPLIES an already-flagged device's backoff; global lag never gates a healthy device. - Hysteresis: escalate immediately while storming (tighten fast); decay one level per reconnectReleaseMs of calm (release slow). - HARD CEILING per device, independent of band and warm-up — a slow-ramp attacker can't train through it. - COLD START: for reconnectWarmupMs after boot, force the normal band and apply only the hard ceiling, so a full-fleet reconnect after a deploy doesn't throttle healthy screens. State is in-memory, resets on restart. - Observability: every throttle engagement logs device, band, observed vs allowed rate, and backoff. Throttled device gets device:throttled + a deferred disconnect. Tests (api.test.js style): - unit: healthy-never-throttled, storm-throttled-with-growing-backoff, band multiplies backoff, hard-ceiling-even-in-warmup, warm-up leniency, neighbor isolation, slow release. - integration GATE (the required one): full-fleet reconnect right after restart throttles NO healthy device; a single device storming IS throttled; a neighbor stays unaffected while another storms. - also fixes pre-existing test PORT collisions (my new integration files clashed with totp.test.js:3979 and totp-keyrotation.test.js:3980 -> moved to 3982/3983); full suite now green serially AND in parallel. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
99 lines
4.3 KiB
JavaScript
99 lines
4.3 KiB
JavaScript
// #142 step 3 — load-aware per-device reconnect throttle (the outage fix).
//
// A single device stuck in a tight websocket reconnect loop can flood the server
// with full register cycles (DB writes + playlist build) and saturate the event
// loop. This module gates genuine reconnects PER DEVICE, before that heavy work
// runs in deviceSocket.js.
//
// Design (mirrors the issue's suggested mitigation + the lastPlayLogAt pattern):
//   - WHO is always per-device: a device is "flagged" only when it exceeds
//     reconnectBaseMax genuine reconnects within reconnectWindowMs. Global lag
//     NEVER flags a healthy device.
//   - Load-awareness is BANDED (normal/elevated/critical from services/loop-lag),
//     not a continuous controller — deterministic and testable. The band only
//     MULTIPLIES the backoff applied to an ALREADY-flagged device.
//   - Hysteresis: escalate immediately while storming (tighten fast); decay the
//     escalation level one step per reconnectReleaseMs of calm (release slow).
//   - HARD CEILING: independent of band and of warm-up, no device may exceed
//     reconnectHardCeiling/window — a slow-ramp attacker can't train through it.
//   - COLD START: for reconnectWarmupMs after process start, force the 'normal'
//     band and apply only the hard ceiling, so a full-fleet reconnect right after
//     a deploy doesn't throttle healthy screens.
//   - State is in-memory (resets on restart), like pair-lockout / totp-lockout.
|
|
|
|
const config = require('../config');
|
|
const loopLag = require('../services/loop-lag');
|
|
|
|
// Per-device throttle records, keyed by device id:
//   deviceId -> { hits: number[], level: number, blockedUntil: ms, lastThrottleAt: ms }
//   - hits:           timestamps (ms) of reconnects counted in the sliding window
//   - level:          current escalation level (0 = not escalated)
//   - blockedUntil:   timestamp (ms) before which reconnects are rejected
//   - lastThrottleAt: timestamp (ms) of the last rejection or the last level decay
const state = new Map();

// Origin (ms) of the cold-start warm-up window; reassignable only through the
// test-only reset hook.
let startedAt = Date.now();
|
|
|
|
/**
 * Map a load band to the factor applied to a flagged device's backoff.
 *
 * @param {string} band - 'critical', 'elevated', or anything else (treated as normal).
 * @returns {number} The configured multiplier for the band, or 1 for the normal
 *   band and for a missing / non-numeric configured value.
 */
function bandMultiplier(band) {
  let configured;
  if (band === 'critical') {
    configured = config.reconnectBandCriticalMult;
  } else if (band === 'elevated') {
    configured = config.reconnectBandElevatedMult;
  } else {
    return 1;
  }

  // A missing or malformed multiplier would turn the computed backoff into NaN,
  // which silently disables the backoff window; fall back to "no scaling".
  if (configured === undefined || configured === null) return 1;
  const mult = Number(configured);
  return Number.isFinite(mult) && mult >= 0 ? mult : 1;
}
|
|
|
|
/**
 * Record a throttled reconnect on the device record and build the rejection result.
 *
 * Escalates the device one level (capped at config.reconnectMaxLevel), then sets
 * an exponential backoff: base * 2^(level-1) * band multiplier, capped at
 * config.reconnectMaxBackoffMs.
 *
 * @param {{level: number, blockedUntil: number, lastThrottleAt: number}} s - Device record; mutated in place.
 * @param {number} now - Current time in ms.
 * @param {string} band - Load band used to scale the backoff.
 * @param {string} reason - Why the reconnect was rejected (echoed in the result).
 * @param {number} observed - Observed reconnect count (echoed in the result).
 * @param {number} allowed - Limit that applied (echoed in the result).
 * @returns {{allow: false, retryAfterMs: number, reason: string, observed: number,
 *   allowed: number, band: string, level: number}}
 */
function reject(s, now, band, reason, observed, allowed) {
  // Tighten fast: every rejection escalates, up to the configured maximum level.
  s.level = Math.min(s.level + 1, config.reconnectMaxLevel);

  const uncapped =
    config.reconnectBaseBackoffMs * Math.pow(2, s.level - 1) * bandMultiplier(band);
  const backoff = Math.min(uncapped, config.reconnectMaxBackoffMs);

  s.blockedUntil = now + backoff;
  s.lastThrottleAt = now;

  return {
    allow: false,
    retryAfterMs: backoff,
    reason,
    observed,
    allowed,
    band,
    level: s.level,
  };
}
|
|
|
|
/**
 * Decide whether to allow a genuine reconnect for `deviceId`.
 *
 * `now` and `bandOverride` are injectable for deterministic tests; production
 * passes only deviceId.
 *
 * @param {*} deviceId - Key of the per-device throttle record.
 * @param {number} [now=Date.now()] - Current time in ms.
 * @param {?string} [bandOverride=null] - Band to use instead of the warm-up /
 *   loop-lag derived one.
 * @returns {object} The result of allow() ({ allow: true, band, level }) or of
 *   reject() ({ allow: false, retryAfterMs, reason, observed, allowed, band, level }).
 */
function check(deviceId, now = Date.now(), bandOverride = null) {
  const warmup = (now - startedAt) < config.reconnectWarmupMs;
  // During warm-up the band is forced to 'normal' unless a test overrides it.
  const band = bandOverride !== null ? bandOverride : (warmup ? 'normal' : loopLag.getBand());

  let s = state.get(deviceId);
  if (!s) {
    s = { hits: [], level: 0, blockedUntil: 0, lastThrottleAt: 0 };
    state.set(deviceId, s);
  }

  // Age out the sliding window first, so every path (including the in-backoff
  // rejection below) reports an in-window "observed" count rather than a stale one.
  s.hits = s.hits.filter((t) => now - t < config.reconnectWindowMs);

  // Already inside an enforced backoff window: reject and escalate (tighten fast).
  // The attempt is not added to the window.
  if (now < s.blockedUntil) {
    return reject(s, now, band, 'in-backoff', s.hits.length, config.reconnectBaseMax);
  }

  // Count this genuine reconnect.
  s.hits.push(now);
  const observed = s.hits.length;

  // Hard ceiling — always enforced, regardless of band or warm-up.
  if (observed > config.reconnectHardCeiling) {
    return reject(s, now, band, 'hard-ceiling', observed, config.reconnectHardCeiling);
  }

  // Cold start: only the hard ceiling applies; never rate-throttle during warm-up.
  if (warmup) return allow(s, now, band);

  // Healthy device: under the per-device threshold -> always allowed.
  if (observed <= config.reconnectBaseMax) return allow(s, now, band);

  // Flagged: storming beyond the per-device threshold -> throttle (band-scaled).
  return reject(s, now, band, 'rate', observed, config.reconnectBaseMax);
}
|
|
|
|
/**
 * Build the "allowed" result for a reconnect, decaying the device's escalation
 * level first when it has been calm long enough.
 *
 * Release slow: the level drops one step for every full reconnectReleaseMs that
 * has elapsed since the last throttle (or the last decay), so a device that was
 * quiet for several release intervals is credited for all of them on its next
 * allowed reconnect rather than for just one.
 *
 * @param {{level: number, lastThrottleAt: number}} s - Device record; mutated in place.
 * @param {number} now - Current time in ms.
 * @param {string} band - Load band, echoed in the result.
 * @returns {{allow: true, band: string, level: number}}
 */
function allow(s, now, band) {
  const releaseMs = config.reconnectReleaseMs;
  const calmMs = now - s.lastThrottleAt;

  if (s.level > 0 && calmMs > releaseMs) {
    // With a non-positive release interval the division is meaningless; keep the
    // single-step decay in that case.
    const steps = releaseMs > 0 ? Math.floor(calmMs / releaseMs) : 1;
    s.level = Math.max(0, s.level - Math.max(1, steps));
    // Restart the calm clock so the next decay needs a fresh release interval.
    s.lastThrottleAt = now;
  }

  return { allow: true, band, level: s.level };
}
|
|
|
|
/**
 * Test-only: clear all per-device state and optionally rewind the warm-up origin.
 *
 * @param {?{startedAt?: number}} [opts={}] - When `startedAt` is provided (not
 *   undefined) it replaces the module's warm-up origin; otherwise the origin is
 *   left as is. An explicit null is treated like an empty options object.
 */
function __resetForTest(opts = {}) {
  state.clear();
  // The default parameter only covers undefined; guard null so a reset never throws.
  if (opts !== null && opts.startedAt !== undefined) {
    startedAt = opts.startedAt;
  }
}
|
|
|
|
module.exports = { check, __resetForTest };
|