screentinker/server/test/reconnect-throttle.test.js
ScreenTinker 101f086204 fix(#142): load-aware per-device reconnect throttle (the outage fix)
Gates genuine reconnects PER DEVICE before the heavy register work (DB writes +
playlist build) runs, so a single flapping device can no longer saturate the
event loop and take down the server.

- Actuator is per-device, keyed on device_id (modeled on lastPlayLogAt). A device
  is flagged only when it exceeds reconnectBaseMax genuine reconnects per window.
  Same-socket playlist refreshes (isPlaylistRefresh) are exempt.
- Load-awareness is BANDED (normal/elevated/critical from the step-2 lag signal),
  not a continuous controller. The band only MULTIPLIES an already-flagged
  device's backoff; global lag never gates a healthy device.
- Hysteresis: escalate immediately while storming (tighten fast); decay one level
  per reconnectReleaseMs of calm (release slow).
- HARD CEILING per device, independent of band and warm-up — a slow-ramp attacker
  can't train through it.
- COLD START: for reconnectWarmupMs after boot, force the normal band and apply
  only the hard ceiling, so a full-fleet reconnect after a deploy doesn't throttle
  healthy screens. State is in-memory, resets on restart.
- Observability: every throttle engagement logs device, band, observed vs allowed
  rate, and backoff. Throttled device gets device:throttled + a deferred disconnect.

Tests (api.test.js style):
- unit: healthy-never-throttled, storm-throttled-with-growing-backoff, band
  multiplies backoff, hard-ceiling-even-in-warmup, warm-up leniency, neighbor
  isolation, slow release.
- integration GATE (the required one): full-fleet reconnect right after restart
  throttles NO healthy device; a single device storming IS throttled; a neighbor
  stays unaffected while another storms.
- also fixes pre-existing test PORT collisions (my new integration files clashed
  with totp.test.js:3979 and totp-keyrotation.test.js:3980 -> moved to 3982/3983);
  full suite now green serially AND in parallel.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 19:18:00 -05:00

99 lines
4.3 KiB
JavaScript

'use strict';
// #142 step 3 — deterministic unit tests for the per-device reconnect throttle.
// Pure logic with injected `now` / band; isolate the DB before require (the module
// pulls in services/loop-lag -> db/database which initialises a DB on load).
const os = require('node:os');
const path = require('node:path');
const crypto = require('node:crypto');
// ORDER MATTERS: DATA_DIR has to be set before '../lib/reconnect-throttle' is
// required below, because (per the note above) requiring it initialises a DB.
// The random hex suffix gives each run its own directory name under the OS temp dir.
process.env.DATA_DIR = path.join(os.tmpdir(), 'st-thr-unit-' + crypto.randomBytes(4).toString('hex'));
const { test, beforeEach } = require('node:test');
const assert = require('node:assert/strict');
const throttle = require('../lib/reconnect-throttle');
// config defaults: window=10000, baseMax=5, hardCeiling=20, baseBackoff=1000,
// maxBackoff=60000, releaseMs=30000, warmup=30000, elevMult=2, critMult=4.
// The loop bounds and expected counts in the tests below are written against
// these defaults; if a default changes, the tests need revisiting.
const T0 = 1_000_000; // arbitrary epoch-ms origin for the warm-up clock
const POST = T0 + 40_000; // safely past the 30s warm-up
const WARM = T0 + 1_000; // inside the warm-up window
// Runs before every test with T0 as `startedAt`, so WARM and POST are measured
// from a known origin. Presumably this also clears per-device throttle state,
// so tests do not leak into each other — confirm against __resetForTest.
beforeEach(() => throttle.__resetForTest({ startedAt: T0 }));
// A device that makes exactly baseMax (5) genuine reconnects, 1 ms apart and
// after warm-up, stays within the per-window allowance: every attempt is admitted.
test('healthy device is never throttled (<= baseMax genuine reconnects)', () => {
  const attempts = 5;
  let made = 0;
  while (made < attempts) {
    const verdict = throttle.check('A', POST + made, 'normal');
    made += 1; // `made` is now the 1-based ordinal of the attempt just checked
    assert.ok(verdict.allow, `reconnect ${made} (<=baseMax) must be allowed`);
  }
});
// One device reconnecting faster than baseMax per window must be refused with
// reason 'rate', and each further attempt made while it is blocked must be
// handed a strictly longer retry delay.
test('a per-device storm IS throttled and the backoff GROWS (tighten fast)', () => {
  // Use up the allowance: five reconnects in the same window.
  for (let step = 0; step < 5; step += 1) {
    throttle.check('B', POST + step, 'normal');
  }

  // The sixth reconnect is the first one over the limit.
  const flagged = throttle.check('B', POST + 5, 'normal');
  assert.equal(flagged.allow, false);
  assert.equal(flagged.reason, 'rate');
  assert.equal(flagged.observed, 6);
  assert.equal(flagged.allowed, 5);

  // Read the first backoff, then hammer twice more (array elements are
  // evaluated left to right, so the call order is preserved).
  const [b1, b2, b3] = [
    flagged.retryAfterMs,
    throttle.check('B', POST + 6, 'normal').retryAfterMs,
    throttle.check('B', POST + 7, 'normal').retryAfterMs,
  ];
  assert.ok(b2 > b1 && b3 > b2, `backoff must grow: ${b1} < ${b2} < ${b3}`);
});
// Same reconnect pattern under two lag bands: the first backoff handed out under
// 'critical' must be longer than the first backoff handed out under 'normal'.
test('lag band multiplies an already-flagged device\'s backoff (critical > normal)', () => {
  // Makes five reconnects for the device, then returns the retry delay of the sixth.
  const firstBackoffFor = (deviceId, band) => {
    for (let step = 0; step < 5; step += 1) {
      throttle.check(deviceId, POST + step, band);
    }
    return throttle.check(deviceId, POST + 5, band).retryAfterMs;
  };

  const normalBackoff = firstBackoffFor('N', 'normal');
  // Reset between the two runs so the second measurement starts clean.
  throttle.__resetForTest({ startedAt: T0 });
  const criticalBackoff = firstBackoffFor('C', 'critical');

  assert.ok(criticalBackoff > normalBackoff, `critical backoff ${criticalBackoff} > normal ${normalBackoff}`);
});
// The lag band alone must never gate a device: five reconnects (within baseMax)
// are all admitted even though every check is made under the 'critical' band.
test('a healthy device is NOT throttled even when the band is critical (lag never gates the healthy)', () => {
  for (const offset of [0, 1, 2, 3, 4]) {
    const { allow } = throttle.check('H', POST + offset, 'critical');
    assert.ok(allow, 'healthy device stays allowed regardless of band');
  }
});
// Inside the warm-up window only the hard ceiling applies, so 12 reconnects
// (more than baseMax = 5, fewer than hardCeiling = 20) must all be admitted.
test('COLD START: during warm-up, moderate flapping (>baseMax, <ceiling) is NOT throttled', () => {
  const flaps = 12; // 12 > baseMax(5) but < hardCeiling(20)
  Array.from({ length: flaps }, (_, index) => index).forEach((index) => {
    // 'critical' is passed on purpose: the band is forced normal in warm-up anyway.
    const verdict = throttle.check('W', WARM + index, 'critical');
    assert.ok(verdict.allow, `warm-up reconnect ${index + 1} must be lenient`);
  });
});
// Warm-up leniency stops at the hard ceiling: reconnects 1..20 are admitted,
// and the 21st is refused with reason 'hard-ceiling'.
test('HARD CEILING is enforced even during warm-up (slow-ramp cannot train through)', () => {
  const ceiling = 20;
  for (let attempt = 1; attempt <= ceiling; attempt += 1) {
    const verdict = throttle.check('K', WARM + (attempt - 1), 'normal');
    assert.ok(verdict.allow, `warm-up reconnect ${attempt} (<=ceiling) allowed`);
  }

  // One past the ceiling, still inside the warm-up window.
  const overCeiling = throttle.check('K', WARM + ceiling, 'normal');
  assert.equal(overCeiling.allow, false);
  assert.equal(overCeiling.reason, 'hard-ceiling');
});
// Throttle state is keyed per device: while 'STORM' is being refused, a first
// reconnect from a different device id must still be admitted.
test('neighbor isolation: one device storming does not throttle another', () => {
  // Ten reconnects in one window is double baseMax (5), so STORM must be flagged.
  const stormVerdicts = [];
  for (let i = 0; i < 10; i++) stormVerdicts.push(throttle.check('STORM', POST + i, 'normal'));

  // Verify the precondition: without this, the test would pass vacuously if the
  // throttle never engaged and would prove nothing about isolation.
  assert.ok(
    stormVerdicts.some((verdict) => !verdict.allow),
    'precondition: the storming device must actually be throttled',
  );

  const v = throttle.check('NEIGHBOR', POST + 11, 'normal');
  assert.ok(v.allow, 'a different device must be unaffected');
});
// After a storm has raised a device's escalation level, one calm reconnect made
// well after both the rate window and releaseMs (30000) must be admitted and
// must report a lower level than the storm's peak.
test('release slow: escalation level decays after a calm period', () => {
  // Six rapid reconnects: the sixth is the one that gets flagged (level 1).
  const stormVerdicts = [];
  for (let step = 0; step < 6; step += 1) {
    stormVerdicts.push(throttle.check('R', POST + step, 'normal'));
  }
  const flagged = stormVerdicts[stormVerdicts.length - 1];
  assert.ok(flagged.level >= 1);
  const peak = flagged.level;

  // 40 s of silence after the storm, then a single reconnect.
  const calmAt = POST + 6 + 40_000;
  const calm = throttle.check('R', calmAt, 'normal');
  assert.ok(calm.allow, 'calm reconnect after the storm is allowed');
  assert.ok(calm.level < peak, `level decays after calm: ${calm.level} < ${peak}`);
});