screentinker/server/test/reconnect-throttle-integration.test.js
ScreenTinker 101f086204 fix(#142): load-aware per-device reconnect throttle (the outage fix)
Gates genuine reconnects PER DEVICE before the heavy register work (DB writes +
playlist build) runs, so a single flapping device can no longer saturate the
event loop and take down the server.

- Actuator is per-device, keyed on device_id (modeled on lastPlayLogAt). A device
  is flagged only when it exceeds reconnectBaseMax genuine reconnects per window.
  Same-socket playlist refreshes (isPlaylistRefresh) are exempt.
- Load-awareness is BANDED (normal/elevated/critical from the step-2 lag signal),
  not a continuous controller. The band only MULTIPLIES an already-flagged
  device's backoff; global lag never gates a healthy device.
- Hysteresis: escalate immediately while storming (tighten fast); decay one level
  per reconnectReleaseMs of calm (release slow).
- HARD CEILING per device, independent of band and warm-up — a slow-ramp attacker
  can't train through it.
- COLD START: for reconnectWarmupMs after boot, force the normal band and apply
  only the hard ceiling, so a full-fleet reconnect after a deploy doesn't throttle
  healthy screens. State is in-memory, resets on restart.
- Observability: every throttle engagement logs device, band, observed vs allowed
  rate, and backoff. Throttled device gets device:throttled + a deferred disconnect.

Tests (api.test.js style):
- unit: healthy-never-throttled, storm-throttled-with-growing-backoff, band
  multiplies backoff, hard-ceiling-even-in-warmup, warm-up leniency, neighbor
  isolation, slow release.
- integration GATE (the required one): full-fleet reconnect right after restart
  throttles NO healthy device; a single device storming IS throttled; a neighbor
  stays unaffected while another storms.
- also fixes pre-existing test PORT collisions (my new integration files clashed
  with totp.test.js:3979 and totp-keyrotation.test.js:3980 -> moved to 3982/3983);
  full suite now green serially AND in parallel.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 19:18:00 -05:00

114 lines
5.2 KiB
JavaScript

'use strict';
// #142 step 3 — REQUIRED GATE TEST + storm + neighbor, over real sockets.
//
// Boots the real server with warm-up ACTIVE (default) so the whole suite runs in
// the cold-start window — the exact "right after a deploy" scenario. Hard ceiling
// and window are tightened so the storm trips quickly without thousands of connects;
// fleet devices stay well under the ceiling.
const { test, before, after } = require('node:test');
const assert = require('node:assert/strict');
const { spawn } = require('node:child_process');
const path = require('node:path');
const os = require('node:os');
const fs = require('node:fs');
const crypto = require('node:crypto');
const ioClient = require('socket.io-client');
// Fixed port for this suite's private server instance; it has to stay distinct
// from the ports other test files bind so the suite can run in parallel.
const PORT = 3983;
const BASE = `http://127.0.0.1:${PORT}`;
// Per-run scratch locations under the OS temp dir. Each gets its own random
// 8-hex-char suffix so concurrent or repeated runs never share state.
const DATA_DIR = path.join(os.tmpdir(), `st-thr-int-${crypto.randomBytes(4).toString('hex')}`);
const LOG = path.join(os.tmpdir(), `st-thr-int-${crypto.randomBytes(4).toString('hex')}.log`);
// Handle to the spawned server process; assigned in before(), killed in after().
let proc;

/**
 * Boots the real server as a child process and waits until GET /api/status
 * answers OK (polled every 250 ms, at most 80 attempts).
 *
 * Throws 'server did not boot' with the tail of the server log when the server
 * never becomes ready, fails to spawn, or exits before becoming ready.
 */
before(async () => {
  const logFd = fs.openSync(LOG, 'w');
  try {
    // process.execPath runs the child on the same Node binary as the test runner
    // instead of whatever `node` happens to be first on PATH.
    proc = spawn(process.execPath, ['server.js'], {
      cwd: path.join(__dirname, '..'),
      env: {
        ...process.env, DATA_DIR, SELF_HOSTED: 'true', PORT: String(PORT), NODE_ENV: 'test',
        // warm-up left at default (30s) so the whole test runs in the cold-start window
        RECONNECT_HARD_CEILING: '8',
        RECONNECT_WINDOW_MS: '5000',
        RECONNECT_BASE_MAX: '3',
      },
      stdio: ['ignore', logFd, logFd],
    });
  } finally {
    // The child holds its own duplicate of the descriptor once spawn() returns;
    // close the parent's copy so it is not leaked for the life of the test run.
    fs.closeSync(logFd);
  }

  // Without a listener a spawn failure is emitted as an unhandled 'error' event
  // and crashes the runner; capture it and report it through the boot error.
  let spawnError = null;
  proc.once('error', (err) => { spawnError = err; });

  let up = false;
  for (let i = 0; i < 80; i++) {
    // Stop polling as soon as the child is known to be dead.
    if (spawnError !== null || proc.exitCode !== null || proc.signalCode !== null) break;
    try {
      const r = await fetch(BASE + '/api/status');
      if (r.ok) { up = true; break; }
    } catch { /* server not listening yet */ }
    await new Promise((r) => setTimeout(r, 250));
  }

  if (!up) {
    // Do not leave a half-started server behind when the hook fails.
    try { proc.kill('SIGKILL'); } catch { /* already gone */ }
    let tail = '';
    try { tail = fs.readFileSync(LOG, 'utf8').slice(-2000); } catch { /* log unreadable */ }
    throw new Error('server did not boot:\n' + tail, spawnError ? { cause: spawnError } : undefined);
  }
});
/**
 * Tears the suite down: kills the server child, waits (bounded) for it to exit,
 * then removes this run's scratch data directory and log file from the temp dir.
 * Every step is best-effort so teardown never masks a real test failure.
 */
after(async () => {
  if (proc) {
    await new Promise((resolve) => {
      // Bounded wait: never hang teardown if the exit event does not arrive.
      const timer = setTimeout(resolve, 2000);
      const settle = () => { clearTimeout(timer); resolve(); };
      proc.once('exit', settle);
      try {
        // kill() returns false when no signal could be delivered (already exited
        // or never started), in which case there is nothing to wait for.
        if (!proc.kill('SIGKILL')) settle();
      } catch {
        settle();
      }
    });
  }
  // Remove per-run artifacts only after the child is gone so nothing rewrites them.
  try { fs.rmSync(DATA_DIR, { recursive: true, force: true }); } catch { /* best effort */ }
  try { fs.rmSync(LOG, { force: true }); } catch { /* best effort */ }
});
/**
 * Provisions a brand-new device by registering with a unique 6-digit pairing code
 * over a fresh websocket on the /device namespace.
 *
 * @returns {Promise<{id: *, token: *} | null>} the `device_id` / `device_token`
 *   from the server's `device:registered` payload, or null when the connection
 *   fails or no registration arrives within 4 seconds.
 */
function provision() {
  const code = String(crypto.randomInt(100000, 1000000));
  return new Promise((resolve) => {
    const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true });
    let settled = false;
    // Single exit path: resolve once, cancel the timeout so it neither keeps the
    // event loop alive nor fires later, and release the socket.
    const settle = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      try { sock.close(); } catch { /* */ }
      resolve(result);
    };
    const timer = setTimeout(() => settle(null), 4000);
    sock.on('connect', () => sock.emit('device:register', { pairing_code: code }));
    sock.on('device:registered', (d) => settle({ id: d.device_id, token: d.device_token }));
    // reconnection is disabled, so a failed connect can never recover: report it
    // immediately instead of sitting out the full timeout.
    sock.on('connect_error', () => settle(null));
  });
}
/**
 * Performs one genuine reconnect for an already-provisioned device: opens a new
 * socket and re-registers with the device's id and token.
 *
 * @param {{id: *, token: *}} dev - device credentials as returned by provision().
 * @returns {Promise<{registered: boolean, throttled: boolean}>} `registered` when
 *   the server answers `device:registered`, `throttled` when it answers
 *   `device:throttled`; both false when the connection fails or neither event
 *   arrives within 1.5 seconds.
 */
function reconnect(dev) {
  return new Promise((resolve) => {
    const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true });
    let done = false;
    // First outcome wins; cancelling the timer stops dozens of stale 1.5 s timers
    // from piling up during the storm loops.
    const finish = (r) => {
      if (done) return;
      done = true;
      clearTimeout(timer);
      try { sock.close(); } catch { /* */ }
      resolve(r);
    };
    const timer = setTimeout(() => finish({ registered: false, throttled: false }), 1500);
    sock.on('connect', () => sock.emit('device:register', { device_id: dev.id, device_token: dev.token, device_info: { app_version: 'test' } }));
    sock.on('device:registered', () => finish({ registered: true, throttled: false }));
    sock.on('device:throttled', () => finish({ registered: false, throttled: true }));
    // reconnection is disabled, so a connect failure is final: same result as the
    // timeout, just without the wait.
    sock.on('connect_error', () => finish({ registered: false, throttled: false }));
  });
}
// REQUIRED GATE: a deploy-time herd of distinct devices must pass untouched.
test('GATE: full-fleet reconnect right after restart throttles NO healthy device', async () => {
  // Provision 12 distinct devices one after another.
  const fleet = [];
  while (fleet.length < 12) {
    const device = await provision();
    assert.ok(device, 'device provisioned');
    fleet.push(device);
  }

  // Two rounds; within a round every device reconnects concurrently, so each
  // device makes two reconnects in quick succession.
  const outcomes = [];
  for (const _round of [0, 1]) {
    const roundResults = await Promise.all(fleet.map((device) => reconnect(device)));
    outcomes.push(...roundResults);
  }

  const throttled = outcomes.filter((o) => o.throttled).length;
  const registered = outcomes.filter((o) => o.registered).length;
  assert.equal(throttled, 0, 'NO healthy fleet device may be throttled at cold start');
  assert.equal(registered, 24, 'every fleet reconnect registered');
});
// A lone device hammering reconnects must trip the throttle.
test('a single device storming IS throttled (backoff engages)', async () => {
  const dev = await provision();
  assert.ok(dev);

  // 12 back-to-back reconnects from the same device; the suite configures a
  // 5000 ms window and a hard ceiling of 8, so this run is meant to exceed it.
  const tally = { registered: 0, throttled: 0 };
  let remaining = 12;
  while (remaining > 0) {
    remaining -= 1;
    const outcome = await reconnect(dev);
    if (outcome.registered) tally.registered += 1;
    if (outcome.throttled) tally.throttled += 1;
  }

  assert.ok(tally.throttled >= 1, `storming device must be throttled (got ${tally.throttled} throttle(s))`);
  assert.ok(tally.registered < 12, `not all storm reconnects should succeed (got ${tally.registered}/12)`);
});
// Throttle state is keyed per device: one device's storm must not leak onto another.
test('neighbor isolation: a healthy device is unaffected while another storms', async () => {
  const stormer = await provision();
  const neighbor = await provision();
  assert.ok(stormer && neighbor);

  // Storm the stormer hard, counting throttle responses so the test can prove
  // the throttle really engaged.
  let stormThrottled = 0;
  for (let i = 0; i < 12; i++) {
    const r = await reconnect(stormer);
    if (r.throttled) stormThrottled++;
  }
  // Guard against a vacuous pass: if the storm never tripped the throttle, the
  // neighbor assertions below would hold trivially and prove nothing.
  assert.ok(stormThrottled >= 1, `stormer must actually be throttled for the isolation check to be meaningful (got ${stormThrottled})`);

  // neighbor reconnects normally a couple of times -> must still register
  const a = await reconnect(neighbor);
  const b = await reconnect(neighbor);
  assert.ok(a.registered && b.registered, 'neighbor must register normally while another device storms');
  assert.ok(!a.throttled && !b.throttled, 'neighbor must not be throttled by another device');
});