mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
Gates genuine reconnects PER DEVICE before the heavy register work (DB writes + playlist build) runs, so a single flapping device can no longer saturate the event loop and take down the server. - Actuator is per-device, keyed on device_id (modeled on lastPlayLogAt). A device is flagged only when it exceeds reconnectBaseMax genuine reconnects per window. Same-socket playlist refreshes (isPlaylistRefresh) are exempt. - Load-awareness is BANDED (normal/elevated/critical from the step-2 lag signal), not a continuous controller. The band only MULTIPLIES an already-flagged device's backoff; global lag never gates a healthy device. - Hysteresis: escalate immediately while storming (tighten fast); decay one level per reconnectReleaseMs of calm (release slow). - HARD CEILING per device, independent of band and warm-up — a slow-ramp attacker can't train through it. - COLD START: for reconnectWarmupMs after boot, force the normal band and apply only the hard ceiling, so a full-fleet reconnect after a deploy doesn't throttle healthy screens. State is in-memory, resets on restart. - Observability: every throttle engagement logs device, band, observed vs allowed rate, and backoff. Throttled device gets device:throttled + a deferred disconnect. Tests (api.test.js style): - unit: healthy-never-throttled, storm-throttled-with-growing-backoff, band multiplies backoff, hard-ceiling-even-in-warmup, warm-up leniency, neighbor isolation, slow release. - integration GATE (the required one): full-fleet reconnect right after restart throttles NO healthy device; a single device storming IS throttled; a neighbor stays unaffected while another storms. - also fixes pre-existing test PORT collisions (my new integration files clashed with totp.test.js:3979 and totp-keyrotation.test.js:3980 -> moved to 3982/3983); full suite now green serially AND in parallel. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
65 lines
2.9 KiB
JavaScript
65 lines
2.9 KiB
JavaScript
'use strict';
|
|
|
|
// #142 step 2 — integration: the lag monitor samples, persists to a BOUNDED table,
|
|
// and surfaces current lag on /api/status. Boots the real server with fast sampling
|
|
// and a tiny (fractional-day) retention so the prune is observable within the test.
|
|
|
|
const { test, before, after } = require('node:test');
|
|
const assert = require('node:assert/strict');
|
|
const { spawn } = require('node:child_process');
|
|
const path = require('node:path');
|
|
const os = require('node:os');
|
|
const fs = require('node:fs');
|
|
const crypto = require('node:crypto');
|
|
const Database = require('better-sqlite3');
|
|
|
|
// Port the spawned server is told to listen on (handed over through the PORT env var).
const PORT = 3982;

// Root URL that every fetch() in this file is built from.
const BASE = 'http://127.0.0.1:' + PORT;

// Randomly-suffixed scratch locations under the OS temp dir: the server's data
// directory and the file its stdout/stderr are redirected into.
const DATA_DIR = path.join(os.tmpdir(), `st-lag-int-${crypto.randomBytes(4).toString('hex')}`);
const LOG = path.join(os.tmpdir(), `st-lag-int-${crypto.randomBytes(4).toString('hex')}.log`);

// Handle to the spawned server process; assigned in before(), killed in after().
let proc;
|
|
|
|
// Suite setup: boot the real server as a child process with fast lag sampling,
// a fractional-day retention and a frequent prune, then poll /api/status until
// it answers OK. Throws (with the tail of the server log) if it never comes up.
before(async () => {
  const logFd = fs.openSync(LOG, 'w');
  try {
    proc = spawn('node', ['server.js'], {
      cwd: path.join(__dirname, '..'),
      env: {
        ...process.env, DATA_DIR, SELF_HOSTED: 'true', PORT: String(PORT), NODE_ENV: 'test',
        LAG_SAMPLE_INTERVAL_MS: '200',           // sample fast
        LAG_TELEMETRY_RETENTION_DAYS: '0.00001', // ~0.86s retention
        LAG_PRUNE_INTERVAL_MS: '400',            // prune often
      },
      stdio: ['ignore', logFd, logFd],
    });
  } finally {
    // The child holds its own copy of the descriptor once spawn() returns;
    // close the parent's copy so the test process does not leak it.
    fs.closeSync(logFd);
  }

  // If the server dies during boot there is no point polling for the full
  // 80 x 250ms budget — stop early and report the log.
  let exited = false;
  proc.once('exit', () => { exited = true; });

  let up = false;
  for (let i = 0; i < 80 && !exited; i++) {
    try {
      const r = await fetch(BASE + '/api/status');
      if (r.ok) { up = true; break; }
    } catch { /* not yet */ }
    await new Promise(r => setTimeout(r, 250));
  }
  if (!up) throw new Error('server did not boot:\n' + fs.readFileSync(LOG, 'utf8').slice(-2000));
});
|
|
|
|
// Suite teardown: force-kill the spawned server. Any failure is ignored on
// purpose — this is best-effort cleanup (proc is still unset if before()
// failed prior to spawning).
after(() => {
  try {
    proc.kill('SIGKILL');
  } catch {
    /* */
  }
});
|
|
|
|
// /api/status must carry a loop_lag object with a recognised band and numeric
// p99/mean readings.
test('/api/status exposes a current loop_lag snapshot', async () => {
  const VALID_BANDS = ['normal', 'elevated', 'critical'];

  const response = await fetch(BASE + '/api/status');
  const { loop_lag: lag } = await response.json();

  assert.ok(lag, 'loop_lag present on /api/status');
  assert.ok(VALID_BANDS.includes(lag.band), 'band is a valid level');
  assert.equal(typeof lag.p99_ms, 'number', 'p99_ms is numeric');
  assert.equal(typeof lag.mean_ms, 'number', 'mean_ms is numeric');
});
|
|
|
|
// Reads the server's SQLite file directly (read-only) after a sampling window
// and checks the event_loop_lag table has rows but stays below the number of
// rows the window alone would have produced without pruning.
test('lag samples are persisted AND bounded by retention prune (not unbounded)', async () => {
  // Let it sample for ~3s. At 200ms/sample that is ~15 inserts, but with ~0.86s
  // retention pruned every 400ms the table must stay small — proving the table
  // can never become a second unbounded-growth table.
  //
  // The wait has to cover at least 15 sample intervals: a shorter wait (e.g.
  // 1800ms, ~9 inserts) would let an UNPRUNED table pass the `< 15` bound and
  // the assertion would prove nothing. 3200ms gives 16 intervals in the wait
  // alone, on top of whatever was sampled while the server booted.
  const SAMPLE_WINDOW_MS = 3200;
  const UNPRUNED_ROW_BOUND = 15;

  await new Promise(r => setTimeout(r, SAMPLE_WINDOW_MS));

  const dbPath = path.join(DATA_DIR, 'db', 'remote_display.db');
  const db = new Database(dbPath, { readonly: true });
  let count;
  try {
    count = db.prepare('SELECT COUNT(*) c FROM event_loop_lag').get().c;
  } finally {
    // Release the handle even if the query throws (e.g. table missing).
    db.close();
  }

  assert.ok(count >= 1, 'lag samples are being persisted');
  assert.ok(count < UNPRUNED_ROW_BOUND, `table is bounded by the prune (held ${count} rows over ~3s of 200ms sampling)`);
});
|