screentinker/server/test/loop-lag-integration.test.js
ScreenTinker ed3cf72b82 feat(#142): event-loop lag telemetry (perf_hooks) + bounded storage
Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
(C++-backed histogram; cheap). Each window persists mean/p50/p99/max to a new
event_loop_lag table and recomputes a coarse load band (normal/elevated/critical)
from the window p99. Standalone value: current lag is exposed on /api/status and
band changes are logged, so site lag is diagnosable independent of throttling.

The band feeds the #142 reconnect throttle (next commit) but ships first as its
own subsystem.

- event_loop_lag is bounded from day one: indexed on sampled_at + scheduled prune
  (LAG_TELEMETRY_RETENTION_DAYS, small default) modeled on the play_logs prune.
  Deliberately NOT another unbounded-growth table.
- Band transitions are asymmetric: jump up immediately (tighten fast), release one
  level at a time after N calm samples below a deadband (release slow, no flap).
  Pure nextBand() function, unit-tested deterministically.
- config: LAG_SAMPLE_INTERVAL_MS, LAG_RESOLUTION_MS, LAG_TELEMETRY_RETENTION_DAYS,
  LAG_PRUNE_INTERVAL_MS, LAG_ELEVATED_MS, LAG_CRITICAL_MS, LAG_RELEASE_SAMPLES.
- tests: band-transition unit tests; integration proves sampling persists, stays
  bounded under the prune, and surfaces on /api/status.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 19:01:08 -05:00

65 lines
2.9 KiB
JavaScript

'use strict';
// #142 step 2 — integration: the lag monitor samples, persists to a BOUNDED table,
// and surfaces current lag on /api/status. Boots the real server with fast sampling
// and a tiny (fractional-day) retention so the prune is observable within the test.
const { test, before, after } = require('node:test');
const assert = require('node:assert/strict');
const { spawn } = require('node:child_process');
const path = require('node:path');
const os = require('node:os');
const fs = require('node:fs');
const crypto = require('node:crypto');
const Database = require('better-sqlite3');
// Builds a unique scratch path in the OS temp dir: st-lag-int-<8 hex chars><suffix>.
const scratchPath = (suffix = '') =>
  path.join(os.tmpdir(), `st-lag-int-${crypto.randomBytes(4).toString('hex')}${suffix}`);

// Port handed to the spawned server via its PORT env var, and the base URL the tests hit.
const PORT = 3979;
const BASE = `http://127.0.0.1:${PORT}`;

// Per-run scratch locations: the server's DATA_DIR and the file its stdout/stderr go to.
const DATA_DIR = scratchPath();
const LOG = scratchPath('.log');

// Child-process handle for the server; assigned in before(), killed in after().
let proc;
// Boots the real server as a child process and blocks until /api/status answers 2xx.
// The server runs with fast lag sampling and a sub-second retention so that the
// prune is observable within the test. Throws (with the tail of the server log)
// if the server never becomes reachable.
before(async () => {
  const logFd = fs.openSync(LOG, 'w');
  let spawnError = null;
  try {
    // process.execPath: run the server under the same Node binary that runs this
    // test instead of whatever `node` happens to resolve to on PATH.
    proc = spawn(process.execPath, ['server.js'], {
      cwd: path.join(__dirname, '..'),
      env: {
        ...process.env, DATA_DIR, SELF_HOSTED: 'true', PORT: String(PORT), NODE_ENV: 'test',
        LAG_SAMPLE_INTERVAL_MS: '200',            // sample fast
        LAG_TELEMETRY_RETENTION_DAYS: '0.00001',  // ~0.86s retention
        LAG_PRUNE_INTERVAL_MS: '400',             // prune often
      },
      stdio: ['ignore', logFd, logFd],
    });
  } finally {
    // The child received its own copy of the descriptor at spawn time; close the
    // parent's copy so it is not leaked for the lifetime of the test run.
    fs.closeSync(logFd);
  }
  // A failed spawn is reported asynchronously via 'error'; without a listener it
  // would surface as an unhandled error event instead of a readable test failure.
  proc.once('error', (err) => { spawnError = err; });

  let up = false;
  // Poll for up to 80 * 250ms = 20s.
  for (let i = 0; i < 80; i++) {
    // Fail fast: if the child could not start or has already exited, it will never come up.
    if (spawnError !== null || proc.exitCode !== null || proc.signalCode !== null) break;
    try {
      const r = await fetch(BASE + '/api/status');
      if (r.ok) { up = true; break; }
    } catch { /* not yet */ }
    await new Promise((resolve) => setTimeout(resolve, 250));
  }
  if (!up) {
    throw new Error(
      'server did not boot:\n' + fs.readFileSync(LOG, 'utf8').slice(-2000),
      spawnError !== null ? { cause: spawnError } : undefined,
    );
  }
});
// Tears down the spawned server and removes its per-run data directory.
// Cleanup is best-effort: a failure here must not turn a passing suite red.
// The server log file (LOG) is intentionally left in place for post-mortem.
after(async () => {
  // proc is undefined if spawning threw before the handle was assigned.
  if (proc && proc.exitCode === null && proc.signalCode === null) {
    const exited = new Promise((resolve) => proc.once('exit', resolve));
    try {
      // kill() returns false when the signal could not be delivered (e.g. the
      // child never started); only wait for 'exit' when it was actually sent.
      if (proc.kill('SIGKILL')) await exited;
    } catch { /* best-effort: the process may already be gone */ }
  }
  try {
    // Remove the data directory only after the child is gone, so it is not
    // deleted underneath a live process that is still writing to it.
    fs.rmSync(DATA_DIR, { recursive: true, force: true, maxRetries: 3 });
  } catch (err) {
    console.warn(`could not remove ${DATA_DIR}: ${err.message}`);
  }
});
// Verifies that GET /api/status carries a loop_lag object with a valid band
// name and numeric p99_ms / mean_ms fields.
test('/api/status exposes a current loop_lag snapshot', async () => {
  const r = await fetch(BASE + '/api/status');
  // Check the status first so an error response fails with a clear message
  // rather than a JSON parse error or a misleading "loop_lag present" failure.
  assert.ok(r.ok, `GET /api/status returned HTTP ${r.status}`);
  const body = await r.json();
  assert.ok(body.loop_lag, 'loop_lag present on /api/status');
  assert.ok(['normal', 'elevated', 'critical'].includes(body.loop_lag.band), 'band is a valid level');
  assert.equal(typeof body.loop_lag.p99_ms, 'number', 'p99_ms is numeric');
  assert.equal(typeof body.loop_lag.mean_ms, 'number', 'mean_ms is numeric');
});
// Verifies that samples reach the event_loop_lag table and that the retention
// prune keeps the row count below what unpruned sampling would accumulate.
test('lag samples are persisted AND bounded by retention prune (not unbounded)', async () => {
  // Let it sample for ~3s. At 200ms/sample that is ~15 inserts, but with ~0.86s
  // retention pruned every 400ms the table must stay small — proving the table
  // can never become a second unbounded-growth table.
  await new Promise((resolve) => setTimeout(resolve, 3000));
  const dbPath = path.join(DATA_DIR, 'db', 'remote_display.db');
  // Read-only handle: the test only inspects the file the server is writing to.
  const db = new Database(dbPath, { readonly: true });
  let count;
  try {
    count = db.prepare('SELECT COUNT(*) c FROM event_loop_lag').get().c;
  } finally {
    // Release the handle even when the query throws (e.g. the table is missing).
    db.close();
  }
  assert.ok(count >= 1, 'lag samples are being persisted');
  assert.ok(count < 15, `table is bounded by the prune (held ${count} rows over ~3s of 200ms sampling)`);
});