screentinker/server/services/loop-lag.js
ScreenTinker ed3cf72b82 feat(#142): event-loop lag telemetry (perf_hooks) + bounded storage
Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
(C++-backed histogram; cheap). Each window persists mean/p50/p99/max to a new
event_loop_lag table and recomputes a coarse load band (normal/elevated/critical)
from the window p99. Standalone value: current lag is exposed on /api/status and
band changes are logged, so site lag is diagnosable independent of throttling.

The band feeds the #142 reconnect throttle (next commit) but ships first as its
own subsystem.

- event_loop_lag is bounded from day one: indexed on sampled_at + scheduled prune
  (LAG_TELEMETRY_RETENTION_DAYS, small default) modeled on the play_logs prune.
  Deliberately NOT another unbounded-growth table.
- Band transitions are asymmetric: jump up immediately (tighten fast), release one
  level at a time after N calm samples below a deadband (release slow, no flap).
  Pure nextBand() function, unit-tested deterministically.
- config: LAG_SAMPLE_INTERVAL_MS, LAG_RESOLUTION_MS, LAG_TELEMETRY_RETENTION_DAYS,
  LAG_PRUNE_INTERVAL_MS, LAG_ELEVATED_MS, LAG_CRITICAL_MS, LAG_RELEASE_SAMPLES.
- tests: band-transition unit tests; integration proves sampling persists, stays
  bounded under the prune, and surfaces on /api/status.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 19:01:08 -05:00

108 lines
4.7 KiB
JavaScript

// #142 — Event-loop lag telemetry (the data subsystem; ships before the throttle).
//
// Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
// (a C++-backed histogram — cheap). Each window we read mean/p50/p99/max, persist
// a row to the bounded `event_loop_lag` table, and recompute a coarse load BAND
// (normal | elevated | critical) from the window p99.
//
// The band is consumed by the reconnect throttle (#142 step 3), but this module
// has standalone value: getLag() is surfaced on /api/status and band changes are
// logged, so site connectivity/lag is diagnosable independent of any throttling.
//
// Band transitions are deliberately asymmetric (see nextBand): jump UP immediately
// when an up-threshold is crossed (tighten fast), step DOWN only one level at a
// time after lagReleaseSamples consecutive calm samples below a deadband (release
// slow). This avoids band flap from transient blips.
const { monitorEventLoopDelay } = require('perf_hooks');
const { db } = require('../db/database');
const config = require('../config');
const NS_PER_MS = 1e6;
// A band releases only once p99 falls below this fraction of the band's entry
// threshold — the deadband that stops small fluctuations from flapping the band.
const DEADBAND = 0.5;
const LEVEL = { normal: 0, elevated: 1, critical: 2 };
let histogram = null;
let band = 'normal';
let calmSamples = 0;
let current = { mean_ms: 0, p50_ms: 0, p99_ms: 0, max_ms: 0, band: 'normal', sampled_at: 0 };
// Pure band-transition function (exported for deterministic unit tests). Given the
// current band, the window p99 (ms), and the running calm-sample count, returns the
// next [band, calmSamples]. Up is immediate (may skip a level); down is one step
// per release window, gated by a deadband.
function nextBand(cur, p99, calm) {
const level = LEVEL[cur] ?? 0;
// UP — immediate, tighten fast (normal can jump straight to critical).
if (p99 >= config.lagCriticalMs && level < LEVEL.critical) return ['critical', 0];
if (p99 >= config.lagElevatedMs && level < LEVEL.elevated) return ['elevated', 0];
// DOWN — slow, one step, only below the current band's deadband.
if (level === LEVEL.critical && p99 <= config.lagCriticalMs * DEADBAND) {
const c = calm + 1;
return c >= config.lagReleaseSamples ? ['elevated', 0] : ['critical', c];
}
if (level === LEVEL.elevated && p99 <= config.lagElevatedMs * DEADBAND) {
const c = calm + 1;
return c >= config.lagReleaseSamples ? ['normal', 0] : ['elevated', c];
}
// Hold (inside deadband, or already normal): reset the calm counter.
return [cur, 0];
}
const round2 = (x) => Math.round(x * 100) / 100;
function sample() {
const p99 = histogram.percentile(99) / NS_PER_MS;
const snap = {
mean_ms: round2(histogram.mean / NS_PER_MS),
p50_ms: round2(histogram.percentile(50) / NS_PER_MS),
p99_ms: round2(p99),
max_ms: round2(histogram.max / NS_PER_MS),
};
histogram.reset();
const prev = band;
[band, calmSamples] = nextBand(band, snap.p99_ms, calmSamples);
current = { ...snap, band, sampled_at: Math.floor(Date.now() / 1000) };
try {
db.prepare(
'INSERT INTO event_loop_lag (sampled_at, mean_ms, p50_ms, p99_ms, max_ms, band) VALUES (?, ?, ?, ?, ?, ?)'
).run(current.sampled_at, snap.mean_ms, snap.p50_ms, snap.p99_ms, snap.max_ms, band);
} catch (_) { /* table may not exist on a partially-migrated DB */ }
// Observable: log whenever we're loaded or when the band changes (incl. back to
// normal). Healthy steady state stays quiet.
if (band !== 'normal' || prev !== 'normal') {
const tag = band !== prev ? ` (was ${prev})` : '';
console.log(`[loop-lag] band=${band}${tag} mean=${snap.mean_ms}ms p99=${snap.p99_ms}ms max=${snap.max_ms}ms`);
}
}
function pruneLag() {
try {
const cutoff = Math.floor(Date.now() / 1000) - Math.round(config.lagTelemetryRetentionDays * 86400);
const n = db.prepare('DELETE FROM event_loop_lag WHERE sampled_at < ?').run(cutoff).changes;
if (n > 0) console.log(`[loop-lag] pruned ${n} sample(s) older than ${config.lagTelemetryRetentionDays}d`);
} catch (_) { /* ignore */ }
}
function startLoopLagMonitor() {
if (histogram) return; // idempotent
histogram = monitorEventLoopDelay({ resolution: config.lagResolutionMs });
histogram.enable();
const t1 = setInterval(sample, config.lagSampleIntervalMs);
pruneLag(); // sweep stale rows on boot
const t2 = setInterval(pruneLag, config.lagPruneIntervalMs);
// Don't keep the process alive on these timers (matters for tests / clean exit).
if (t1.unref) t1.unref();
if (t2.unref) t2.unref();
}
function getBand() { return band; }
function getLag() { return { ...current }; }
module.exports = { startLoopLagMonitor, getBand, getLag, nextBand };