mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay() (C++-backed histogram; cheap). Each window persists mean/p50/p99/max to a new event_loop_lag table and recomputes a coarse load band (normal/elevated/critical) from the window p99. Standalone value: current lag is exposed on /api/status and band changes are logged, so site lag is diagnosable independent of throttling. The band feeds the #142 reconnect throttle (next commit) but ships first as its own subsystem. - event_loop_lag is bounded from day one: indexed on sampled_at + scheduled prune (LAG_TELEMETRY_RETENTION_DAYS, small default) modeled on the play_logs prune. Deliberately NOT another unbounded-growth table. - Band transitions are asymmetric: jump up immediately (tighten fast), release one level at a time after N calm samples below a deadband (release slow, no flap). Pure nextBand() function, unit-tested deterministically. - config: LAG_SAMPLE_INTERVAL_MS, LAG_RESOLUTION_MS, LAG_TELEMETRY_RETENTION_DAYS, LAG_PRUNE_INTERVAL_MS, LAG_ELEVATED_MS, LAG_CRITICAL_MS, LAG_RELEASE_SAMPLES. - tests: band-transition unit tests; integration proves sampling persists, stays bounded under the prune, and surfaces on /api/status. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
108 lines
4.7 KiB
JavaScript
108 lines
4.7 KiB
JavaScript
// #142 — Event-loop lag telemetry (the data subsystem; ships before the throttle).
|
|
//
|
|
// Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
|
|
// (a C++-backed histogram — cheap). Each window we read mean/p50/p99/max, persist
|
|
// a row to the bounded `event_loop_lag` table, and recompute a coarse load BAND
|
|
// (normal | elevated | critical) from the window p99.
|
|
//
|
|
// The band is consumed by the reconnect throttle (#142 step 3), but this module
|
|
// has standalone value: getLag() is surfaced on /api/status and band changes are
|
|
// logged, so site connectivity/lag is diagnosable independent of any throttling.
|
|
//
|
|
// Band transitions are deliberately asymmetric (see nextBand): jump UP immediately
|
|
// when an up-threshold is crossed (tighten fast), step DOWN only one level at a
|
|
// time after lagReleaseSamples consecutive calm samples below a deadband (release
|
|
// slow). This avoids band flap from transient blips.
|
|
|
|
const { monitorEventLoopDelay } = require('perf_hooks');
|
|
const { db } = require('../db/database');
|
|
const config = require('../config');
|
|
|
|
const NS_PER_MS = 1e6;
|
|
// A band releases only once p99 falls below this fraction of the band's entry
|
|
// threshold — the deadband that stops small fluctuations from flapping the band.
|
|
const DEADBAND = 0.5;
|
|
const LEVEL = { normal: 0, elevated: 1, critical: 2 };
|
|
|
|
let histogram = null;
|
|
let band = 'normal';
|
|
let calmSamples = 0;
|
|
let current = { mean_ms: 0, p50_ms: 0, p99_ms: 0, max_ms: 0, band: 'normal', sampled_at: 0 };
|
|
|
|
// Pure band-transition function (exported for deterministic unit tests). Given the
|
|
// current band, the window p99 (ms), and the running calm-sample count, returns the
|
|
// next [band, calmSamples]. Up is immediate (may skip a level); down is one step
|
|
// per release window, gated by a deadband.
|
|
function nextBand(cur, p99, calm) {
|
|
const level = LEVEL[cur] ?? 0;
|
|
// UP — immediate, tighten fast (normal can jump straight to critical).
|
|
if (p99 >= config.lagCriticalMs && level < LEVEL.critical) return ['critical', 0];
|
|
if (p99 >= config.lagElevatedMs && level < LEVEL.elevated) return ['elevated', 0];
|
|
// DOWN — slow, one step, only below the current band's deadband.
|
|
if (level === LEVEL.critical && p99 <= config.lagCriticalMs * DEADBAND) {
|
|
const c = calm + 1;
|
|
return c >= config.lagReleaseSamples ? ['elevated', 0] : ['critical', c];
|
|
}
|
|
if (level === LEVEL.elevated && p99 <= config.lagElevatedMs * DEADBAND) {
|
|
const c = calm + 1;
|
|
return c >= config.lagReleaseSamples ? ['normal', 0] : ['elevated', c];
|
|
}
|
|
// Hold (inside deadband, or already normal): reset the calm counter.
|
|
return [cur, 0];
|
|
}
|
|
|
|
const round2 = (x) => Math.round(x * 100) / 100;
|
|
|
|
function sample() {
|
|
const p99 = histogram.percentile(99) / NS_PER_MS;
|
|
const snap = {
|
|
mean_ms: round2(histogram.mean / NS_PER_MS),
|
|
p50_ms: round2(histogram.percentile(50) / NS_PER_MS),
|
|
p99_ms: round2(p99),
|
|
max_ms: round2(histogram.max / NS_PER_MS),
|
|
};
|
|
histogram.reset();
|
|
|
|
const prev = band;
|
|
[band, calmSamples] = nextBand(band, snap.p99_ms, calmSamples);
|
|
current = { ...snap, band, sampled_at: Math.floor(Date.now() / 1000) };
|
|
|
|
try {
|
|
db.prepare(
|
|
'INSERT INTO event_loop_lag (sampled_at, mean_ms, p50_ms, p99_ms, max_ms, band) VALUES (?, ?, ?, ?, ?, ?)'
|
|
).run(current.sampled_at, snap.mean_ms, snap.p50_ms, snap.p99_ms, snap.max_ms, band);
|
|
} catch (_) { /* table may not exist on a partially-migrated DB */ }
|
|
|
|
// Observable: log whenever we're loaded or when the band changes (incl. back to
|
|
// normal). Healthy steady state stays quiet.
|
|
if (band !== 'normal' || prev !== 'normal') {
|
|
const tag = band !== prev ? ` (was ${prev})` : '';
|
|
console.log(`[loop-lag] band=${band}${tag} mean=${snap.mean_ms}ms p99=${snap.p99_ms}ms max=${snap.max_ms}ms`);
|
|
}
|
|
}
|
|
|
|
function pruneLag() {
|
|
try {
|
|
const cutoff = Math.floor(Date.now() / 1000) - Math.round(config.lagTelemetryRetentionDays * 86400);
|
|
const n = db.prepare('DELETE FROM event_loop_lag WHERE sampled_at < ?').run(cutoff).changes;
|
|
if (n > 0) console.log(`[loop-lag] pruned ${n} sample(s) older than ${config.lagTelemetryRetentionDays}d`);
|
|
} catch (_) { /* ignore */ }
|
|
}
|
|
|
|
function startLoopLagMonitor() {
|
|
if (histogram) return; // idempotent
|
|
histogram = monitorEventLoopDelay({ resolution: config.lagResolutionMs });
|
|
histogram.enable();
|
|
const t1 = setInterval(sample, config.lagSampleIntervalMs);
|
|
pruneLag(); // sweep stale rows on boot
|
|
const t2 = setInterval(pruneLag, config.lagPruneIntervalMs);
|
|
// Don't keep the process alive on these timers (matters for tests / clean exit).
|
|
if (t1.unref) t1.unref();
|
|
if (t2.unref) t2.unref();
|
|
}
|
|
|
|
function getBand() { return band; }
|
|
function getLag() { return { ...current }; }
|
|
|
|
module.exports = { startLoopLagMonitor, getBand, getLag, nextBand };
|