mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
#142's content-ack dedup is insufficient: a device cycling 2-4 content IDs makes every ack look unique so dedup never fires, while aggregate volume from ~30 devices saturates the event loop (the #142 reconnect throttle kept the server responsive, which is how this was even observable). Folded ONE control on the content-ack path (no competing limiters; reconnect- throttle.js untouched) in lib/content-ack-limiter.js: - Step 1 — per-device RATE budget: caps TOTAL non-duplicate acks per device per window regardless of differing content_id (the case dedup misses). Over budget = DROP silently (the per-ack log+emit is the cost); log ONCE per device per window when shedding starts. Keeps the #142 dedup (dedup'd repeats don't consume budget). Per-device, in-memory, resets on restart (modeled on lastPlayLogAt; does NOT reuse reconnect-throttle's ban-semantics bucket). Env (TUNING GUESSES, validate vs Bold's fleet): CONTENT_ACK_MAX_PER_WINDOW=20, CONTENT_ACK_RATE_WINDOW_MS=10000 (=2/s, above legit ~<=1/s, below the flood). - Step 2 — global pressure valve: reuses the #142 loop-lag band (+ its hysteresis, no second control loop). Under CRITICAL band, shed content-acks even for an in-budget device; reconnects + dashboard/HTTP are ALWAYS processed; a healthy device in a non-critical band is never touched by the valve. Valve open/close logged once at the band edge in services/loop-lag.js (not per shed message). Tests (unique ports 3985/3986, not the 3982/3983/3984 set): - unit: the #143 regression (cycling ids evading dedup IS rate-limited), under/over budget, dedup still works + doesn't consume budget, valve sheds in-budget under critical while normal is untouched, rate precedence, window reset, per-device isolation. - integration: socket flood is capped to budget with a single shed-start log; under-budget passes every ack; valve OPEN sheds content-acks while a reconnect + /api/status still succeed. Full suite green serial AND parallel (208 tests). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
116 lines
5.3 KiB
JavaScript
116 lines
5.3 KiB
JavaScript
// #142 — Event-loop lag telemetry (the data subsystem; ships before the throttle).
|
|
//
|
|
// Continuously samples event-loop delay via perf_hooks.monitorEventLoopDelay()
|
|
// (a C++-backed histogram — cheap). Each window we read mean/p50/p99/max, persist
|
|
// a row to the bounded `event_loop_lag` table, and recompute a coarse load BAND
|
|
// (normal | elevated | critical) from the window p99.
|
|
//
|
|
// The band is consumed by the reconnect throttle (#142 step 3), but this module
|
|
// has standalone value: getLag() is surfaced on /api/status and band changes are
|
|
// logged, so site connectivity/lag is diagnosable independent of any throttling.
|
|
//
|
|
// Band transitions are deliberately asymmetric (see nextBand): jump UP immediately
|
|
// when an up-threshold is crossed (tighten fast), step DOWN only one level at a
|
|
// time after lagReleaseSamples consecutive calm samples below a deadband (release
|
|
// slow). This avoids band flap from transient blips.
|
|
|
|
const { monitorEventLoopDelay } = require('perf_hooks');
|
|
const { db } = require('../db/database');
|
|
const config = require('../config');
|
|
|
|
const NS_PER_MS = 1e6;
|
|
// A band releases only once p99 falls below this fraction of the band's entry
|
|
// threshold — the deadband that stops small fluctuations from flapping the band.
|
|
const DEADBAND = 0.5;
|
|
const LEVEL = { normal: 0, elevated: 1, critical: 2 };
|
|
|
|
let histogram = null;
|
|
let band = 'normal';
|
|
let calmSamples = 0;
|
|
let current = { mean_ms: 0, p50_ms: 0, p99_ms: 0, max_ms: 0, band: 'normal', sampled_at: 0 };
|
|
|
|
// Pure band-transition function (exported for deterministic unit tests). Given the
|
|
// current band, the window p99 (ms), and the running calm-sample count, returns the
|
|
// next [band, calmSamples]. Up is immediate (may skip a level); down is one step
|
|
// per release window, gated by a deadband.
|
|
function nextBand(cur, p99, calm) {
|
|
const level = LEVEL[cur] ?? 0;
|
|
// UP — immediate, tighten fast (normal can jump straight to critical).
|
|
if (p99 >= config.lagCriticalMs && level < LEVEL.critical) return ['critical', 0];
|
|
if (p99 >= config.lagElevatedMs && level < LEVEL.elevated) return ['elevated', 0];
|
|
// DOWN — slow, one step, only below the current band's deadband.
|
|
if (level === LEVEL.critical && p99 <= config.lagCriticalMs * DEADBAND) {
|
|
const c = calm + 1;
|
|
return c >= config.lagReleaseSamples ? ['elevated', 0] : ['critical', c];
|
|
}
|
|
if (level === LEVEL.elevated && p99 <= config.lagElevatedMs * DEADBAND) {
|
|
const c = calm + 1;
|
|
return c >= config.lagReleaseSamples ? ['normal', 0] : ['elevated', c];
|
|
}
|
|
// Hold (inside deadband, or already normal): reset the calm counter.
|
|
return [cur, 0];
|
|
}
|
|
|
|
const round2 = (x) => Math.round(x * 100) / 100;
|
|
|
|
function sample() {
|
|
const p99 = histogram.percentile(99) / NS_PER_MS;
|
|
const snap = {
|
|
mean_ms: round2(histogram.mean / NS_PER_MS),
|
|
p50_ms: round2(histogram.percentile(50) / NS_PER_MS),
|
|
p99_ms: round2(p99),
|
|
max_ms: round2(histogram.max / NS_PER_MS),
|
|
};
|
|
histogram.reset();
|
|
|
|
const prev = band;
|
|
[band, calmSamples] = nextBand(band, snap.p99_ms, calmSamples);
|
|
current = { ...snap, band, sampled_at: Math.floor(Date.now() / 1000) };
|
|
|
|
try {
|
|
db.prepare(
|
|
'INSERT INTO event_loop_lag (sampled_at, mean_ms, p50_ms, p99_ms, max_ms, band) VALUES (?, ?, ?, ?, ?, ?)'
|
|
).run(current.sampled_at, snap.mean_ms, snap.p50_ms, snap.p99_ms, snap.max_ms, band);
|
|
} catch (_) { /* table may not exist on a partially-migrated DB */ }
|
|
|
|
// Observable: log whenever we're loaded or when the band changes (incl. back to
|
|
// normal). Healthy steady state stays quiet.
|
|
if (band !== 'normal' || prev !== 'normal') {
|
|
const tag = band !== prev ? ` (was ${prev})` : '';
|
|
console.log(`[loop-lag] band=${band}${tag} mean=${snap.mean_ms}ms p99=${snap.p99_ms}ms max=${snap.max_ms}ms`);
|
|
}
|
|
|
|
// #143 global pressure valve — log ONLY the band edge (open/close), not per shed
|
|
// message. When critical, deviceSocket sheds non-essential acks (it reads getBand()).
|
|
if (band === 'critical' && prev !== 'critical') {
|
|
console.warn(`[shed] global valve OPEN — loop-lag critical (p99=${snap.p99_ms}ms); shedding non-essential device messages (content-acks). reconnects + dashboard still processed.`);
|
|
} else if (prev === 'critical' && band !== 'critical') {
|
|
console.log(`[shed] global valve CLOSED — loop-lag recovered (band=${band}, p99=${snap.p99_ms}ms)`);
|
|
}
|
|
}
|
|
|
|
function pruneLag() {
|
|
try {
|
|
const cutoff = Math.floor(Date.now() / 1000) - Math.round(config.lagTelemetryRetentionDays * 86400);
|
|
const n = db.prepare('DELETE FROM event_loop_lag WHERE sampled_at < ?').run(cutoff).changes;
|
|
if (n > 0) console.log(`[loop-lag] pruned ${n} sample(s) older than ${config.lagTelemetryRetentionDays}d`);
|
|
} catch (_) { /* ignore */ }
|
|
}
|
|
|
|
function startLoopLagMonitor() {
|
|
if (histogram) return; // idempotent
|
|
histogram = monitorEventLoopDelay({ resolution: config.lagResolutionMs });
|
|
histogram.enable();
|
|
const t1 = setInterval(sample, config.lagSampleIntervalMs);
|
|
pruneLag(); // sweep stale rows on boot
|
|
const t2 = setInterval(pruneLag, config.lagPruneIntervalMs);
|
|
// Don't keep the process alive on these timers (matters for tests / clean exit).
|
|
if (t1.unref) t1.unref();
|
|
if (t2.unref) t2.unref();
|
|
}
|
|
|
|
function getBand() { return band; }
|
|
function getLag() { return { ...current }; }
|
|
|
|
module.exports = { startLoopLagMonitor, getBand, getLag, nextBand };
|