screentinker/server/config.js
ScreenTinker dbac699854 fix(#143): content-ack flood control — per-device rate budget + loop-lag valve
#142's content-ack dedup is insufficient: a device cycling 2-4 content IDs makes
every ack look unique so dedup never fires, while aggregate volume from ~30 devices
saturates the event loop (the #142 reconnect throttle kept the server responsive,
which is how this was even observable).

Folded ONE control on the content-ack path (no competing limiters; reconnect-
throttle.js untouched) in lib/content-ack-limiter.js:
- Step 1 — per-device RATE budget: caps TOTAL non-duplicate acks per device per
  window regardless of differing content_id (the case dedup misses). Over budget =
  DROP silently (the per-ack log+emit is the cost); log ONCE per device per window
  when shedding starts. Keeps the #142 dedup (dedup'd repeats don't consume budget).
  Per-device, in-memory, resets on restart (modeled on lastPlayLogAt; does NOT reuse
  reconnect-throttle's ban-semantics bucket).
  Env (TUNING GUESSES, validate vs Bold's fleet): CONTENT_ACK_MAX_PER_WINDOW=20,
  CONTENT_ACK_RATE_WINDOW_MS=10000 (=2/s, above legit ~<=1/s, below the flood).
- Step 2 — global pressure valve: reuses the #142 loop-lag band (+ its hysteresis,
  no second control loop). Under CRITICAL band, shed content-acks even for an
  in-budget device; reconnects + dashboard/HTTP are ALWAYS processed; a healthy
  device in a non-critical band is never touched by the valve. Valve open/close
  logged once at the band edge in services/loop-lag.js (not per shed message).

Tests (unique ports 3985/3986, not the 3982/3983/3984 set):
- unit: the #143 regression (cycling ids evading dedup IS rate-limited), under/over
  budget, dedup still works + doesn't consume budget, valve sheds in-budget under
  critical while normal is untouched, rate precedence, window reset, per-device
  isolation.
- integration: socket flood is capped to budget with a single shed-start log;
  under-budget passes every ack; valve OPEN sheds content-acks while a reconnect +
  /api/status still succeed.
Full suite green serial AND parallel (208 tests).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-27 22:21:57 -05:00

163 lines
10 KiB
JavaScript

const path = require('path');
// Where persistent state lives. With nothing set in the environment every
// location falls back to the in-repo layout next to this file (the legacy
// server/db/remote_display.db, server/uploads/ and server/certs/ paths), so
// existing installs - production included - behave exactly as before.
// Set DATA_DIR to move all state at once (e.g. onto a mounted volume, as the
// Docker image does), or set an individual *_PATH / *_DIR variable to relocate
// just that one location.
const DATA_DIR = process.env.DATA_DIR || __dirname;

// An explicit override wins; otherwise the location is a child of DATA_DIR.
const underDataDir = (override, leaf) => override || path.join(DATA_DIR, leaf);

const uploadsDir = underDataDir(process.env.UPLOADS_DIR, 'uploads');
const certsDir = underDataDir(process.env.CERTS_DIR, 'certs');
module.exports = {
port: process.env.PORT || 3001,
httpsPort: process.env.HTTPS_PORT || 3443,
dataDir: DATA_DIR,
dbPath: process.env.DB_PATH || path.join(DATA_DIR, 'db', 'remote_display.db'),
uploadsDir,
contentDir: path.join(uploadsDir, 'content'),
screenshotsDir: path.join(uploadsDir, 'screenshots'),
certsDir,
frontendDir: path.join(__dirname, '..', 'frontend'),
// App-level heartbeat. Checker runs every heartbeatInterval and marks
// devices offline if last_heartbeat is older than heartbeatTimeout.
// Env override for self-hosters on slow/jittery networks (issue #3:
// reporter found raising HEARTBEAT_TIMEOUT to 60s reduced false offlines).
heartbeatInterval: parseInt(process.env.HEARTBEAT_INTERVAL) || 10000,
heartbeatTimeout: parseInt(process.env.HEARTBEAT_TIMEOUT) || 45000,
// How long the server holds commands/playlist-updates for a device that's
// offline at emit time (ms). On reconnect within this window, queued events
// are flushed in order. Past TTL they're dropped. See lib/command-queue.js.
commandQueueTtlMs: parseInt(process.env.COMMAND_QUEUE_TTL_MS) || 30000,
// Engine.IO transport-level ping/pong. Raised from Socket.IO defaults
// (25000/20000) because TV WebKits (LG webOS, older Tizen) miss pongs
// under decode load - tighter values cause spurious transport drops.
// Worst-case dead-socket detection: pingInterval + pingTimeout = 60s.
pingInterval: parseInt(process.env.PING_INTERVAL) || 30000,
pingTimeout: parseInt(process.env.PING_TIMEOUT) || 30000,
maxFileSize: 500 * 1024 * 1024, // 500MB
thumbnailWidth: 320,
screenshotQuality: 70,
// SSL: drop your Cloudflare Origin cert + key in certs/ folder
// or set env vars SSL_CERT and SSL_KEY to custom paths
sslCert: process.env.SSL_CERT || path.join(certsDir, 'cert.pem'),
sslKey: process.env.SSL_KEY || path.join(certsDir, 'key.pem'),
// Auth
jwtSecret: process.env.JWT_SECRET || (() => {
const secretFile = path.join(certsDir, '.jwt_secret');
const fs = require('fs');
if (fs.existsSync(secretFile)) return fs.readFileSync(secretFile, 'utf8').trim();
const secret = require('crypto').randomBytes(64).toString('hex');
try { fs.mkdirSync(path.dirname(secretFile), { recursive: true }); fs.writeFileSync(secretFile, secret); } catch {}
return secret;
})(),
jwtExpiry: '7d',
// Google OAuth - set these in env or here
googleClientId: process.env.GOOGLE_CLIENT_ID || '',
// Microsoft OAuth - set these in env or here
microsoftClientId: process.env.MICROSOFT_CLIENT_ID || '',
microsoftTenantId: process.env.MICROSOFT_TENANT_ID || 'common',
// Stripe (optional - for paid subscriptions)
stripeSecretKey: process.env.STRIPE_SECRET_KEY || '',
stripeWebhookSecret: process.env.STRIPE_WEBHOOK_SECRET || '',
// Microsoft Graph email sender (services/email.js). Required for actual
// delivery; absent values short-circuit to a stdout fallback for local dev.
graphTenantId: process.env.GRAPH_TENANT_ID || '',
graphClientId: process.env.GRAPH_CLIENT_ID || '',
graphClientSecret: process.env.GRAPH_CLIENT_SECRET || '',
graphSenderEmail: process.env.GRAPH_SENDER_EMAIL || '',
graphSenderName: process.env.GRAPH_SENDER_NAME || 'ScreenTinker',
// Dev safety net: comma-separated allow-list of recipient emails. When set,
// sends to any address NOT in the list are suppressed (logged but not posted
// to Graph). Intended for local dev that pulls fresh prod DB copies - keeps
// us from accidentally emailing real prod users. UNSET on prod systemd unit.
graphDevRestrictTo: process.env.GRAPH_DEV_RESTRICT_TO || '',
// Self-hosted mode: if true, first user gets enterprise plan and no billing
selfHosted: process.env.SELF_HOSTED === 'true',
// #116: opt-in UI gate. When true, hides the Subscription nav item + billing view
// and bounces #/billing to the dashboard. Default off, so existing deployments are
// unchanged. UI-only — /api/subscription/* stays in place (internal usage reads).
hideBilling: process.env.HIDE_BILLING === 'true',
// Disable public registration (OAuth auto-signup is also blocked when set).
// First-user setup is still allowed so a fresh install can be initialized.
disableRegistration: ['true', '1'].includes(String(process.env.DISABLE_REGISTRATION || '').toLowerCase()),
// Redirect / -> /app instead of serving the marketing landing page.
// For self-hosted internal deployments that don't want the public homepage.
disableHomepage: ['true', '1'].includes(String(process.env.DISABLE_HOMEPAGE || '').toLowerCase()),
// Issue #12: auto-create a personal org + Default workspace for self-service
// signups (public register + OAuth). Defaults TRUE so single-tenant and the
// hosted self-service flow are unaffected; set AUTO_CREATE_ORG_ON_SIGNUP=false
// on MSP-style deployments where an admin/operator assigns users to existing
// orgs after signup instead.
autoCreateOrgOnSignup: !['false', '0'].includes(String(process.env.AUTO_CREATE_ORG_ON_SIGNUP || '').toLowerCase()),
// #142 event-loop lag telemetry (services/loop-lag.js). perf_hooks
// monitorEventLoopDelay is C++-backed, so continuous sampling is cheap. Each
// window's p99 is persisted to event_loop_lag (bounded: indexed + pruned from
// day one) and drives the banded load level the reconnect throttle reads.
lagSampleIntervalMs: parseInt(process.env.LAG_SAMPLE_INTERVAL_MS) || 1000,
lagResolutionMs: parseInt(process.env.LAG_RESOLUTION_MS) || 20,
lagTelemetryRetentionDays: parseFloat(process.env.LAG_TELEMETRY_RETENTION_DAYS) || 3,
lagPruneIntervalMs: parseInt(process.env.LAG_PRUNE_INTERVAL_MS) || 3600000,
// Banded load levels from the window p99 (ms). Asymmetric by design: a band is
// entered immediately when its up-threshold is crossed (tighten fast), but
// released only one step at a time after lagReleaseSamples consecutive samples
// fall below a deadband (release slow), so small fluctuations don't flap it.
// Bands ONLY scale how hard an already-flagged device is throttled; a healthy
// device is never gated by global lag.
lagElevatedMs: parseInt(process.env.LAG_ELEVATED_MS) || 100,
lagCriticalMs: parseInt(process.env.LAG_CRITICAL_MS) || 250,
lagReleaseSamples: parseInt(process.env.LAG_RELEASE_SAMPLES) || 5,
// #142 load-aware per-device reconnect throttle (lib/reconnect-throttle.js).
// The verdict of WHO is misbehaving is ALWAYS per-device (keyed on device_id):
// a device is flagged only when it exceeds reconnectBaseMax genuine reconnects
// per reconnectWindowMs. Global lag never flags a healthy device — the lag band
// only MULTIPLIES how hard an already-flagged device is backed off.
reconnectWindowMs: parseInt(process.env.RECONNECT_WINDOW_MS) || 10000,
reconnectBaseMax: parseInt(process.env.RECONNECT_BASE_MAX) || 5,
// Absolute per-device ceiling, independent of band AND of warm-up: no device may
// exceed this many reconnects/window no matter what the adaptive logic computes,
// so a slow-ramp attacker can't train its way through.
reconnectHardCeiling: parseInt(process.env.RECONNECT_HARD_CEILING) || 20,
// Server-enforced backoff for a flagged device: baseBackoff * 2^(level-1) * band
// multiplier, capped at maxBackoff. Level escalates while it keeps storming
// (tighten fast) and decays one step per reconnectReleaseMs of calm (release slow).
reconnectBaseBackoffMs: parseInt(process.env.RECONNECT_BASE_BACKOFF_MS) || 1000,
reconnectMaxBackoffMs: parseInt(process.env.RECONNECT_MAX_BACKOFF_MS) || 60000,
reconnectMaxLevel: parseInt(process.env.RECONNECT_MAX_LEVEL) || 10,
reconnectReleaseMs: parseInt(process.env.RECONNECT_RELEASE_MS) || 30000,
// Cold start: for this long after process start, lag is high while the whole
// fleet reconnects at once. Treat leniently — force the 'normal' band and apply
// only the hard ceiling (no rate-band throttle) so a deploy can't throttle
// healthy screens. Throttle state is in-memory and resets on restart.
reconnectWarmupMs: parseInt(process.env.RECONNECT_WARMUP_MS) || 30000,
reconnectBandElevatedMult: parseFloat(process.env.RECONNECT_BAND_ELEVATED_MULT) || 2,
reconnectBandCriticalMult: parseFloat(process.env.RECONNECT_BAND_CRITICAL_MULT) || 4,
// #142 device_status_log retention. A GLOBAL scheduled sweep (pruneStatusLog in
// db/database.js, run on startup + the heartbeat interval) deletes rows older
// than this across ALL devices — covering what the per-device insert-time prune
// in deviceSocket.js misses: removed/idle devices that never insert again, and
// the heartbeat.js offline_timeout insert that bypasses logDeviceStatus. Default
// is LOWER than the old hardcoded 7 days (the reporter's bloat happened under 7d);
// 2-3 days is plenty for the dashboard's 24h uptime view + diagnostics.
statusLogRetentionDays: parseFloat(process.env.STATUS_LOG_RETENTION_DAYS) || 3,
// #142 content-ack dedup window (deviceSocket.js). A device (esp. older apps)
// can spam "content <id>: ready" for the same item; suppress identical
// (device_id, content_id, status) reports within this window. A status CHANGE
// has a different key and passes immediately. In-memory; resets on restart.
contentAckDedupMs: parseInt(process.env.CONTENT_ACK_DEDUP_MS) || 10000,
// #143 content-ack RATE budget (lib/content-ack-limiter.js), layered on top of the
// dedup above. Caps TOTAL acks per device per window REGARDLESS of differing
// content_id — the flood the dedup misses (a device cycling 2-4 ids makes every
// ack look unique, so dedup never fires, yet aggregate volume blocks the loop).
// TUNING GUESSES — validate against Bold's real fleet. Legit playlist cadence is
// roughly <=1 ack/s/device; the flood is many/s. 20 per 10s (=2/s) sits above
// legit and below the flood. Easy to retune via env.
contentAckMaxPerWindow: parseInt(process.env.CONTENT_ACK_MAX_PER_WINDOW) || 20,
contentAckRateWindowMs: parseInt(process.env.CONTENT_ACK_RATE_WINDOW_MS) || 10000,
};