From e73428182da23e6ef31894e0edd0f16c69c30a72 Mon Sep 17 00:00:00 2001 From: ScreenTinker Date: Sat, 27 Jun 2026 22:56:48 -0500 Subject: [PATCH] =?UTF-8?q?fix(#143):=20fingerprint-reclaim=20stuck=20loop?= =?UTF-8?q?=20=E2=80=94=20reclaim=20by=20runtime=20liveness,=20throttle=20?= =?UTF-8?q?log?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bold beta1: three devices spam "Fingerprint reclaim rejected ... device active (status=offline, ~2500s since heartbeat, liveConn=false)" twice/~2s indefinitely — contradictory: gone by every signal yet treated as active. Root cause (NOT a missing clear — corrected the hypothesis). The reject condition was `liveConn || status==='online' || secondsSince < RECLAIM_GRACE_SECONDS(24h)`. For the observed devices liveConn=false and status=offline, so the ONLY true term is `secondsSince < 24h` — an effective 24h CALENDAR grace, not a stale flag. Audited the clears: liveConn (deviceConnections) is removed on the debounced disconnect (heartbeat.removeConnection) AND the offline_timeout sweep (deviceConnections.delete); status is set 'offline' on both. liveConn=false + status=offline PROVE the clears ran — there is nothing stale to clear. The 24h time gate (mislabeled "device active") blocked a legitimately-gone device from reclaiming for up to 24h, so it retried every ~2s forever-in-practice. The "twice per ~2s" is two reclaim ATTEMPTS per cycle (client reconnect + re-pair-on-auth-error), each hitting the single console.warn — not double-logging in one attempt. Fix: - Decide "still alive" from RUNTIME signals: `!!liveConn || secondsSince < reclaimSettleSeconds`. A device with no live socket and a heartbeat older than the settle window is gone -> reclaimable. A live (or just-seen) device is still rejected, so reclaim-abuse protection holds. NOT just ignoring "active" — it fixes WHY it was stuck (the 24h gate). RECLAIM_SETTLE_SECONDS default 300 (was 24h). SECURITY TRADEOFF flagged in config: shortens the anti-fingerprint-theft window; raise to re-tighten. Tuning guess to validate vs Bold. - Log throttle: the deferral logs at most once per device per RECLAIM_REJECT_LOG_ WINDOW_MS (default 60s) — collapses the double-log + the per-2s flood (same discipline as the content-ack shed log). Cleared when a reclaim proceeds. Recovery of the 3 wedged devices (2febcaa9, 1984694c, 139159eb): they SELF-HEAL on their next reclaim attempt (~2s) once this ships — their heartbeats are ~2500s stale (>300s settle) and liveConn=false, so the reclaim now succeeds. No operator SQL needed. Tests (port 3988): gone device reclaims; live device still rejected; clear-on-leave (disconnect clears liveConn -> stale device reclaims); deferral log <=1 per window. Full suite green serial+parallel (217). reconnect-throttle.js, the dbac699 content-ack limiter, and the 404c330 block/auth code untouched. Co-Authored-By: Claude Opus 4.8 (1M context) --- server/config.js | 15 +++ server/test/fingerprint-reclaim.test.js | 116 ++++++++++++++++++++++++ server/ws/deviceSocket.js | 29 ++++-- 3 files changed, 153 insertions(+), 7 deletions(-) create mode 100644 server/test/fingerprint-reclaim.test.js diff --git a/server/config.js b/server/config.js index 1c6d26d..feb2a4e 100644 --- a/server/config.js +++ b/server/config.js @@ -159,4 +159,19 @@ module.exports = { // legit and below the flood. Easy to retune via env. contentAckMaxPerWindow: parseInt(process.env.CONTENT_ACK_MAX_PER_WINDOW) || 20, contentAckRateWindowMs: parseInt(process.env.CONTENT_ACK_RATE_WINDOW_MS) || 10000, + + // #143 fingerprint-reclaim liveness. A reinstalled app (same fingerprint, no + // device_id, has pairing_code) may reclaim its old device's identity once that + // device is gone by RUNTIME signals: no live socket AND last heartbeat older than + // this settle window. Previously an effective 24h calendar grace treated a device + // merely offline <24h as "active", so a legitimately-gone device (liveConn=false, + // status=offline, stale heartbeat) could never reclaim and retried every ~2s, + // flooding logs (Bold beta1). Settle = the max reclaim wait; keep it comfortably + // above heartbeatTimeout (45s) so a brief blip isn't mistaken for "gone". + // SECURITY TRADEOFF: this also shortens the anti-(fingerprint-theft) window from + // 24h — raise it to re-tighten, at the cost of reinstall latency. Tuning guess. + reclaimSettleSeconds: parseInt(process.env.RECLAIM_SETTLE_SECONDS) || 300, + // #143 throttle the reclaim-deferred log to once per device per window, so a + // retrying/stuck device can't flood stdout (same discipline as the content-ack shed log). + reclaimRejectLogWindowMs: parseInt(process.env.RECLAIM_REJECT_LOG_WINDOW_MS) || 60000, }; diff --git a/server/test/fingerprint-reclaim.test.js b/server/test/fingerprint-reclaim.test.js new file mode 100644 index 0000000..6b22819 --- /dev/null +++ b/server/test/fingerprint-reclaim.test.js @@ -0,0 +1,116 @@ +'use strict'; + +// #143 — fingerprint-reclaim stuck loop. A device gone by every RUNTIME signal +// (no live socket + stale heartbeat) must be reclaimable; a genuinely-live device +// must still be rejected; the deferral log must not flood. Devices are seeded by +// direct SQLite (mimics the real DB state + avoids the disconnect-debounce window +// leaving a stale liveConn). Unique PORT 3988 (not 3982-3987). + +const { test, before, after } = require('node:test'); +const assert = require('node:assert/strict'); +const { spawn } = require('node:child_process'); +const path = require('node:path'); +const os = require('node:os'); +const fs = require('node:fs'); +const crypto = require('node:crypto'); +const ioClient = require('socket.io-client'); +const Database = require('better-sqlite3'); + +const PORT = 3988; +const BASE = `http://127.0.0.1:${PORT}`; +const DATA_DIR = path.join(os.tmpdir(), 'st-recl-' + crypto.randomBytes(4).toString('hex')); +const LOG = path.join(os.tmpdir(), 'st-recl-' + crypto.randomBytes(4).toString('hex') + '.log'); +const DB_PATH = path.join(DATA_DIR, 'db', 'remote_display.db'); +let proc, tdb; +const sleep = (ms) => new Promise(r => setTimeout(r, ms)); + +before(async () => { + const logFd = fs.openSync(LOG, 'w'); + proc = spawn('node', ['server.js'], { + cwd: path.join(__dirname, '..'), + env: { ...process.env, DATA_DIR, SELF_HOSTED: 'true', PORT: String(PORT), NODE_ENV: 'test', RECLAIM_SETTLE_SECONDS: '300', RECLAIM_REJECT_LOG_WINDOW_MS: '60000' }, + stdio: ['ignore', logFd, logFd], + }); + let up = false; + for (let i = 0; i < 80; i++) { try { const r = await fetch(BASE + '/api/status'); if (r.ok) { up = true; break; } } catch { /* */ } await sleep(250); } + if (!up) throw new Error('server did not boot:\n' + fs.readFileSync(LOG, 'utf8').slice(-2000)); + tdb = new Database(DB_PATH); tdb.pragma('busy_timeout = 3000'); tdb.pragma('foreign_keys = OFF'); +}); +after(() => { try { tdb && tdb.close(); } catch { /* */ } try { proc.kill('SIGKILL'); } catch { /* */ } }); + +// Seed a device + its fingerprint link directly (no socket -> no lingering liveConn). +function seedDevice(fp, { token, heartbeatAgo }) { + const id = crypto.randomUUID(); + tdb.prepare("INSERT INTO devices (id, status, last_heartbeat, device_token) VALUES (?, 'offline', strftime('%s','now') - ?, ?)").run(id, heartbeatAgo, token); + tdb.prepare('INSERT INTO device_fingerprints (fingerprint, device_id) VALUES (?, ?)').run(fp, id); + return { id, token }; +} +function staleHeartbeat(id, ago) { tdb.prepare("UPDATE devices SET last_heartbeat = strftime('%s','now') - ? WHERE id = ?").run(ago, id); } + +function attempt(payload) { // one-shot register; resolves and closes + return new Promise((resolve) => { + const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true }); + const got = { registered: false, newId: null, authError: false, errorMsg: null }; + const finish = () => { try { sock.close(); } catch { /* */ } resolve(got); }; + sock.on('connect', () => sock.emit('device:register', payload)); + sock.on('device:registered', (d) => { got.registered = true; got.newId = d.device_id; setTimeout(finish, 150); }); + sock.on('device:auth-error', (e) => { got.authError = true; got.errorMsg = e && e.error; finish(); }); + setTimeout(finish, 4000); + }); +} +function connectLive(payload) { // keeps the socket open (live connection); caller closes + return new Promise((resolve) => { + const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true }); + sock.on('connect', () => sock.emit('device:register', payload)); + sock.on('device:registered', () => resolve({ sock, registered: true })); + sock.on('device:auth-error', () => resolve({ sock, registered: false })); + setTimeout(() => resolve({ sock, registered: false }), 4000); + }); +} +const rnd = () => String(crypto.randomInt(100000, 1000000)); + +test('#143 repro: a gone device (no live conn + stale heartbeat) is reclaimable', async () => { + const fp = 'fp-gone-' + crypto.randomBytes(4).toString('hex'); + const dev = seedDevice(fp, { token: 'tok', heartbeatAgo: 99999 }); // ~27h stale, never connected + const r = await attempt({ pairing_code: rnd(), fingerprint: fp }); // no device_id -> reclaim path + assert.ok(r.registered, 'reclaim SUCCEEDS for a gone device'); + assert.equal(r.newId, dev.id, 'it reclaims the SAME device identity'); + assert.ok(!r.authError, 'no rejection'); +}); + +test('no regression: a genuinely live device REJECTS a fingerprint reclaim', async () => { + const fp = 'fp-live-' + crypto.randomBytes(4).toString('hex'); + const dev = seedDevice(fp, { token: 'tok2', heartbeatAgo: 10 }); + const live = await connectLive({ device_id: dev.id, device_token: 'tok2', device_info: {} }); + assert.ok(live.registered, 'device is live (has a connection)'); + const r = await attempt({ pairing_code: rnd(), fingerprint: fp }); + assert.ok(r.authError && !r.registered, 'reclaim of a LIVE device is rejected (abuse protection intact)'); + try { live.sock.close(); } catch { /* */ } +}); + +test('clear-on-leave: after disconnect, liveConn is cleared so a (stale) device reclaims', async () => { + const fp = 'fp-leave-' + crypto.randomBytes(4).toString('hex'); + const dev = seedDevice(fp, { token: 'tok3', heartbeatAgo: 99999 }); + const live = await connectLive({ device_id: dev.id, device_token: 'tok3', device_info: {} }); + assert.ok(live.registered); + // while live, reclaim is rejected (liveConn present) + let r = await attempt({ pairing_code: rnd(), fingerprint: fp }); + assert.ok(!r.registered, 'rejected while a live connection exists'); + // leave: close + wait past the 5s offline-debounce so removeConnection runs + try { live.sock.close(); } catch { /* */ } + await sleep(6000); + staleHeartbeat(dev.id, 99999); // the live register bumped last_heartbeat; re-stale it + r = await attempt({ pairing_code: rnd(), fingerprint: fp }); + assert.ok(r.registered, 'after disconnect cleared liveConn, the gone device reclaims'); +}); + +test('log noise: a retried reclaim logs at most once per device per window', async () => { + const fp = 'fp-log-' + crypto.randomBytes(4).toString('hex'); + const dev = seedDevice(fp, { token: 'tok4', heartbeatAgo: 5 }); // recent -> reclaim deferred + const live = await connectLive({ device_id: dev.id, device_token: 'tok4', device_info: {} }); + for (let i = 0; i < 4; i++) { const r = await attempt({ pairing_code: rnd(), fingerprint: fp }); assert.ok(r.authError, 'each retry is deferred'); } + try { live.sock.close(); } catch { /* */ } + await sleep(200); + const lines = fs.readFileSync(LOG, 'utf8').split('\n').filter(l => l.includes('reclaim deferred for ' + dev.id)).length; + assert.ok(lines <= 1, `at most one deferral log per window (got ${lines}); no double-log / per-2s flood`); +}); diff --git a/server/ws/deviceSocket.js b/server/ws/deviceSocket.js index 3a0ed94..191067d 100644 --- a/server/ws/deviceSocket.js +++ b/server/ws/deviceSocket.js @@ -34,6 +34,10 @@ const PLAY_LOG_MIN_GAP_MS = 2000; // #142 dedup + #143 per-device rate budget + global loop-lag valve for content-acks // all live in one control: lib/content-ack-limiter.js (required above as // contentAckLimiter). Kept out of this file so there is a single limiter on the path. + +// #143 fingerprint-reclaim deferral log throttle: deviceId -> last-logged ms, so a +// device retrying reclaim every ~2s logs at most once per reclaimRejectLogWindowMs. +const lastReclaimRejectLogAt = new Map(); const { getUserPlan, getUserDeviceCount } = require('../middleware/subscription'); // Phase 2.3: deviceRoom() resolves a device_id to its workspace room so // dashboardNs.emit can be scoped instead of broadcast platform-wide. @@ -294,20 +298,31 @@ module.exports = function setupDeviceSocket(io) { const oldDevice = db.prepare('SELECT * FROM devices WHERE id = ?').get(existing.device_id); if (oldDevice) { // Fingerprint reclaim guard: a leaked/duplicated fingerprint shouldn't be enough - // to take over a live device. Reject the reclaim if the device is currently - // online OR has been online within the last 24h — by then a real reinstall has - // had plenty of time to come back, but a credential thief is more likely caught. + // to take over a LIVE device. #143: decide "still alive" from RUNTIME signals — + // a live socket, OR a genuinely recent heartbeat (within the settle window). The + // old check used `secondsSince < 24h`, which treated a device merely offline <24h + // as "active": a legitimately-gone device (liveConn=false, status=offline, stale + // heartbeat) could never reclaim and retried every ~2s, flooding logs (Bold beta1 + // / 2febcaa9, 1984694c, 139159eb). NOT a missing clear — liveConn IS removed on + // disconnect + the offline-timeout sweep, and status IS set offline on both; the + // 24h TIME gate was the cause. A device gone by every runtime signal is reclaimable. const liveConn = heartbeat.getConnection(existing.device_id); - const RECLAIM_GRACE_SECONDS = 24 * 60 * 60; const lastBeat = oldDevice.last_heartbeat || 0; const secondsSince = Math.floor(Date.now() / 1000) - lastBeat; - if (liveConn || (oldDevice.status === 'online') || secondsSince < RECLAIM_GRACE_SECONDS) { - console.warn(`Fingerprint reclaim rejected for ${existing.device_id}: device active (status=${oldDevice.status}, ${secondsSince}s since last heartbeat, liveConn=${!!liveConn})`); + const stillAlive = !!liveConn || secondsSince < config.reclaimSettleSeconds; + if (stillAlive) { + // Log at most once per device per window so a retrying/stuck device can't flood stdout. + const nowMs = Date.now(); + if (nowMs - (lastReclaimRejectLogAt.get(existing.device_id) || 0) >= config.reclaimRejectLogWindowMs) { + lastReclaimRejectLogAt.set(existing.device_id, nowMs); + console.warn(`Fingerprint reclaim deferred for ${existing.device_id}: still settling (status=${oldDevice.status}, ${secondsSince}s since heartbeat, liveConn=${!!liveConn}); reclaimable after ${config.reclaimSettleSeconds}s offline`); + } socket.emit('device:auth-error', { - error: 'This display is currently active. If you reinstalled the app, the original device must be offline for 24 hours before its slot can be reclaimed.' + error: `This display was recently active. If you reinstalled the app, retry after it has been offline for ${config.reclaimSettleSeconds} seconds.` }); return; } + lastReclaimRejectLogAt.delete(existing.device_id); // reclaim proceeding — clear any deferral log state // Fingerprint matched — this is a reinstalled app reconnecting to its old device. // Issue a fresh token so the app can authenticate going forward.