mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
fix(#143): fingerprint-reclaim stuck loop — reclaim by runtime liveness, throttle log
Bold beta1: three devices spam "Fingerprint reclaim rejected ... device active (status=offline, ~2500s since heartbeat, liveConn=false)" twice/~2s indefinitely — contradictory: gone by every signal yet treated as active. Root cause (NOT a missing clear — corrected the hypothesis). The reject condition was `liveConn || status==='online' || secondsSince < RECLAIM_GRACE_SECONDS(24h)`. For the observed devices liveConn=false and status=offline, so the ONLY true term is `secondsSince < 24h` — an effective 24h CALENDAR grace, not a stale flag. Audited the clears: liveConn (deviceConnections) is removed on the debounced disconnect (heartbeat.removeConnection) AND the offline_timeout sweep (deviceConnections.delete); status is set 'offline' on both. liveConn=false + status=offline PROVE the clears ran — there is nothing stale to clear. The 24h time gate (mislabeled "device active") blocked a legitimately-gone device from reclaiming for up to 24h, so it retried every ~2s forever-in-practice. The "twice per ~2s" is two reclaim ATTEMPTS per cycle (client reconnect + re-pair-on-auth-error), each hitting the single console.warn — not double-logging in one attempt. Fix: - Decide "still alive" from RUNTIME signals: `!!liveConn || secondsSince < reclaimSettleSeconds`. A device with no live socket and a heartbeat older than the settle window is gone -> reclaimable. A live (or just-seen) device is still rejected, so reclaim-abuse protection holds. NOT just ignoring "active" — it fixes WHY it was stuck (the 24h gate). RECLAIM_SETTLE_SECONDS default 300 (was 24h). SECURITY TRADEOFF flagged in config: shortens the anti-fingerprint-theft window; raise to re-tighten. Tuning guess to validate vs Bold. - Log throttle: the deferral logs at most once per device per RECLAIM_REJECT_LOG_ WINDOW_MS (default 60s) — collapses the double-log + the per-2s flood (same discipline as the content-ack shed log). Cleared when a reclaim proceeds. Recovery of the 3 wedged devices (2febcaa9, 1984694c, 139159eb): they SELF-HEAL on their next reclaim attempt (~2s) once this ships — their heartbeats are ~2500s stale (>300s settle) and liveConn=false, so the reclaim now succeeds. No operator SQL needed. Tests (port 3988): gone device reclaims; live device still rejected; clear-on-leave (disconnect clears liveConn -> stale device reclaims); deferral log <=1 per window. Full suite green serial+parallel (217). reconnect-throttle.js, thedbac699content-ack limiter, and the404c330block/auth code untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
404c3301dd
commit
e73428182d
|
|
@ -159,4 +159,19 @@ module.exports = {
|
||||||
// legit and below the flood. Easy to retune via env.
|
// legit and below the flood. Easy to retune via env.
|
||||||
contentAckMaxPerWindow: parseInt(process.env.CONTENT_ACK_MAX_PER_WINDOW) || 20,
|
contentAckMaxPerWindow: parseInt(process.env.CONTENT_ACK_MAX_PER_WINDOW) || 20,
|
||||||
contentAckRateWindowMs: parseInt(process.env.CONTENT_ACK_RATE_WINDOW_MS) || 10000,
|
contentAckRateWindowMs: parseInt(process.env.CONTENT_ACK_RATE_WINDOW_MS) || 10000,
|
||||||
|
|
||||||
|
// #143 fingerprint-reclaim liveness. A reinstalled app (same fingerprint, no
|
||||||
|
// device_id, has pairing_code) may reclaim its old device's identity once that
|
||||||
|
// device is gone by RUNTIME signals: no live socket AND last heartbeat older than
|
||||||
|
// this settle window. Previously an effective 24h calendar grace treated a device
|
||||||
|
// merely offline <24h as "active", so a legitimately-gone device (liveConn=false,
|
||||||
|
// status=offline, stale heartbeat) could never reclaim and retried every ~2s,
|
||||||
|
// flooding logs (Bold beta1). Settle = the max reclaim wait; keep it comfortably
|
||||||
|
// above heartbeatTimeout (45s) so a brief blip isn't mistaken for "gone".
|
||||||
|
// SECURITY TRADEOFF: this also shortens the anti-(fingerprint-theft) window from
|
||||||
|
// 24h — raise it to re-tighten, at the cost of reinstall latency. Tuning guess.
|
||||||
|
reclaimSettleSeconds: parseInt(process.env.RECLAIM_SETTLE_SECONDS) || 300,
|
||||||
|
// #143 throttle the reclaim-deferred log to once per device per window, so a
|
||||||
|
// retrying/stuck device can't flood stdout (same discipline as the content-ack shed log).
|
||||||
|
reclaimRejectLogWindowMs: parseInt(process.env.RECLAIM_REJECT_LOG_WINDOW_MS) || 60000,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
116
server/test/fingerprint-reclaim.test.js
Normal file
116
server/test/fingerprint-reclaim.test.js
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
// #143 — fingerprint-reclaim stuck loop. A device gone by every RUNTIME signal
|
||||||
|
// (no live socket + stale heartbeat) must be reclaimable; a genuinely-live device
|
||||||
|
// must still be rejected; the deferral log must not flood. Devices are seeded by
|
||||||
|
// direct SQLite (mimics the real DB state + avoids the disconnect-debounce window
|
||||||
|
// leaving a stale liveConn). Unique PORT 3988 (not 3982-3987).
|
||||||
|
|
||||||
|
const { test, before, after } = require('node:test');
|
||||||
|
const assert = require('node:assert/strict');
|
||||||
|
const { spawn } = require('node:child_process');
|
||||||
|
const path = require('node:path');
|
||||||
|
const os = require('node:os');
|
||||||
|
const fs = require('node:fs');
|
||||||
|
const crypto = require('node:crypto');
|
||||||
|
const ioClient = require('socket.io-client');
|
||||||
|
const Database = require('better-sqlite3');
|
||||||
|
|
||||||
|
const PORT = 3988;
|
||||||
|
const BASE = `http://127.0.0.1:${PORT}`;
|
||||||
|
const DATA_DIR = path.join(os.tmpdir(), 'st-recl-' + crypto.randomBytes(4).toString('hex'));
|
||||||
|
const LOG = path.join(os.tmpdir(), 'st-recl-' + crypto.randomBytes(4).toString('hex') + '.log');
|
||||||
|
const DB_PATH = path.join(DATA_DIR, 'db', 'remote_display.db');
|
||||||
|
let proc, tdb;
|
||||||
|
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||||
|
|
||||||
|
before(async () => {
|
||||||
|
const logFd = fs.openSync(LOG, 'w');
|
||||||
|
proc = spawn('node', ['server.js'], {
|
||||||
|
cwd: path.join(__dirname, '..'),
|
||||||
|
env: { ...process.env, DATA_DIR, SELF_HOSTED: 'true', PORT: String(PORT), NODE_ENV: 'test', RECLAIM_SETTLE_SECONDS: '300', RECLAIM_REJECT_LOG_WINDOW_MS: '60000' },
|
||||||
|
stdio: ['ignore', logFd, logFd],
|
||||||
|
});
|
||||||
|
let up = false;
|
||||||
|
for (let i = 0; i < 80; i++) { try { const r = await fetch(BASE + '/api/status'); if (r.ok) { up = true; break; } } catch { /* */ } await sleep(250); }
|
||||||
|
if (!up) throw new Error('server did not boot:\n' + fs.readFileSync(LOG, 'utf8').slice(-2000));
|
||||||
|
tdb = new Database(DB_PATH); tdb.pragma('busy_timeout = 3000'); tdb.pragma('foreign_keys = OFF');
|
||||||
|
});
|
||||||
|
after(() => { try { tdb && tdb.close(); } catch { /* */ } try { proc.kill('SIGKILL'); } catch { /* */ } });
|
||||||
|
|
||||||
|
// Seed a device + its fingerprint link directly (no socket -> no lingering liveConn).
|
||||||
|
function seedDevice(fp, { token, heartbeatAgo }) {
|
||||||
|
const id = crypto.randomUUID();
|
||||||
|
tdb.prepare("INSERT INTO devices (id, status, last_heartbeat, device_token) VALUES (?, 'offline', strftime('%s','now') - ?, ?)").run(id, heartbeatAgo, token);
|
||||||
|
tdb.prepare('INSERT INTO device_fingerprints (fingerprint, device_id) VALUES (?, ?)').run(fp, id);
|
||||||
|
return { id, token };
|
||||||
|
}
|
||||||
|
function staleHeartbeat(id, ago) { tdb.prepare("UPDATE devices SET last_heartbeat = strftime('%s','now') - ? WHERE id = ?").run(ago, id); }
|
||||||
|
|
||||||
|
function attempt(payload) { // one-shot register; resolves and closes
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true });
|
||||||
|
const got = { registered: false, newId: null, authError: false, errorMsg: null };
|
||||||
|
const finish = () => { try { sock.close(); } catch { /* */ } resolve(got); };
|
||||||
|
sock.on('connect', () => sock.emit('device:register', payload));
|
||||||
|
sock.on('device:registered', (d) => { got.registered = true; got.newId = d.device_id; setTimeout(finish, 150); });
|
||||||
|
sock.on('device:auth-error', (e) => { got.authError = true; got.errorMsg = e && e.error; finish(); });
|
||||||
|
setTimeout(finish, 4000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function connectLive(payload) { // keeps the socket open (live connection); caller closes
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
const sock = ioClient(`${BASE}/device`, { transports: ['websocket'], reconnection: false, forceNew: true });
|
||||||
|
sock.on('connect', () => sock.emit('device:register', payload));
|
||||||
|
sock.on('device:registered', () => resolve({ sock, registered: true }));
|
||||||
|
sock.on('device:auth-error', () => resolve({ sock, registered: false }));
|
||||||
|
setTimeout(() => resolve({ sock, registered: false }), 4000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
const rnd = () => String(crypto.randomInt(100000, 1000000));
|
||||||
|
|
||||||
|
test('#143 repro: a gone device (no live conn + stale heartbeat) is reclaimable', async () => {
|
||||||
|
const fp = 'fp-gone-' + crypto.randomBytes(4).toString('hex');
|
||||||
|
const dev = seedDevice(fp, { token: 'tok', heartbeatAgo: 99999 }); // ~27h stale, never connected
|
||||||
|
const r = await attempt({ pairing_code: rnd(), fingerprint: fp }); // no device_id -> reclaim path
|
||||||
|
assert.ok(r.registered, 'reclaim SUCCEEDS for a gone device');
|
||||||
|
assert.equal(r.newId, dev.id, 'it reclaims the SAME device identity');
|
||||||
|
assert.ok(!r.authError, 'no rejection');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('no regression: a genuinely live device REJECTS a fingerprint reclaim', async () => {
|
||||||
|
const fp = 'fp-live-' + crypto.randomBytes(4).toString('hex');
|
||||||
|
const dev = seedDevice(fp, { token: 'tok2', heartbeatAgo: 10 });
|
||||||
|
const live = await connectLive({ device_id: dev.id, device_token: 'tok2', device_info: {} });
|
||||||
|
assert.ok(live.registered, 'device is live (has a connection)');
|
||||||
|
const r = await attempt({ pairing_code: rnd(), fingerprint: fp });
|
||||||
|
assert.ok(r.authError && !r.registered, 'reclaim of a LIVE device is rejected (abuse protection intact)');
|
||||||
|
try { live.sock.close(); } catch { /* */ }
|
||||||
|
});
|
||||||
|
|
||||||
|
test('clear-on-leave: after disconnect, liveConn is cleared so a (stale) device reclaims', async () => {
|
||||||
|
const fp = 'fp-leave-' + crypto.randomBytes(4).toString('hex');
|
||||||
|
const dev = seedDevice(fp, { token: 'tok3', heartbeatAgo: 99999 });
|
||||||
|
const live = await connectLive({ device_id: dev.id, device_token: 'tok3', device_info: {} });
|
||||||
|
assert.ok(live.registered);
|
||||||
|
// while live, reclaim is rejected (liveConn present)
|
||||||
|
let r = await attempt({ pairing_code: rnd(), fingerprint: fp });
|
||||||
|
assert.ok(!r.registered, 'rejected while a live connection exists');
|
||||||
|
// leave: close + wait past the 5s offline-debounce so removeConnection runs
|
||||||
|
try { live.sock.close(); } catch { /* */ }
|
||||||
|
await sleep(6000);
|
||||||
|
staleHeartbeat(dev.id, 99999); // the live register bumped last_heartbeat; re-stale it
|
||||||
|
r = await attempt({ pairing_code: rnd(), fingerprint: fp });
|
||||||
|
assert.ok(r.registered, 'after disconnect cleared liveConn, the gone device reclaims');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('log noise: a retried reclaim logs at most once per device per window', async () => {
|
||||||
|
const fp = 'fp-log-' + crypto.randomBytes(4).toString('hex');
|
||||||
|
const dev = seedDevice(fp, { token: 'tok4', heartbeatAgo: 5 }); // recent -> reclaim deferred
|
||||||
|
const live = await connectLive({ device_id: dev.id, device_token: 'tok4', device_info: {} });
|
||||||
|
for (let i = 0; i < 4; i++) { const r = await attempt({ pairing_code: rnd(), fingerprint: fp }); assert.ok(r.authError, 'each retry is deferred'); }
|
||||||
|
try { live.sock.close(); } catch { /* */ }
|
||||||
|
await sleep(200);
|
||||||
|
const lines = fs.readFileSync(LOG, 'utf8').split('\n').filter(l => l.includes('reclaim deferred for ' + dev.id)).length;
|
||||||
|
assert.ok(lines <= 1, `at most one deferral log per window (got ${lines}); no double-log / per-2s flood`);
|
||||||
|
});
|
||||||
|
|
@ -34,6 +34,10 @@ const PLAY_LOG_MIN_GAP_MS = 2000;
|
||||||
// #142 dedup + #143 per-device rate budget + global loop-lag valve for content-acks
|
// #142 dedup + #143 per-device rate budget + global loop-lag valve for content-acks
|
||||||
// all live in one control: lib/content-ack-limiter.js (required above as
|
// all live in one control: lib/content-ack-limiter.js (required above as
|
||||||
// contentAckLimiter). Kept out of this file so there is a single limiter on the path.
|
// contentAckLimiter). Kept out of this file so there is a single limiter on the path.
|
||||||
|
|
||||||
|
// #143 fingerprint-reclaim deferral log throttle: deviceId -> last-logged ms, so a
|
||||||
|
// device retrying reclaim every ~2s logs at most once per reclaimRejectLogWindowMs.
|
||||||
|
const lastReclaimRejectLogAt = new Map();
|
||||||
const { getUserPlan, getUserDeviceCount } = require('../middleware/subscription');
|
const { getUserPlan, getUserDeviceCount } = require('../middleware/subscription');
|
||||||
// Phase 2.3: deviceRoom() resolves a device_id to its workspace room so
|
// Phase 2.3: deviceRoom() resolves a device_id to its workspace room so
|
||||||
// dashboardNs.emit can be scoped instead of broadcast platform-wide.
|
// dashboardNs.emit can be scoped instead of broadcast platform-wide.
|
||||||
|
|
@ -294,20 +298,31 @@ module.exports = function setupDeviceSocket(io) {
|
||||||
const oldDevice = db.prepare('SELECT * FROM devices WHERE id = ?').get(existing.device_id);
|
const oldDevice = db.prepare('SELECT * FROM devices WHERE id = ?').get(existing.device_id);
|
||||||
if (oldDevice) {
|
if (oldDevice) {
|
||||||
// Fingerprint reclaim guard: a leaked/duplicated fingerprint shouldn't be enough
|
// Fingerprint reclaim guard: a leaked/duplicated fingerprint shouldn't be enough
|
||||||
// to take over a live device. Reject the reclaim if the device is currently
|
// to take over a LIVE device. #143: decide "still alive" from RUNTIME signals —
|
||||||
// online OR has been online within the last 24h — by then a real reinstall has
|
// a live socket, OR a genuinely recent heartbeat (within the settle window). The
|
||||||
// had plenty of time to come back, but a credential thief is more likely caught.
|
// old check used `secondsSince < 24h`, which treated a device merely offline <24h
|
||||||
|
// as "active": a legitimately-gone device (liveConn=false, status=offline, stale
|
||||||
|
// heartbeat) could never reclaim and retried every ~2s, flooding logs (Bold beta1
|
||||||
|
// / 2febcaa9, 1984694c, 139159eb). NOT a missing clear — liveConn IS removed on
|
||||||
|
// disconnect + the offline-timeout sweep, and status IS set offline on both; the
|
||||||
|
// 24h TIME gate was the cause. A device gone by every runtime signal is reclaimable.
|
||||||
const liveConn = heartbeat.getConnection(existing.device_id);
|
const liveConn = heartbeat.getConnection(existing.device_id);
|
||||||
const RECLAIM_GRACE_SECONDS = 24 * 60 * 60;
|
|
||||||
const lastBeat = oldDevice.last_heartbeat || 0;
|
const lastBeat = oldDevice.last_heartbeat || 0;
|
||||||
const secondsSince = Math.floor(Date.now() / 1000) - lastBeat;
|
const secondsSince = Math.floor(Date.now() / 1000) - lastBeat;
|
||||||
if (liveConn || (oldDevice.status === 'online') || secondsSince < RECLAIM_GRACE_SECONDS) {
|
const stillAlive = !!liveConn || secondsSince < config.reclaimSettleSeconds;
|
||||||
console.warn(`Fingerprint reclaim rejected for ${existing.device_id}: device active (status=${oldDevice.status}, ${secondsSince}s since last heartbeat, liveConn=${!!liveConn})`);
|
if (stillAlive) {
|
||||||
|
// Log at most once per device per window so a retrying/stuck device can't flood stdout.
|
||||||
|
const nowMs = Date.now();
|
||||||
|
if (nowMs - (lastReclaimRejectLogAt.get(existing.device_id) || 0) >= config.reclaimRejectLogWindowMs) {
|
||||||
|
lastReclaimRejectLogAt.set(existing.device_id, nowMs);
|
||||||
|
console.warn(`Fingerprint reclaim deferred for ${existing.device_id}: still settling (status=${oldDevice.status}, ${secondsSince}s since heartbeat, liveConn=${!!liveConn}); reclaimable after ${config.reclaimSettleSeconds}s offline`);
|
||||||
|
}
|
||||||
socket.emit('device:auth-error', {
|
socket.emit('device:auth-error', {
|
||||||
error: 'This display is currently active. If you reinstalled the app, the original device must be offline for 24 hours before its slot can be reclaimed.'
|
error: `This display was recently active. If you reinstalled the app, retry after it has been offline for ${config.reclaimSettleSeconds} seconds.`
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
lastReclaimRejectLogAt.delete(existing.device_id); // reclaim proceeding — clear any deferral log state
|
||||||
|
|
||||||
// Fingerprint matched — this is a reinstalled app reconnecting to its old device.
|
// Fingerprint matched — this is a reinstalled app reconnecting to its old device.
|
||||||
// Issue a fresh token so the app can authenticate going forward.
|
// Issue a fresh token so the app can authenticate going forward.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue