mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
fix(#142): global device_status_log retention sweep + STATUS_LOG_RETENTION_DAYS
The per-device insert-time prune (deviceSocket.js) only ever touches a device that is actively inserting, so it misses two paths: removed/idle devices whose rows linger forever, and heartbeat.js's offline_timeout insert that bypasses logDeviceStatus entirely. The reporter's 1.2M-row bloat accumulated UNDER a 7-day per-device prune for exactly this reason. - pruneStatusLog() (db/database.js): a GLOBAL time-range sweep across ALL devices, modeled on the play_logs prune. Run once on startup (recovers a bloated table right after deploy) and on the heartbeat interval (services/heartbeat.js). - STATUS_LOG_RETENTION_DAYS env, default 3 (lower than the old hardcoded 7d; the dashboard only shows a 24h uptime window, so 2-3d is ample for diagnostics). - Deliberately NO per-device row cap: Step 3's throttle already bounds how fast a storming device can generate status rows, so a cap would add sweep complexity for little gain (noted for later if needed). - NO VACUUM / auto_vacuum here (kept off the hot path); space reclaim is left as a separate decision (see report). test: deterministic in-process unit test proves the sweep deletes over-retention rows across all devices — including a device absent from the devices table and an offline_timeout row — while keeping recent rows; idempotent on an empty table. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
101f086204
commit
29a8896aa8
|
|
@ -134,4 +134,13 @@ module.exports = {
|
|||
reconnectWarmupMs: parseInt(process.env.RECONNECT_WARMUP_MS) || 30000,
|
||||
reconnectBandElevatedMult: parseFloat(process.env.RECONNECT_BAND_ELEVATED_MULT) || 2,
|
||||
reconnectBandCriticalMult: parseFloat(process.env.RECONNECT_BAND_CRITICAL_MULT) || 4,
|
||||
|
||||
// #142 device_status_log retention. A GLOBAL scheduled sweep (pruneStatusLog in
|
||||
// db/database.js, run on startup + the heartbeat interval) deletes rows older
|
||||
// than this across ALL devices — covering what the per-device insert-time prune
|
||||
// in deviceSocket.js misses: removed/idle devices that never insert again, and
|
||||
// the heartbeat.js offline_timeout insert that bypasses logDeviceStatus. Default
|
||||
// is LOWER than the old hardcoded 7 days (the reporter's bloat happened under 7d);
|
||||
// 2-3 days is plenty for the dashboard's 24h uptime view + diagnostics.
|
||||
statusLogRetentionDays: parseFloat(process.env.STATUS_LOG_RETENTION_DAYS) || 3,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -750,6 +750,21 @@ const { applyTenantDeleteCascade } = require('../lib/tenant-cascade-migration');
|
|||
}
|
||||
})();
|
||||
|
||||
// #142 GLOBAL device_status_log retention sweep across ALL devices. Run on startup
|
||||
// and on the heartbeat interval (services/heartbeat.js). This covers the rows the
|
||||
// per-device insert-time prune in deviceSocket.js misses: removed/idle devices that
|
||||
// never insert again, and the heartbeat offline_timeout insert that bypasses
|
||||
// logDeviceStatus. A plain time-range delete (like the play_logs prune) — runs off
|
||||
// the hot path; after the first sweep the table is small, so the cost is negligible.
|
||||
/**
 * Global retention sweep for `device_status_log`.
 *
 * Deletes every row whose `timestamp` is more than
 * `config.statusLogRetentionDays` days in the past, for all devices at once.
 * Best-effort by design: a failed delete is reported and swallowed so the
 * callers (startup and the heartbeat interval) are never interrupted.
 *
 * @returns {number} Rows deleted; 0 when nothing matched, the retention value
 *   is unusable, or the delete failed.
 */
function pruneStatusLog() {
  const retentionDays = Number(config.statusLogRetentionDays);

  // A negative retention would put the cutoff in the future and delete every
  // row in the table, including fresh ones; a non-finite value cannot produce
  // a meaningful cutoff at all. Refuse to sweep rather than risk either.
  if (!Number.isFinite(retentionDays) || retentionDays < 0) {
    console.warn(`[status-log] prune skipped: invalid retention "${config.statusLogRetentionDays}"`);
    return 0;
  }

  try {
    // SQLite compares against whole epoch seconds, so round the (possibly
    // fractional) day count to an integer number of seconds.
    const maxAgeSec = Math.round(retentionDays * 86400);
    const n = db
      .prepare("DELETE FROM device_status_log WHERE timestamp < strftime('%s','now') - ?")
      .run(maxAgeSec).changes;
    if (n > 0) console.log(`[status-log] pruned ${n} row(s) older than ${config.statusLogRetentionDays}d`);
    return n;
  } catch (err) {
    // Still non-fatal, but no longer silent: an unnoticed failing sweep is
    // exactly how the table grows without bound.
    console.error('[status-log] prune failed:', err);
    return 0;
  }
}
|
||||
|
||||
// Prune old telemetry (keep last 24h worth at 15s intervals = ~5760, cap at 6000)
|
||||
function pruneTelemetry(deviceId) {
|
||||
db.prepare(`
|
||||
|
|
@ -822,4 +837,4 @@ try {
|
|||
const { verifyAndRepairSchema } = require('../lib/schema-check');
|
||||
verifyAndRepairSchema(db);
|
||||
|
||||
module.exports = { db, pruneTelemetry, pruneScreenshots };
|
||||
module.exports = { db, pruneTelemetry, pruneScreenshots, pruneStatusLog };
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
const { db } = require('../db/database');
|
||||
const { db, pruneStatusLog } = require('../db/database');
|
||||
const config = require('../config');
|
||||
const { deviceRoom, emitToWorkspace } = require('../lib/socket-rooms');
|
||||
|
||||
|
|
@ -6,6 +6,10 @@ const { deviceRoom, emitToWorkspace } = require('../lib/socket-rooms');
|
|||
const deviceConnections = new Map();
|
||||
|
||||
function startHeartbeatChecker(io) {
|
||||
// #142: sweep stale device_status_log rows once at startup (recovers a bloated
|
||||
// table immediately after a deploy), then again on each interval below.
|
||||
pruneStatusLog();
|
||||
|
||||
setInterval(() => {
|
||||
const now = Date.now();
|
||||
const dashboardNs = io.of('/dashboard');
|
||||
|
|
@ -49,6 +53,10 @@ function startHeartbeatChecker(io) {
|
|||
DELETE FROM play_logs WHERE started_at < strftime('%s','now') - (90 * 86400)
|
||||
`).run();
|
||||
|
||||
// #142: global device_status_log retention sweep (all devices, incl. removed/idle
|
||||
// and the offline_timeout insert path that bypasses the per-device prune).
|
||||
pruneStatusLog();
|
||||
|
||||
// Cleanup: expired team invites
|
||||
db.prepare(`
|
||||
DELETE FROM team_invites WHERE expires_at < strftime('%s','now')
|
||||
|
|
|
|||
48
server/test/status-log-prune.test.js
Normal file
48
server/test/status-log-prune.test.js
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
'use strict';
|
||||
|
||||
// #142 step 4 — global device_status_log retention sweep. Deterministic, in-process
|
||||
// (no server/port). Isolate the DB and set retention BEFORE requiring the module
|
||||
// (config reads env at load; database.js initialises a DB on load).
|
||||
|
||||
const os = require('node:os');
|
||||
const path = require('node:path');
|
||||
const crypto = require('node:crypto');
|
||||
process.env.DATA_DIR = path.join(os.tmpdir(), 'st-statusprune-' + crypto.randomBytes(4).toString('hex'));
|
||||
process.env.STATUS_LOG_RETENTION_DAYS = '2';
|
||||
|
||||
const { test } = require('node:test');
|
||||
const assert = require('node:assert/strict');
|
||||
const { db, pruneStatusLog } = require('../db/database');
|
||||
|
||||
test('global sweep deletes rows older than retention across ALL devices, keeps recent', () => {
  const DAY_SEC = 86400;

  // Begin with an empty table so the row counts below are exact.
  db.exec('DELETE FROM device_status_log');

  // Inserts a row whose timestamp lies `ageSec` seconds before "now" (DB clock).
  const insertAged = db.prepare("INSERT INTO device_status_log (device_id, status, timestamp) VALUES (?, ?, strftime('%s','now') - ?)");

  const seedRows = [
    // 5 days old, beyond the 2-day retention set via env at the top of this file:
    // an active device, a device with no row in the devices table, and the
    // offline_timeout status written by the heartbeat path.
    ['live-dev', 'online', 5 * DAY_SEC],
    ['removed-idle-dev', 'offline', 5 * DAY_SEC],
    ['hb-dev', 'offline_timeout', 5 * DAY_SEC],
    // Inside the retention window: these must survive whatever the device or status.
    ['live-dev', 'online', 0],
    ['hb-dev', 'offline_timeout', 3600],
  ];
  for (const row of seedRows) {
    insertAged.run(...row);
  }

  const seededCount = db.prepare('SELECT COUNT(*) c FROM device_status_log').get().c;
  assert.equal(seededCount, 5, 'seeded 5 rows');

  const deleted = pruneStatusLog();
  assert.equal(deleted, 3, 'the 3 over-retention rows pruned (incl. removed-idle + offline_timeout paths)');

  const survivors = db.prepare('SELECT device_id, status FROM device_status_log ORDER BY device_id').all();
  assert.equal(survivors.length, 2);

  // Only the two recent rows are left; nothing old survived for any device/status.
  const survivorIds = survivors.map(({ device_id }) => device_id).sort();
  assert.deepEqual(survivorIds, ['hb-dev', 'live-dev']);

  const { m: oldestNow } = db.prepare('SELECT MIN(timestamp) m FROM device_status_log').get();
  const cutoff = Math.floor(Date.now() / 1000) - 2 * DAY_SEC;
  assert.ok(oldestNow >= cutoff, 'no surviving row is older than the retention cutoff');
});
|
||||
|
||||
test('sweep is safe and idempotent on an empty/already-clean table', () => {
  db.exec('DELETE FROM device_status_log');

  // First call: nothing to delete on an empty table.
  assert.equal(pruneStatusLog(), 0, 'nothing to delete -> 0, no throw');

  // Second call is what actually demonstrates idempotence: repeating the sweep
  // on an already-clean table is still a no-op.
  assert.equal(pruneStatusLog(), 0, 'nothing to delete -> 0, no throw');
  assert.equal(db.prepare('SELECT COUNT(*) c FROM device_status_log').get().c, 0);
});
|
||||
Loading…
Reference in a new issue