mirror of
https://github.com/screentinker/screentinker.git
synced 2026-06-29 09:23:16 -06:00
/api/update/check offered the update whenever client !== latest (raw string inequality, not semver) with no backoff. A device that can't APPLY the update (broken OTA client 1.7.12, signing/Fire OS) keeps reporting the same version and is told update_available=true on every poll; a fast poll loop saturates the event loop (prod loop-lag 49s). All requests share one NAT IP, so IP-keying is useless. server-only breaker (lib/ota-breaker.js), two independent axes: - RATE breaker (primary, immediate): a key checking >THRESHOLD (3) times within WINDOW (60s) is looping -> throttle update_available with exponential backoff (30s->2m->8m->cap 30m). Healthy devices poll ~12 min and never approach this, so rollout/stragglers are inherently safe -- NO grace-for-flood timer; slow == safe. - PHANTOM guard (immediate): unrecognized version, or a prerelease of an OLDER core (superseded old-minor beta e.g. 1.9.1-beta4), gets no-offer on the first check. A RECENT real older version (beta3 vs latest beta4; stable 1.7.12) stays offerable. - Never offers a downgrade (client >= latest -> no offer). KEYING (#144 option 3): keyed on device_id when present, else reported version. - server.js:581 accepts + logs ?device_id=, passes it to the breaker. - UpdateChecker.kt:122 appends &device_id=<config.deviceId> (existing registered id; omitted until provisioned). One-line client change. beta4+ clients get precise per-device throttling; stuck legacy clients sending only ?version= are caught by the version-keyed + rate + phantom logic. Response gains additive `reason` + `retry_after_seconds` (old clients ignore). BOUNDED STATE: a periodic sweep (startSweep, wired in server.js) evicts buckets idle > IDLE_RESET_MS so the keyed Map can't grow unbounded (churned device_ids); not reset-on-access only. SCOPE (deliberate): this targets the FAST flood + phantoms. 
The slow #144 drip (stable 1.7.12 polling ~every 12 min, ~20/hr) stays below >3/60s and is NOT throttled -- catching it needs #144 option-3 "skip-this-version after N cycles", which is intentionally NOT in this build. NOTE: carries a CLIENT/APK change -> versionCode must increment at the beta4 bump and the release keystore is required for the APK. The device_id path only helps devices that can install beta4+; the stuck legacy fleet is covered by the version-keyed path. Tests: unit (lib/ota-breaker, injected time) a-f + comparator + escalation + sweep + slow-drip-scope; HTTP integration (real endpoint, device_id passthrough). Full suite green serial AND parallel (234). OTA-only delta -- reconnect/reclaim/shed/content-ack/ block untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
119 lines
6.9 KiB
JavaScript
119 lines
6.9 KiB
JavaScript
// #144 — OTA update-check circuit-breaker + phantom-version guard.
//
// The /api/update/check handler offered the update whenever client !== latest (raw
// string inequality, not semver). A device that can't APPLY the update (old/broken
// OTA client like 1.7.12, signing mismatch, Fire OS) keeps reporting the same old
// version and is told update_available=true on every poll. A fast poll loop (10-30s)
// then saturates the event loop (prod loop-lag 49s).
//
// Two independent axes (kept separate on purpose):
//
//  1. RATE breaker (primary, immediate). Healthy devices poll ~every 12 min, so a key
//     checking MORE than THRESHOLD times within WINDOW (default >3 / 60s) is by
//     definition looping -> throttle update_available for that key with exponential
//     backoff. Catches the fast flood within seconds. A normally-polling device never
//     approaches this rate, so rollout/straggler updates are inherently safe — there
//     is deliberately NO "tolerate the flood for N minutes" grace; slow == safe.
//
//  2. PHANTOM guard (immediate). An unrecognized version, or a prerelease of an OLDER
//     core (a superseded old-minor beta — e.g. 1.9.1-beta4 when latest is 1.9.2-beta3),
//     gets "no offer" on the first check. A RECENT real older version (e.g. beta3 when
//     latest is beta4, or stable 1.7.12) is legitimately offerable and is NOT phantom.
//
// KEYING: keyed on device_id when the client sends one (beta4+ clients -> precise
// per-device throttling), falling back to the reported VERSION when absent (legacy
// clients send only ?version=, and the site is behind NAT so IP is useless). So every
// device is covered: new clients per-device, stuck legacy clients per-version.
//
// Constants are env-tunable for ops + tests.
|
|
|
|
// Default escalation ladder for the rate breaker: 30s -> 2m -> 8m -> cap 30m.
const DEFAULT_COOLDOWNS_MS = [30_000, 120_000, 480_000, 1_800_000];

// Parse a comma-separated list of cooldown durations in milliseconds.
//   raw: the env value (string) or undefined.
// Returns an array of finite, non-negative integers. Entries that do not parse
// (empty segments, words, negatives) are dropped, because a NaN cooldown would
// make `blockedUntil` NaN and silently disable the breaker. If nothing usable
// remains, a copy of the default ladder is returned.
function parseCooldowns(raw) {
  if (!raw) return [...DEFAULT_COOLDOWNS_MS];
  const parsed = raw
    .split(',')
    .map((s) => Number.parseInt(s, 10))
    .filter((ms) => Number.isFinite(ms) && ms >= 0);
  return parsed.length > 0 ? parsed : [...DEFAULT_COOLDOWNS_MS];
}

// Rate window: hits older than this no longer count toward the threshold.
const WINDOW_MS = Number.parseInt(process.env.OTA_BREAKER_WINDOW_MS, 10) || 60_000;
// Checks per window before tripping (strictly MORE than THRESHOLD trips).
const THRESHOLD = Number.parseInt(process.env.OTA_BREAKER_THRESHOLD, 10) || 3;
// Backoff duration per escalation level; the last entry is the cap.
const COOLDOWNS_MS = parseCooldowns(process.env.OTA_BREAKER_COOLDOWNS_MS);
// A bucket untouched for longer than this is treated as fresh / eligible for eviction.
const IDLE_RESET_MS = Number.parseInt(process.env.OTA_BREAKER_IDLE_RESET_MS, 10) || 60 * 60 * 1000;

const state = new Map(); // key -> { hits:number[], blockedUntil, level, lastSeen }
const loggedBad = new Set(); // log unrecognized/superseded versions once
|
|
|
|
// --- minimal semver-ish parse/compare (no dependency) ---
|
|
// Parse "MAJOR.MINOR.PATCH" with an optional "-prerelease" suffix.
//   v: candidate version; anything that is not a string yields null.
// Surrounding whitespace is ignored. Returns { core: [major, minor, patch], pre }
// where the core parts are numbers and `pre` is the suffix text or null when there
// is none. Returns null when the text does not match the pattern.
function parseVer(v) {
  if (typeof v !== 'string') return null;

  const match = v.trim().match(/^(\d+)\.(\d+)\.(\d+)(?:-(.+))?$/);
  if (match === null) return null;

  const [, major, minor, patch, pre] = match;
  return {
    core: [Number(major), Number(minor), Number(patch)],
    pre: pre || null,
  };
}
|
|
// Order two parsed versions by their numeric core only (major, then minor, then
// patch), ignoring any prerelease tag. Returns -1, 0 or 1.
function coreCmp(a, b) {
  for (const i of [0, 1, 2]) {
    const left = a.core[i];
    const right = b.core[i];
    if (left === right) continue;
    return left < right ? -1 : 1;
  }
  return 0;
}
|
|
// Compare two prerelease tags (non-null strings). Returns -1, 0 or 1.
// The comparison is lexical EXCEPT when the first difference falls inside a run of
// digits on both sides: then the two digit runs are compared as numbers, so
// "beta9" < "beta10" (plain string order would put beta10 first). Tags that never
// differ inside a digit run (beta3 vs beta4, alpha vs beta) order exactly as a plain
// string comparison would.
function preCmp(x, y) {
  // Locate the first index at which the tags differ.
  const shared = Math.min(x.length, y.length);
  let i = 0;
  while (i < shared && x[i] === y[i]) i++;
  // Rewind to the start of the digit run that index sits in, so "beta10" vs
  // "beta11" compares 10 with 11 rather than 0 with 1.
  while (i > 0 && /\d/.test(x[i - 1])) i--;

  const dx = /^\d+/.exec(x.slice(i));
  const dy = /^\d+/.exec(y.slice(i));
  if (dx && dy) {
    const nx = Number(dx[0]);
    const ny = Number(dy[0]);
    if (nx !== ny) return nx < ny ? -1 : 1;
  }
  // Not a numeric difference (or numerically equal, e.g. "01" vs "1"): plain order.
  if (x === y) return 0;
  return x < y ? -1 : 1;
}

// Full ordering of two parsed versions ({ core, pre } as produced by parseVer).
// Returns -1 when a is older than b, 1 when newer, 0 when equal.
function cmpParsed(a, b) {
  const c = coreCmp(a, b);
  if (c !== 0) return c; // the numeric core decides whenever it differs
  if (a.pre === b.pre) return 0;
  if (a.pre === null) return 1; // release outranks a prerelease of the same core
  if (b.pre === null) return -1;
  return preCmp(a.pre, b.pre);
}
|
|
// Compare two raw version strings. Returns the cmpParsed result for the two parsed
// versions, or null when either side fails to parse.
function cmp(a, b) {
  const left = parseVer(a);
  const right = parseVer(b);
  if (left && right) return cmpParsed(left, right);
  return null;
}
|
|
|
|
// decide(clientVersion, latestVersion, deviceId?, now?) ->
//   { update_available, reason, retry_after_seconds?, log? }
//
// Decides whether one update check should be offered the update.
//   clientVersion: version string reported by the device.
//   latestVersion: version string the server would offer.
//   deviceId:      optional device identifier; selects a per-device bucket when truthy,
//                  otherwise the bucket is keyed on clientVersion.
//   now:           current time in ms (injectable for tests).
// `reason` is one of: no-version, unrecognized-version, up-to-date, client-newer,
// superseded-prerelease, rate-backoff, offer. `log`, when it is a string, is a line
// for the caller to emit; this function itself logs nothing.
function decide(clientVersion, latestVersion, deviceId = null, now = Date.now()) {
  // Every refusal shares the same leading fields; extras keep their given order.
  const noOffer = (reason, extra) => ({ update_available: false, reason, ...extra });

  // ---- Phase 1: version-only checks. These never touch the rate state. ----
  if (!clientVersion) return noOffer('no-version');

  const client = parseVer(clientVersion);
  const latest = parseVer(latestVersion);
  if (!client || !latest) {
    return noOffer('unrecognized-version', {
      log: logOnce(clientVersion, `[ota] unrecognized client version '${clientVersion}' — no offer (latest=${latestVersion})`),
    });
  }

  const order = cmpParsed(client, latest);
  if (order === 0) return noOffer('up-to-date');
  if (order > 0) return noOffer('client-newer'); // a downgrade is never offered

  // A prerelease whose core is older than latest's core (e.g. 1.9.1-beta4) is refused.
  const isOldCorePrerelease = client.pre !== null && coreCmp(client, latest) < 0;
  if (isOldCorePrerelease) {
    return noOffer('superseded-prerelease', {
      log: logOnce(clientVersion, `[ota] superseded prerelease '${clientVersion}' (older core than latest=${latestVersion}) — no offer`),
    });
  }

  // ---- Phase 2: the version is offerable; apply the rate breaker. ----
  const key = deviceId ? 'd:' + deviceId : 'v:' + clientVersion;
  let bucket = state.get(key);
  if (!bucket) {
    bucket = { hits: [], blockedUntil: 0, level: 0, lastSeen: now };
    state.set(key, bucket);
  } else if (now - bucket.lastSeen > IDLE_RESET_MS) {
    // Quiet for longer than the idle limit: forget hits, backoff and escalation.
    bucket.hits = [];
    bucket.blockedUntil = 0;
    bucket.level = 0;
  }
  bucket.lastSeen = now;

  // Still cooling down: refuse and report the remaining wait.
  if (now < bucket.blockedUntil) {
    return noOffer('rate-backoff', {
      retry_after_seconds: Math.ceil((bucket.blockedUntil - now) / 1000),
    });
  }
  bucket.blockedUntil = 0; // any cooldown has elapsed -> probe window

  // Keep only hits still inside the window, then count this one.
  const recent = [];
  for (const t of bucket.hits) {
    if (now - t < WINDOW_MS) recent.push(t);
  }
  recent.push(now);
  bucket.hits = recent;

  const looping = recent.length > THRESHOLD;
  if (!looping) return { update_available: true, reason: 'offer' };

  // Tripped: back off for the current level's cooldown (the last entry is the cap),
  // escalate, and clear the hits so a fresh burst is needed to trip again.
  const cooldownMs = COOLDOWNS_MS[Math.min(bucket.level, COOLDOWNS_MS.length - 1)];
  bucket.blockedUntil = now + cooldownMs;
  bucket.level += 1;
  bucket.hits = [];
  return noOffer('rate-backoff', {
    retry_after_seconds: Math.ceil(cooldownMs / 1000),
    log: `[ota] breaker tripped key=${key} (>${THRESHOLD} checks/${Math.round(WINDOW_MS / 1000)}s, looping) -> backoff ${Math.round(cooldownMs / 1000)}s [level ${bucket.level}]`,
  });
}
|
|
|
|
// Upper bound on how many distinct versions are remembered as "already logged".
// Version strings are client-supplied, so without a bound the set would grow for
// as long as clients keep sending new strings.
const LOGGED_BAD_MAX = 1000;

// Return `msg` the first time `version` is seen and undefined on repeats, so the
// caller logs each bad version once.
//   version: the client-reported version used as the de-duplication key.
//   msg:     the log line to hand back on first sight.
// Once LOGGED_BAD_MAX versions are remembered, the oldest one is forgotten to make
// room; a forgotten version will be logged again if it reappears.
function logOnce(version, msg) {
  if (loggedBad.has(version)) return undefined;
  if (loggedBad.size >= LOGGED_BAD_MAX) {
    // A Set iterates in insertion order, so its first value is the oldest entry.
    loggedBad.delete(loggedBad.values().next().value);
  }
  loggedBad.add(version);
  return msg;
}
|
|
|
|
// #144: evict every bucket that has been idle for longer than IDLE_RESET_MS, so the
// keyed state cannot keep growing (churned device_ids, varied versions). decide()
// only resets an idle bucket when that key is seen again; this actually deletes.
//   now: current time in ms (injectable for tests).
// Returns the number of buckets removed, and logs one summary line when that is > 0.
function sweep(now = Date.now()) {
  const idleKeys = [];
  state.forEach((bucket, key) => {
    if (now - bucket.lastSeen > IDLE_RESET_MS) idleKeys.push(key);
  });
  idleKeys.forEach((key) => state.delete(key));

  const evicted = idleKeys.length;
  if (evicted > 0) {
    console.log(`[ota] breaker swept ${evicted} idle bucket(s) (idle > ${Math.round(IDLE_RESET_MS / 60000)}m); ${state.size} remain`);
  }
  return evicted;
}
|
|
// Handle of the periodic sweep timer; null until startSweep() has run.
let sweepTimer = null;

// Start the periodic sweep (one sweep() every IDLE_RESET_MS) if it is not already
// running, and return the timer handle. Repeated calls return the same handle.
function startSweep() {
  if (!sweepTimer) {
    sweepTimer = setInterval(() => sweep(), IDLE_RESET_MS);
    // don't keep the process alive on this timer
    sweepTimer.unref?.();
  }
  return sweepTimer;
}
|
|
|
|
// Drop all breaker buckets and the logged-once memory, returning the module to a
// clean slate.
function reset() {
  for (const store of [state, loggedBad]) {
    store.clear();
  }
}
|
|
// Number of breaker buckets currently held in `state`.
function _size() {
  const { size } = state;
  return size;
}
|
|
module.exports = { decide, reset, sweep, startSweep, cmp, parseVer, _size, WINDOW_MS, THRESHOLD };
|