From 7b23e519c27bcdbf817b8604cd015d4c075b8759 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 17 Mar 2026 18:35:36 -0700 Subject: [PATCH] fix(foundry): add Bun idleTimeout safety net and subscription retry with backoff Bun.serve() defaults to a 10s idle timeout that can kill long-running requests. Actor RPCs go through the gateway tunnel with a 1s SSE ping, so this likely never fires, but set idleTimeout to 255s as a safety net. Subscription topics (app, org, session, task) previously had no retry mechanism. If the initial connection failed or a mid-session error occurred, the subscription stayed in error state permanently. Add exponential backoff retry (1s base, 30s max) that cleans up the old connection before each attempt and stops when disposed or no listeners remain. Co-Authored-By: Claude Opus 4.6 (1M context) --- foundry/packages/backend/src/index.ts | 94 +++++++++++++++++++ .../client/src/subscription/remote-manager.ts | 62 ++++++++++++ 2 files changed, 156 insertions(+) diff --git a/foundry/packages/backend/src/index.ts b/foundry/packages/backend/src/index.ts index e00abaa..617bacc 100644 --- a/foundry/packages/backend/src/index.ts +++ b/foundry/packages/backend/src/index.ts @@ -141,6 +141,59 @@ export async function startBackend(options: BackendStartOptions = {}): Promise.json, inspect with chrome://tracing) + app.get("/debug/memory", async (c) => { + if (process.env.NODE_ENV !== "development") { + return c.json({ error: "debug endpoints disabled in production" }, 403); + } + const wantGc = c.req.query("gc") === "1"; + if (wantGc && typeof Bun !== "undefined") { + // Bun.gc(true) triggers a synchronous full GC sweep in JavaScriptCore.
+ Bun.gc(true); + } + const mem = process.memoryUsage(); + const rssMb = Math.round(mem.rss / 1024 / 1024); + const heapUsedMb = Math.round(mem.heapUsed / 1024 / 1024); + const heapTotalMb = Math.round(mem.heapTotal / 1024 / 1024); + const externalMb = Math.round(mem.external / 1024 / 1024); + const nonHeapMb = rssMb - heapUsedMb - externalMb; + // Bun.heapStats() gives JSC-specific breakdown: object counts, typed array + // bytes, extra memory (native allocations tracked by JSC). Useful for + // distinguishing JS object bloat from native/WASM memory. + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const BunAny = Bun as any; + const heapStats = typeof BunAny.heapStats === "function" ? BunAny.heapStats() : null; + const snapshot = { + rssMb, + heapUsedMb, + heapTotalMb, + externalMb, + nonHeapMb, + gcTriggered: wantGc, + rssBytes: mem.rss, + heapUsedBytes: mem.heapUsed, + heapTotalBytes: mem.heapTotal, + externalBytes: mem.external, + ...(heapStats ? { bunHeapStats: heapStats } : {}), + }; + // Optionally write a full JSC heap snapshot for offline analysis. + let heapSnapshotPath: string | null = null; + const wantHeap = c.req.query("heap") === "1"; + if (wantHeap && typeof Bun !== "undefined") { + heapSnapshotPath = `/tmp/foundry-heap-${Date.now()}.json`; + // Bun.generateHeapSnapshot("v8") returns a V8-compatible JSON string. + const heapJson = Bun.generateHeapSnapshot("v8"); + await Bun.write(heapSnapshotPath, heapJson); + } + logger.info(snapshot, "memory_usage_debug"); + return c.json({ ...snapshot, ...(heapSnapshotPath ? 
{ heapSnapshotPath } : {}) }); + }); + app.use("*", async (c, next) => { const requestId = c.req.header("x-request-id")?.trim() || randomUUID(); const start = performance.now(); @@ -354,6 +407,11 @@ export async function startBackend(options: BackendStartOptions = {}): Promise { + const mem = process.memoryUsage(); + const rssMb = Math.round(mem.rss / 1024 / 1024); + const heapUsedMb = Math.round(mem.heapUsed / 1024 / 1024); + const heapTotalMb = Math.round(mem.heapTotal / 1024 / 1024); + const externalMb = Math.round(mem.external / 1024 / 1024); + // Non-heap RSS: memory not accounted for by JS heap or external buffers. + // Large values here point to native allocations (WASM, mmap, child process + // bookkeeping, Bun's internal arena, etc.). + const nonHeapMb = rssMb - heapUsedMb - externalMb; + const deltaRss = rssMb - prevRss; + prevRss = rssMb; + logger.info( + { + rssMb, + heapUsedMb, + heapTotalMb, + externalMb, + nonHeapMb, + deltaRssMb: deltaRss, + rssBytes: mem.rss, + heapUsedBytes: mem.heapUsed, + heapTotalBytes: mem.heapTotal, + externalBytes: mem.external, + }, + "memory_usage", + ); + }, 60_000); + } + process.on("SIGINT", async () => { server.stop(); process.exit(0); diff --git a/foundry/packages/client/src/subscription/remote-manager.ts b/foundry/packages/client/src/subscription/remote-manager.ts index 778241f..ae774c6 100644 --- a/foundry/packages/client/src/subscription/remote-manager.ts +++ b/foundry/packages/client/src/subscription/remote-manager.ts @@ -4,6 +4,11 @@ import { topicDefinitions, type TopicData, type TopicDefinition, type TopicKey, const GRACE_PERIOD_MS = 30_000; +/** Initial retry delay in ms. */ +const RETRY_BASE_MS = 1_000; +/** Maximum retry delay in ms. */ +const RETRY_MAX_MS = 30_000; + /** * Remote implementation of SubscriptionManager. * Each cache entry owns one actor connection plus one materialized snapshot. 
@@ -80,9 +85,12 @@ class TopicEntry { private unsubscribeEvent: (() => void) | null = null; private unsubscribeError: (() => void) | null = null; private teardownTimer: ReturnType | null = null; + private retryTimer: ReturnType | null = null; + private retryAttempt = 0; private startPromise: Promise | null = null; private eventPromise: Promise = Promise.resolve(); private started = false; + private disposed = false; constructor( private readonly topicKey: TopicKey, @@ -136,7 +144,9 @@ class TopicEntry { } dispose(): void { + this.disposed = true; this.cancelTeardown(); + this.cancelRetry(); this.unsubscribeEvent?.(); this.unsubscribeError?.(); if (this.conn) { @@ -148,6 +158,55 @@ class TopicEntry { this.error = null; this.lastRefreshAt = null; this.started = false; + this.retryAttempt = 0; + } + + private cancelRetry(): void { + if (this.retryTimer) { + clearTimeout(this.retryTimer); + this.retryTimer = null; + } + } + + /** + * Schedules a retry with exponential backoff. Cleans up any existing + * connection state before reconnecting. + */ + private scheduleRetry(): void { + if (this.disposed || this.listenerCount === 0) { + return; + } + + const delay = Math.min(RETRY_BASE_MS * 2 ** this.retryAttempt, RETRY_MAX_MS); + this.retryAttempt++; + + this.retryTimer = setTimeout(() => { + this.retryTimer = null; + if (this.disposed || this.listenerCount === 0) { + return; + } + + // Tear down the old connection before retrying + this.cleanupConnection(); + this.started = false; + this.startPromise = this.start().finally(() => { + this.startPromise = null; + }); + }, delay); + } + + /** + * Cleans up connection resources without resetting data/status/retry state. 
+ */ + private cleanupConnection(): void { + this.unsubscribeEvent?.(); + this.unsubscribeError?.(); + this.unsubscribeEvent = null; + this.unsubscribeError = null; + if (this.conn) { + void this.conn.dispose(); + } + this.conn = null; } private async start(): Promise { @@ -164,17 +223,20 @@ class TopicEntry { this.status = "error"; this.error = error instanceof Error ? error : new Error(String(error)); this.notify(); + this.scheduleRetry(); }); this.data = await this.definition.fetchInitial(this.backend, this.params); this.status = "connected"; this.lastRefreshAt = Date.now(); this.started = true; + this.retryAttempt = 0; this.notify(); } catch (error) { this.status = "error"; this.error = error instanceof Error ? error : new Error(String(error)); this.started = false; this.notify(); + this.scheduleRetry(); } }