mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 21:03:26 +00:00
feat(foundry): memory investigation tooling and VFS pool spec
Add memory monitoring instrumentation, investigation findings, and SQLite VFS pool design spec for addressing WASM SQLite memory spikes. - Add /debug/memory endpoint and periodic memory logging (dev only) - Add mem-monitor.sh script for continuous memory profiling with automatic heap snapshot capture on spike detection - Add configureRunnerPool to registry setup for engine driver support - Document memory investigation findings (per-actor cost, spike behavior) - Write SQLite VFS pool spec for bin-packing actors onto shared WASM instances - Add foundry-mem-monitor and foundry-dev-engine justfile recipes - Add compose.dev.yaml engine driver and platform support Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7b23e519c2
commit
ee99d0b318
18 changed files with 888 additions and 496 deletions
|
|
@ -136,6 +136,7 @@ Do not use polling (`refetchInterval`), empty "go re-fetch" broadcast events, or
|
|||
- **Task actor** materializes its own detail state (session summaries, sandbox info, diffs, file tree). `getTaskDetail` reads from the task actor's own SQLite. The task actor broadcasts updates directly to clients connected to it.
|
||||
- **Session data** lives on the task actor but is a separate subscription topic. The task topic includes `sessions_summary` (list without content). The `session` topic provides full transcript and draft state. Clients subscribe to the `session` topic for whichever session is active, and filter `sessionUpdated` events by session ID (ignoring events for other sessions on the same actor).
|
||||
- There is no fan-out on the read path. The organization actor owns all task summaries locally.
|
||||
- **Never build client-side fan-out** that iterates task summaries and calls `getTaskDetail`/`getSessionDetail` on each. This wakes every actor simultaneously and causes OOM crashes in production (~25 MB per actor wake). The subscription system connects to at most 4 actors at a time (app + org + task + session).
|
||||
|
||||
### Subscription manager
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,9 @@ name: foundry
|
|||
|
||||
services:
|
||||
backend:
|
||||
# Use linux/amd64 when RIVET_RUN_ENGINE=1 since the engine binary only
|
||||
# ships for x86_64 Linux. Falls back to native platform otherwise.
|
||||
platform: ${FOUNDRY_BACKEND_PLATFORM:-}
|
||||
build:
|
||||
context: ..
|
||||
dockerfile: foundry/docker/backend.dev.Dockerfile
|
||||
|
|
@ -11,6 +14,7 @@ services:
|
|||
- path: .env
|
||||
required: false
|
||||
environment:
|
||||
NODE_ENV: "development"
|
||||
HF_BACKEND_HOST: "0.0.0.0"
|
||||
HF_BACKEND_PORT: "7741"
|
||||
RIVETKIT_STORAGE_PATH: "/root/.local/share/foundry/rivetkit"
|
||||
|
|
@ -18,6 +22,10 @@ services:
|
|||
RIVET_LOG_LEVEL: "${RIVET_LOG_LEVEL:-debug}"
|
||||
RIVET_LOG_TIMESTAMP: "${RIVET_LOG_TIMESTAMP:-1}"
|
||||
FOUNDRY_LOG_LEVEL: "${FOUNDRY_LOG_LEVEL:-debug}"
|
||||
# Set RIVET_RUN_ENGINE=1 in .env or environment to use the Rust engine
|
||||
# instead of the file-system driver. Eliminates native bun:sqlite KV
|
||||
# overhead but WASM SQLite per-actor cost remains.
|
||||
RIVET_RUN_ENGINE: "${RIVET_RUN_ENGINE:-}"
|
||||
# Pass through credentials needed for agent execution + PR creation in dev/e2e.
|
||||
# Do not hardcode secrets; set these in your environment when starting compose.
|
||||
ANTHROPIC_API_KEY: "${ANTHROPIC_API_KEY:-}"
|
||||
|
|
|
|||
|
|
@ -10,9 +10,14 @@ import { resolveRunnerVersion } from "../config/runner-version.js";
|
|||
|
||||
const runnerVersion = resolveRunnerVersion();
|
||||
|
||||
const backendPort = process.env.HF_BACKEND_PORT ?? "7741";
|
||||
|
||||
export const registry = setup({
|
||||
serverless: {
|
||||
basePath: "/v1/rivet",
|
||||
configureRunnerPool: {
|
||||
url: `http://127.0.0.1:${backendPort}/v1/rivet`,
|
||||
},
|
||||
},
|
||||
runner: { version: runnerVersion },
|
||||
logging: {
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
"dependencies": {
|
||||
"@sandbox-agent/foundry-shared": "workspace:*",
|
||||
"react": "^19.1.1",
|
||||
"rivetkit": "2.1.6",
|
||||
"rivetkit": "https://pkg.pr.new/rivet-dev/rivet/rivetkit@791500a",
|
||||
"sandbox-agent": "workspace:*"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ import type {
|
|||
TaskWorkspaceSelectInput,
|
||||
TaskWorkspaceSetSessionUnreadInput,
|
||||
TaskWorkspaceSendMessageInput,
|
||||
TaskWorkspaceSnapshot,
|
||||
TaskWorkspaceSessionInput,
|
||||
TaskWorkspaceUpdateDraftInput,
|
||||
TaskEvent,
|
||||
|
|
@ -291,7 +290,6 @@ export interface BackendClient {
|
|||
getOrganizationSummary(organizationId: string): Promise<OrganizationSummarySnapshot>;
|
||||
getTaskDetail(organizationId: string, repoId: string, taskId: string): Promise<WorkspaceTaskDetail>;
|
||||
getSessionDetail(organizationId: string, repoId: string, taskId: string, sessionId: string): Promise<WorkspaceSessionDetail>;
|
||||
getWorkspace(organizationId: string): Promise<TaskWorkspaceSnapshot>;
|
||||
subscribeWorkspace(organizationId: string, listener: () => void): () => void;
|
||||
createWorkspaceTask(organizationId: string, input: TaskWorkspaceCreateTaskInput): Promise<TaskWorkspaceCreateTaskResponse>;
|
||||
markWorkspaceUnread(organizationId: string, input: TaskWorkspaceSelectInput): Promise<void>;
|
||||
|
|
@ -595,91 +593,6 @@ export function createBackendClient(options: BackendClientOptions): BackendClien
|
|||
return (await task(organizationId, repoId, taskIdValue)).getSessionDetail(await withAuthSessionInput({ sessionId }));
|
||||
};
|
||||
|
||||
const getWorkspaceCompat = async (organizationId: string): Promise<TaskWorkspaceSnapshot> => {
|
||||
const authSessionInput = await getAuthSessionInput();
|
||||
const summary = await (await organization(organizationId)).getOrganizationSummary({ organizationId });
|
||||
const resolvedTasks = await Promise.all(
|
||||
summary.taskSummaries.map(async (taskSummary) => {
|
||||
let detail;
|
||||
try {
|
||||
const taskHandle = await task(organizationId, taskSummary.repoId, taskSummary.id);
|
||||
detail = await taskHandle.getTaskDetail(authSessionInput);
|
||||
} catch (error) {
|
||||
if (isActorNotFoundError(error)) {
|
||||
return null;
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
const sessionDetails = await Promise.all(
|
||||
detail.sessionsSummary.map(async (session) => {
|
||||
try {
|
||||
const full = await (await task(organizationId, detail.repoId, detail.id)).getSessionDetail({
|
||||
sessionId: session.id,
|
||||
...(authSessionInput ?? {}),
|
||||
});
|
||||
return [session.id, full] as const;
|
||||
} catch (error) {
|
||||
if (isActorNotFoundError(error)) {
|
||||
return null;
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}),
|
||||
);
|
||||
const sessionDetailsById = new Map(sessionDetails.filter((entry): entry is readonly [string, WorkspaceSessionDetail] => entry !== null));
|
||||
return {
|
||||
id: detail.id,
|
||||
repoId: detail.repoId,
|
||||
title: detail.title,
|
||||
status: detail.status,
|
||||
repoName: detail.repoName,
|
||||
updatedAtMs: detail.updatedAtMs,
|
||||
branch: detail.branch,
|
||||
pullRequest: detail.pullRequest,
|
||||
activeSessionId: detail.activeSessionId ?? null,
|
||||
sessions: detail.sessionsSummary.map((session) => {
|
||||
const full = sessionDetailsById.get(session.id);
|
||||
return {
|
||||
id: session.id,
|
||||
sessionId: session.sessionId,
|
||||
sessionName: session.sessionName,
|
||||
agent: session.agent,
|
||||
model: session.model,
|
||||
status: session.status,
|
||||
thinkingSinceMs: session.thinkingSinceMs,
|
||||
unread: session.unread,
|
||||
created: session.created,
|
||||
draft: full?.draft ?? { text: "", attachments: [], updatedAtMs: null },
|
||||
transcript: full?.transcript ?? [],
|
||||
};
|
||||
}),
|
||||
fileChanges: detail.fileChanges,
|
||||
diffs: detail.diffs,
|
||||
fileTree: detail.fileTree,
|
||||
minutesUsed: detail.minutesUsed,
|
||||
activeSandboxId: detail.activeSandboxId ?? null,
|
||||
};
|
||||
}),
|
||||
);
|
||||
const tasks = resolvedTasks.filter((task): task is Exclude<(typeof resolvedTasks)[number], null> => task !== null);
|
||||
|
||||
const repositories = summary.repos
|
||||
.map((repo) => ({
|
||||
id: repo.id,
|
||||
label: repo.label,
|
||||
updatedAtMs: tasks.filter((task) => task.repoId === repo.id).reduce((latest, task) => Math.max(latest, task.updatedAtMs), repo.latestActivityMs),
|
||||
tasks: tasks.filter((task) => task.repoId === repo.id).sort((left, right) => right.updatedAtMs - left.updatedAtMs),
|
||||
}))
|
||||
.filter((repo) => repo.tasks.length > 0);
|
||||
|
||||
return {
|
||||
organizationId,
|
||||
repos: summary.repos.map((repo) => ({ id: repo.id, label: repo.label })),
|
||||
repositories,
|
||||
tasks: tasks.sort((left, right) => right.updatedAtMs - left.updatedAtMs),
|
||||
};
|
||||
};
|
||||
|
||||
const subscribeWorkspace = (organizationId: string, listener: () => void): (() => void) => {
|
||||
let entry = workspaceSubscriptions.get(organizationId);
|
||||
if (!entry) {
|
||||
|
|
@ -1225,10 +1138,6 @@ export function createBackendClient(options: BackendClientOptions): BackendClien
|
|||
return await getSessionDetailWithAuth(organizationId, repoId, taskIdValue, sessionId);
|
||||
},
|
||||
|
||||
async getWorkspace(organizationId: string): Promise<TaskWorkspaceSnapshot> {
|
||||
return await getWorkspaceCompat(organizationId);
|
||||
},
|
||||
|
||||
subscribeWorkspace(organizationId: string, listener: () => void): () => void {
|
||||
return subscribeWorkspace(organizationId, listener);
|
||||
},
|
||||
|
|
|
|||
|
|
@ -8,4 +8,4 @@ export * from "./subscription/use-subscription.js";
|
|||
export * from "./keys.js";
|
||||
export * from "./mock-app.js";
|
||||
export * from "./view-model.js";
|
||||
export * from "./workspace-client.js";
|
||||
export type { TaskWorkspaceClient } from "./workspace-client.js";
|
||||
|
|
|
|||
|
|
@ -654,10 +654,6 @@ export function createMockBackendClient(defaultOrganizationId = "default"): Back
|
|||
return buildSessionDetail(requireTask(taskId), sessionId);
|
||||
},
|
||||
|
||||
async getWorkspace(): Promise<TaskWorkspaceSnapshot> {
|
||||
return workspace.getSnapshot();
|
||||
},
|
||||
|
||||
subscribeWorkspace(_organizationId: string, listener: () => void): () => void {
|
||||
return workspace.subscribe(listener);
|
||||
},
|
||||
|
|
|
|||
|
|
@ -1,204 +0,0 @@
|
|||
import type {
|
||||
TaskWorkspaceAddSessionResponse,
|
||||
TaskWorkspaceChangeModelInput,
|
||||
TaskWorkspaceChangeOwnerInput,
|
||||
TaskWorkspaceCreateTaskInput,
|
||||
TaskWorkspaceCreateTaskResponse,
|
||||
TaskWorkspaceDiffInput,
|
||||
TaskWorkspaceRenameInput,
|
||||
TaskWorkspaceRenameSessionInput,
|
||||
TaskWorkspaceSelectInput,
|
||||
TaskWorkspaceSetSessionUnreadInput,
|
||||
TaskWorkspaceSendMessageInput,
|
||||
TaskWorkspaceSnapshot,
|
||||
TaskWorkspaceSessionInput,
|
||||
TaskWorkspaceUpdateDraftInput,
|
||||
} from "@sandbox-agent/foundry-shared";
|
||||
import type { BackendClient } from "../backend-client.js";
|
||||
import { groupWorkspaceRepositories } from "../workspace-model.js";
|
||||
import type { TaskWorkspaceClient } from "../workspace-client.js";
|
||||
|
||||
export interface RemoteWorkspaceClientOptions {
|
||||
backend: BackendClient;
|
||||
organizationId: string;
|
||||
}
|
||||
|
||||
class RemoteWorkspaceStore implements TaskWorkspaceClient {
|
||||
private readonly backend: BackendClient;
|
||||
private readonly organizationId: string;
|
||||
private snapshot: TaskWorkspaceSnapshot;
|
||||
private readonly listeners = new Set<() => void>();
|
||||
private unsubscribeWorkspace: (() => void) | null = null;
|
||||
private refreshPromise: Promise<void> | null = null;
|
||||
private refreshRetryTimeout: ReturnType<typeof setTimeout> | null = null;
|
||||
|
||||
constructor(options: RemoteWorkspaceClientOptions) {
|
||||
this.backend = options.backend;
|
||||
this.organizationId = options.organizationId;
|
||||
this.snapshot = {
|
||||
organizationId: options.organizationId,
|
||||
repos: [],
|
||||
repositories: [],
|
||||
tasks: [],
|
||||
};
|
||||
}
|
||||
|
||||
getSnapshot(): TaskWorkspaceSnapshot {
|
||||
return this.snapshot;
|
||||
}
|
||||
|
||||
subscribe(listener: () => void): () => void {
|
||||
this.listeners.add(listener);
|
||||
this.ensureStarted();
|
||||
return () => {
|
||||
this.listeners.delete(listener);
|
||||
if (this.listeners.size === 0 && this.refreshRetryTimeout) {
|
||||
clearTimeout(this.refreshRetryTimeout);
|
||||
this.refreshRetryTimeout = null;
|
||||
}
|
||||
if (this.listeners.size === 0 && this.unsubscribeWorkspace) {
|
||||
this.unsubscribeWorkspace();
|
||||
this.unsubscribeWorkspace = null;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
async createTask(input: TaskWorkspaceCreateTaskInput): Promise<TaskWorkspaceCreateTaskResponse> {
|
||||
const created = await this.backend.createWorkspaceTask(this.organizationId, input);
|
||||
await this.refresh();
|
||||
return created;
|
||||
}
|
||||
|
||||
async markTaskUnread(input: TaskWorkspaceSelectInput): Promise<void> {
|
||||
await this.backend.markWorkspaceUnread(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async renameTask(input: TaskWorkspaceRenameInput): Promise<void> {
|
||||
await this.backend.renameWorkspaceTask(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async archiveTask(input: TaskWorkspaceSelectInput): Promise<void> {
|
||||
await this.backend.runAction(this.organizationId, input.repoId, input.taskId, "archive");
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async publishPr(input: TaskWorkspaceSelectInput): Promise<void> {
|
||||
await this.backend.publishWorkspacePr(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async revertFile(input: TaskWorkspaceDiffInput): Promise<void> {
|
||||
await this.backend.revertWorkspaceFile(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async updateDraft(input: TaskWorkspaceUpdateDraftInput): Promise<void> {
|
||||
await this.backend.updateWorkspaceDraft(this.organizationId, input);
|
||||
// Skip refresh — the server broadcast will trigger it, and the frontend
|
||||
// holds local draft state to avoid the round-trip overwriting user input.
|
||||
}
|
||||
|
||||
async sendMessage(input: TaskWorkspaceSendMessageInput): Promise<void> {
|
||||
await this.backend.sendWorkspaceMessage(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async stopAgent(input: TaskWorkspaceSessionInput): Promise<void> {
|
||||
await this.backend.stopWorkspaceSession(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async selectSession(input: TaskWorkspaceSessionInput): Promise<void> {
|
||||
await this.backend.selectWorkspaceSession(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async setSessionUnread(input: TaskWorkspaceSetSessionUnreadInput): Promise<void> {
|
||||
await this.backend.setWorkspaceSessionUnread(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async renameSession(input: TaskWorkspaceRenameSessionInput): Promise<void> {
|
||||
await this.backend.renameWorkspaceSession(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async closeSession(input: TaskWorkspaceSessionInput): Promise<void> {
|
||||
await this.backend.closeWorkspaceSession(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async addSession(input: TaskWorkspaceSelectInput): Promise<TaskWorkspaceAddSessionResponse> {
|
||||
const created = await this.backend.createWorkspaceSession(this.organizationId, input);
|
||||
await this.refresh();
|
||||
return created;
|
||||
}
|
||||
|
||||
async changeModel(input: TaskWorkspaceChangeModelInput): Promise<void> {
|
||||
await this.backend.changeWorkspaceModel(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
async changeOwner(input: TaskWorkspaceChangeOwnerInput): Promise<void> {
|
||||
await this.backend.changeWorkspaceTaskOwner(this.organizationId, input);
|
||||
await this.refresh();
|
||||
}
|
||||
|
||||
private ensureStarted(): void {
|
||||
if (!this.unsubscribeWorkspace) {
|
||||
this.unsubscribeWorkspace = this.backend.subscribeWorkspace(this.organizationId, () => {
|
||||
void this.refresh().catch(() => {
|
||||
this.scheduleRefreshRetry();
|
||||
});
|
||||
});
|
||||
}
|
||||
void this.refresh().catch(() => {
|
||||
this.scheduleRefreshRetry();
|
||||
});
|
||||
}
|
||||
|
||||
private scheduleRefreshRetry(): void {
|
||||
if (this.refreshRetryTimeout || this.listeners.size === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.refreshRetryTimeout = setTimeout(() => {
|
||||
this.refreshRetryTimeout = null;
|
||||
void this.refresh().catch(() => {
|
||||
this.scheduleRefreshRetry();
|
||||
});
|
||||
}, 1_000);
|
||||
}
|
||||
|
||||
private async refresh(): Promise<void> {
|
||||
if (this.refreshPromise) {
|
||||
await this.refreshPromise;
|
||||
return;
|
||||
}
|
||||
|
||||
this.refreshPromise = (async () => {
|
||||
const nextSnapshot = await this.backend.getWorkspace(this.organizationId);
|
||||
if (this.refreshRetryTimeout) {
|
||||
clearTimeout(this.refreshRetryTimeout);
|
||||
this.refreshRetryTimeout = null;
|
||||
}
|
||||
this.snapshot = {
|
||||
...nextSnapshot,
|
||||
repositories: nextSnapshot.repositories ?? groupWorkspaceRepositories(nextSnapshot.repos, nextSnapshot.tasks),
|
||||
};
|
||||
for (const listener of [...this.listeners]) {
|
||||
listener();
|
||||
}
|
||||
})().finally(() => {
|
||||
this.refreshPromise = null;
|
||||
});
|
||||
|
||||
await this.refreshPromise;
|
||||
}
|
||||
}
|
||||
|
||||
export function createRemoteWorkspaceClient(options: RemoteWorkspaceClientOptions): TaskWorkspaceClient {
|
||||
return new RemoteWorkspaceStore(options);
|
||||
}
|
||||
|
|
@ -14,17 +14,6 @@ import type {
|
|||
TaskWorkspaceSessionInput,
|
||||
TaskWorkspaceUpdateDraftInput,
|
||||
} from "@sandbox-agent/foundry-shared";
|
||||
import type { BackendClient } from "./backend-client.js";
|
||||
import { getSharedMockWorkspaceClient } from "./mock/workspace-client.js";
|
||||
import { createRemoteWorkspaceClient } from "./remote/workspace-client.js";
|
||||
|
||||
export type TaskWorkspaceClientMode = "mock" | "remote";
|
||||
|
||||
export interface CreateTaskWorkspaceClientOptions {
|
||||
mode: TaskWorkspaceClientMode;
|
||||
backend?: BackendClient;
|
||||
organizationId?: string;
|
||||
}
|
||||
|
||||
export interface TaskWorkspaceClient {
|
||||
getSnapshot(): TaskWorkspaceSnapshot;
|
||||
|
|
@ -46,21 +35,3 @@ export interface TaskWorkspaceClient {
|
|||
changeModel(input: TaskWorkspaceChangeModelInput): Promise<void>;
|
||||
changeOwner(input: TaskWorkspaceChangeOwnerInput): Promise<void>;
|
||||
}
|
||||
|
||||
export function createTaskWorkspaceClient(options: CreateTaskWorkspaceClientOptions): TaskWorkspaceClient {
|
||||
if (options.mode === "mock") {
|
||||
return getSharedMockWorkspaceClient();
|
||||
}
|
||||
|
||||
if (!options.backend) {
|
||||
throw new Error("Remote task workspace client requires a backend client");
|
||||
}
|
||||
if (!options.organizationId) {
|
||||
throw new Error("Remote task workspace client requires a organization id");
|
||||
}
|
||||
|
||||
return createRemoteWorkspaceClient({
|
||||
backend: options.backend,
|
||||
organizationId: options.organizationId,
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import type { TaskWorkspaceSnapshot, WorkspaceSession, WorkspaceTask, WorkspaceModelId, WorkspaceTranscriptEvent } from "@sandbox-agent/foundry-shared";
|
||||
import type { WorkspaceSession, WorkspaceTask, WorkspaceModelId, WorkspaceTranscriptEvent } from "@sandbox-agent/foundry-shared";
|
||||
import { createBackendClient } from "../../src/backend-client.js";
|
||||
import { requireImportedRepo } from "./helpers.js";
|
||||
|
||||
|
|
@ -38,12 +38,35 @@ async function poll<T>(label: string, timeoutMs: number, intervalMs: number, fn:
|
|||
}
|
||||
}
|
||||
|
||||
function findTask(snapshot: TaskWorkspaceSnapshot, taskId: string): WorkspaceTask {
|
||||
const task = snapshot.tasks.find((candidate) => candidate.id === taskId);
|
||||
if (!task) {
|
||||
throw new Error(`task ${taskId} missing from snapshot`);
|
||||
}
|
||||
return task;
|
||||
async function fetchFullTask(client: ReturnType<typeof createBackendClient>, organizationId: string, repoId: string, taskId: string): Promise<WorkspaceTask> {
|
||||
const detail = await client.getTaskDetail(organizationId, repoId, taskId);
|
||||
const sessionDetails = await Promise.all(
|
||||
detail.sessionsSummary.map(async (s) => {
|
||||
const full = await client.getSessionDetail(organizationId, repoId, taskId, s.id);
|
||||
return {
|
||||
...s,
|
||||
draft: full.draft,
|
||||
transcript: full.transcript,
|
||||
} as WorkspaceSession;
|
||||
}),
|
||||
);
|
||||
return {
|
||||
id: detail.id,
|
||||
repoId: detail.repoId,
|
||||
title: detail.title,
|
||||
status: detail.status,
|
||||
repoName: detail.repoName,
|
||||
updatedAtMs: detail.updatedAtMs,
|
||||
branch: detail.branch,
|
||||
pullRequest: detail.pullRequest,
|
||||
activeSessionId: detail.activeSessionId ?? null,
|
||||
sessions: sessionDetails,
|
||||
fileChanges: detail.fileChanges,
|
||||
diffs: detail.diffs,
|
||||
fileTree: detail.fileTree,
|
||||
minutesUsed: detail.minutesUsed,
|
||||
activeSandboxId: detail.activeSandboxId ?? null,
|
||||
};
|
||||
}
|
||||
|
||||
function findTab(task: WorkspaceTask, sessionId: string): WorkspaceSession {
|
||||
|
|
@ -155,7 +178,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
"task provisioning",
|
||||
12 * 60_000,
|
||||
2_000,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => task.branch === `e2e/${runId}` && task.sessions.length > 0,
|
||||
);
|
||||
|
||||
|
|
@ -165,7 +188,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
"initial agent response",
|
||||
12 * 60_000,
|
||||
2_000,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => {
|
||||
const tab = findTab(task, primaryTab.id);
|
||||
return task.status === "idle" && tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, expectedInitialReply);
|
||||
|
|
@ -219,7 +242,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
],
|
||||
});
|
||||
|
||||
const drafted = findTask(await client.getWorkspace(organizationId), created.taskId);
|
||||
const drafted = await fetchFullTask(client, organizationId, repo.repoId, created.taskId);
|
||||
expect(findTab(drafted, secondTab.sessionId).draft.text).toContain(expectedReply);
|
||||
expect(findTab(drafted, secondTab.sessionId).draft.attachments).toHaveLength(1);
|
||||
|
||||
|
|
@ -246,7 +269,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
"follow-up session response",
|
||||
10 * 60_000,
|
||||
2_000,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => {
|
||||
const tab = findTab(task, secondTab.sessionId);
|
||||
return (
|
||||
|
|
@ -267,7 +290,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
});
|
||||
await client.markWorkspaceUnread(organizationId, { repoId: repo.repoId, taskId: created.taskId });
|
||||
|
||||
const unreadSnapshot = findTask(await client.getWorkspace(organizationId), created.taskId);
|
||||
const unreadSnapshot = await fetchFullTask(client, organizationId, repo.repoId, created.taskId);
|
||||
expect(unreadSnapshot.sessions.some((tab) => tab.unread)).toBe(true);
|
||||
|
||||
await client.closeWorkspaceSession(organizationId, {
|
||||
|
|
@ -280,7 +303,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
"secondary session closed",
|
||||
30_000,
|
||||
1_000,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => !task.sessions.some((tab) => tab.id === secondTab.sessionId),
|
||||
);
|
||||
expect(closedSnapshot.sessions).toHaveLength(1);
|
||||
|
|
@ -295,7 +318,7 @@ describe("e2e(client): workspace flows", () => {
|
|||
"file revert reflected in workspace",
|
||||
30_000,
|
||||
1_000,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => !task.fileChanges.some((file) => file.path === expectedFile),
|
||||
);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
createFoundryLogger,
|
||||
type TaskWorkspaceSnapshot,
|
||||
type WorkspaceSession,
|
||||
type WorkspaceTask,
|
||||
type WorkspaceModelId,
|
||||
|
|
@ -60,12 +59,35 @@ async function poll<T>(label: string, timeoutMs: number, intervalMs: number, fn:
|
|||
}
|
||||
}
|
||||
|
||||
function findTask(snapshot: TaskWorkspaceSnapshot, taskId: string): WorkspaceTask {
|
||||
const task = snapshot.tasks.find((candidate) => candidate.id === taskId);
|
||||
if (!task) {
|
||||
throw new Error(`task ${taskId} missing from snapshot`);
|
||||
}
|
||||
return task;
|
||||
async function fetchFullTask(client: ReturnType<typeof createBackendClient>, organizationId: string, repoId: string, taskId: string): Promise<WorkspaceTask> {
|
||||
const detail = await client.getTaskDetail(organizationId, repoId, taskId);
|
||||
const sessionDetails = await Promise.all(
|
||||
detail.sessionsSummary.map(async (s) => {
|
||||
const full = await client.getSessionDetail(organizationId, repoId, taskId, s.id);
|
||||
return {
|
||||
...s,
|
||||
draft: full.draft,
|
||||
transcript: full.transcript,
|
||||
} as WorkspaceSession;
|
||||
}),
|
||||
);
|
||||
return {
|
||||
id: detail.id,
|
||||
repoId: detail.repoId,
|
||||
title: detail.title,
|
||||
status: detail.status,
|
||||
repoName: detail.repoName,
|
||||
updatedAtMs: detail.updatedAtMs,
|
||||
branch: detail.branch,
|
||||
pullRequest: detail.pullRequest,
|
||||
activeSessionId: detail.activeSessionId ?? null,
|
||||
sessions: sessionDetails,
|
||||
fileChanges: detail.fileChanges,
|
||||
diffs: detail.diffs,
|
||||
fileTree: detail.fileTree,
|
||||
minutesUsed: detail.minutesUsed,
|
||||
activeSandboxId: detail.activeSandboxId ?? null,
|
||||
};
|
||||
}
|
||||
|
||||
function findTab(task: WorkspaceTask, sessionId: string): WorkspaceSession {
|
||||
|
|
@ -138,7 +160,7 @@ function average(values: number[]): number {
|
|||
return values.reduce((sum, value) => sum + value, 0) / Math.max(values.length, 1);
|
||||
}
|
||||
|
||||
async function measureWorkspaceSnapshot(
|
||||
async function measureOrganizationSummary(
|
||||
client: ReturnType<typeof createBackendClient>,
|
||||
organizationId: string,
|
||||
iterations: number,
|
||||
|
|
@ -147,35 +169,24 @@ async function measureWorkspaceSnapshot(
|
|||
maxMs: number;
|
||||
payloadBytes: number;
|
||||
taskCount: number;
|
||||
tabCount: number;
|
||||
transcriptEventCount: number;
|
||||
}> {
|
||||
const durations: number[] = [];
|
||||
let snapshot: TaskWorkspaceSnapshot | null = null;
|
||||
let snapshot: Awaited<ReturnType<typeof client.getOrganizationSummary>> | null = null;
|
||||
|
||||
for (let index = 0; index < iterations; index += 1) {
|
||||
const startedAt = performance.now();
|
||||
snapshot = await client.getWorkspace(organizationId);
|
||||
snapshot = await client.getOrganizationSummary(organizationId);
|
||||
durations.push(performance.now() - startedAt);
|
||||
}
|
||||
|
||||
const finalSnapshot = snapshot ?? {
|
||||
organizationId,
|
||||
repos: [],
|
||||
repositories: [],
|
||||
tasks: [],
|
||||
};
|
||||
const finalSnapshot = snapshot ?? { organizationId, github: {} as any, repos: [], taskSummaries: [] };
|
||||
const payloadBytes = Buffer.byteLength(JSON.stringify(finalSnapshot), "utf8");
|
||||
const tabCount = finalSnapshot.tasks.reduce((sum, task) => sum + task.sessions.length, 0);
|
||||
const transcriptEventCount = finalSnapshot.tasks.reduce((sum, task) => sum + task.sessions.reduce((tabSum, tab) => tabSum + tab.transcript.length, 0), 0);
|
||||
|
||||
return {
|
||||
avgMs: Math.round(average(durations)),
|
||||
maxMs: Math.round(Math.max(...durations, 0)),
|
||||
payloadBytes,
|
||||
taskCount: finalSnapshot.tasks.length,
|
||||
tabCount,
|
||||
transcriptEventCount,
|
||||
taskCount: finalSnapshot.taskSummaries.length,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -204,11 +215,9 @@ describe("e2e(client): workspace load", () => {
|
|||
avgMs: number;
|
||||
maxMs: number;
|
||||
payloadBytes: number;
|
||||
tabCount: number;
|
||||
transcriptEventCount: number;
|
||||
}> = [];
|
||||
|
||||
snapshotSeries.push(await measureWorkspaceSnapshot(client, organizationId, 2));
|
||||
snapshotSeries.push(await measureOrganizationSummary(client, organizationId, 2));
|
||||
|
||||
for (let taskIndex = 0; taskIndex < taskCount; taskIndex += 1) {
|
||||
const runId = `load-${taskIndex}-${Date.now().toString(36)}`;
|
||||
|
|
@ -229,7 +238,7 @@ describe("e2e(client): workspace load", () => {
|
|||
`task ${runId} provisioning`,
|
||||
12 * 60_000,
|
||||
pollIntervalMs,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => {
|
||||
const tab = task.sessions[0];
|
||||
return Boolean(tab && task.status === "idle" && tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, initialReply));
|
||||
|
|
@ -264,7 +273,7 @@ describe("e2e(client): workspace load", () => {
|
|||
`task ${runId} session ${sessionIndex} reply`,
|
||||
10 * 60_000,
|
||||
pollIntervalMs,
|
||||
async () => findTask(await client.getWorkspace(organizationId), created.taskId),
|
||||
async () => fetchFullTask(client, organizationId, repo.repoId, created.taskId),
|
||||
(task) => {
|
||||
const tab = findTab(task, createdSession.sessionId);
|
||||
return tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, expectedReply);
|
||||
|
|
@ -275,7 +284,7 @@ describe("e2e(client): workspace load", () => {
|
|||
expect(transcriptIncludesAgentText(findTab(withReply, createdSession.sessionId).transcript, expectedReply)).toBe(true);
|
||||
}
|
||||
|
||||
const snapshotMetrics = await measureWorkspaceSnapshot(client, organizationId, 3);
|
||||
const snapshotMetrics = await measureOrganizationSummary(client, organizationId, 3);
|
||||
snapshotSeries.push(snapshotMetrics);
|
||||
logger.info(
|
||||
{
|
||||
|
|
@ -300,8 +309,7 @@ describe("e2e(client): workspace load", () => {
|
|||
snapshotReadFinalMaxMs: lastSnapshot.maxMs,
|
||||
snapshotPayloadBaselineBytes: firstSnapshot.payloadBytes,
|
||||
snapshotPayloadFinalBytes: lastSnapshot.payloadBytes,
|
||||
snapshotTabFinalCount: lastSnapshot.tabCount,
|
||||
snapshotTranscriptFinalCount: lastSnapshot.transcriptEventCount,
|
||||
snapshotTaskFinalCount: lastSnapshot.taskCount,
|
||||
};
|
||||
|
||||
logger.info(summary, "workspace_load_summary");
|
||||
|
|
|
|||
88
foundry/research/memory-investigation.md
Normal file
88
foundry/research/memory-investigation.md
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# Foundry Backend Memory Investigation
|
||||
|
||||
Date: 2026-03-17
|
||||
|
||||
## Problem
|
||||
|
||||
Production Railway deployment shows memory spikes from near-zero to 40+ GB when users interact with the app. Local reproduction shows spikes from ~300 MB to ~2.1 GB when opening a task workspace.
|
||||
|
||||
## Architecture
|
||||
|
||||
Each actor in the system has **two SQLite instances**:
|
||||
|
||||
1. **WASM SQLite** (16.6 MB per actor) - Runs Drizzle ORM queries for actor-specific tables (task data, session transcripts, etc.). Each actor gets its own `SqliteVfs` which instantiates a full `WebAssembly.Instance` with 16.6 MB linear memory.
|
||||
|
||||
2. **Native bun:sqlite** (~4-8 MB per actor) - Backs the KV store that the WASM SQLite's VFS reads/writes to. This is the persistence layer. Not visible in JS heap snapshots (native C memory).
|
||||
|
||||
## Findings
|
||||
|
||||
### Memory breakdown (steady state, 14 active WASM instances)
|
||||
|
||||
| Category | Size | % of RSS | Description |
|
||||
|----------|------|----------|-------------|
|
||||
| WASM SQLite heaps | 232 MB | 46% | 14 x 16.6 MB ArrayBuffers (WASM linear memory) |
|
||||
| Bun native (bun:sqlite + runtime) | 225 MB | 44% | KV backing store page caches, mmap'd WAL files, Bun runtime |
|
||||
| JS application objects | 27 MB | 5% | Closures, actor state, plain objects |
|
||||
| Module graph | 20 MB | 4% | Compiled code, FunctionCodeBlocks, ModuleRecords |
|
||||
| ArrayBuffer intermediates | 4 MB | 1% | Non-WASM buffers |
|
||||
| KV data in transit | ~0 MB | 0% | 4KB chunks copied and freed immediately |
|
||||
|
||||
### Spike behavior
|
||||
|
||||
When opening a task workspace, many actors wake simultaneously:
|
||||
|
||||
| State | WASM Instances | SqliteVfs | WASM Heap | Actors (task) | RSS |
|
||||
|-------|---------------|-----------|-----------|---------------|-----|
|
||||
| Baseline | 7-9 | 6-8 | 116-149 MB | 14 | 289-309 MB |
|
||||
| Spike | 32 | 32 | 531 MB | 25 | 2,118 MB |
|
||||
| Post-sleep | 14 | 13 | 232 MB | 25 (23 sleeping) | 509 MB |
|
||||
|
||||
### Per-actor memory cost
|
||||
|
||||
Each actor that wakes up and accesses its database costs:
|
||||
- 16.6 MB for WASM SQLite linear memory
|
||||
- ~4-8 MB for native bun:sqlite KV backing store
|
||||
- **Total: ~20-25 MB per actor**
|
||||
|
||||
### No per-actor WASM leak
|
||||
|
||||
Controlled testing (3 wake/sleep cycles on a single actor) confirmed WASM is properly freed on sleep:
|
||||
- Wake: +1 SqliteVfs, +17 MB
|
||||
- Sleep: -1 SqliteVfs, -17 MB
|
||||
- No accumulation across cycles
|
||||
|
||||
### Production impact
|
||||
|
||||
With 200+ PRs in production, if something wakes all task actors simultaneously:
|
||||
- 200 actors x 25 MB = 5 GB minimum
|
||||
- Plus JS garbage from git operations, sandbox bootstraps, etc.
|
||||
- Explains the 40 GB spike seen on Railway (multiple replicas, plus GC pressure)
|
||||
|
||||
### The double-SQLite problem
|
||||
|
||||
The current file-system driver architecture means every actor runs SQLite-in-WASM on top of SQLite-native:
|
||||
|
||||
```
|
||||
Actor Drizzle queries
|
||||
-> WASM SQLite (16.6 MB heap)
|
||||
-> VFS layer (copies 4KB chunks)
|
||||
-> KV store API
|
||||
-> bun:sqlite (native, ~4-8 MB page cache)
|
||||
-> disk (.db files)
|
||||
```
|
||||
|
||||
The engine driver eliminates the WASM layer entirely, using the Rust engine's native SQLite directly.
|
||||
|
||||
## Root causes of mass actor wake-up
|
||||
|
||||
1. `maybeScheduleWorkspaceRefreshes()` is called twice per `getTaskDetail()` (once directly, once via `buildTaskSummary()`)
|
||||
2. ~~`getWorkspace()` fetches ALL task details in parallel, waking all task actors~~ **Dead code — removed 2026-03-17.** The frontend uses the subscription system exclusively; `getWorkspaceCompat` and `RemoteWorkspaceStore` had zero callers.
|
||||
3. Frontend retry interval is 1 second with no backoff
|
||||
4. No deduplication of concurrent `collectWorkspaceGitState()` calls
|
||||
|
||||
## Next steps
|
||||
|
||||
- [ ] Test with engine driver enabled to measure WASM elimination impact
|
||||
- [ ] Investigate what triggers mass actor wake-up in production (the `getWorkspace` fan-out was dead code; the actual trigger is still unknown)
|
||||
- [ ] Consider sharing a single WASM module across actors (mutex around non-reentrant init)
|
||||
- [ ] Enable periodic memory logging in production to capture state before OOM kills
|
||||
214
foundry/research/sqlite-vfs-pool-spec.md
Normal file
214
foundry/research/sqlite-vfs-pool-spec.md
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
# SQLite VFS Pool Spec
|
||||
|
||||
Date: 2026-03-17
|
||||
Package: `@rivetkit/sqlite-vfs`
|
||||
Scope: WASM SQLite only (not Cloudflare D1 driver)
|
||||
|
||||
## Problem
|
||||
|
||||
Each actor gets its own WASM SQLite instance via `SqliteVfs`, allocating 16.6 MB
|
||||
of linear memory per instance. With 200+ actors waking simultaneously, this
|
||||
causes multi-GB memory spikes (40 GB observed in production).
|
||||
|
||||
## Design
|
||||
|
||||
### Pool model
|
||||
|
||||
A `SqliteVfsPool` manages N WASM SQLite instances. Actors are bin-packed onto
|
||||
instances via sticky assignment. The pool scales instances up to a configured
|
||||
max as actors arrive, and scales down (after a grace period) when instances have
|
||||
zero assigned actors.
|
||||
|
||||
### Configuration
|
||||
|
||||
```typescript
|
||||
interface SqliteVfsPoolConfig {
|
||||
/** Max actors sharing one WASM instance. Default: 50. */
|
||||
actorsPerInstance: number;
|
||||
/** Max WASM instances the pool will create. Default: Infinity. */
|
||||
maxInstances?: number;
|
||||
/** Grace period before destroying an empty instance. Default: 30_000ms. */
|
||||
idleDestroyMs?: number;
|
||||
}
|
||||
```
|
||||
|
||||
**Sizing guide**: each WASM instance handles ~13 SQLite ops/sec at 15ms KV RTT
|
||||
(66 KV ops/sec / ~5 KV ops per SQLite operation). For a target of X ops/sec,
|
||||
set `actorsPerInstance = totalActors / ceil(X / 13)`.
|
||||
|
||||
### Actor-to-instance assignment
|
||||
|
||||
Sticky assignment: once an actor is assigned to an instance, it stays there
|
||||
until it releases (actor sleep/destroy). Assignment uses bin-packing: pick the
|
||||
instance with the most actors that still has capacity. If all instances are
|
||||
full, create a new one (up to `maxInstances`).
|
||||
|
||||
```
|
||||
acquire(actorId) -> PooledSqliteHandle
|
||||
1. If actorId already assigned, return existing handle
|
||||
2. Find instance with most actors that has capacity (< actorsPerInstance)
|
||||
3. If none found and instanceCount < maxInstances, create new instance
|
||||
4. If none found and at max, wait (queue)
|
||||
5. Assign actorId to instance, return handle
|
||||
|
||||
release(actorId)
|
||||
1. Remove actorId from instance's assignment set
|
||||
2. If instance has zero actors, start idle timer
|
||||
3. On idle timer expiry, destroy instance (reclaim 16.6 MB)
|
||||
4. Cancel idle timer if a new actor is assigned before expiry
|
||||
```
|
||||
|
||||
### Locking mechanism
|
||||
|
||||
The existing `#sqliteMutex` on `SqliteVfs` already serializes SQLite operations
|
||||
within one instance. This is the right level: each individual xRead/xWrite call
|
||||
acquires the mutex, does its async KV operation, and releases. No change needed
|
||||
to the mutex itself.
|
||||
|
||||
Multiple databases on the same instance share the mutex. This means if actor A
|
||||
is doing an xRead (15ms), actor B on the same instance waits. This is the
|
||||
intentional serialization — asyncify cannot handle concurrent suspensions on the
|
||||
same WASM module.
|
||||
|
||||
The pool does NOT add a higher-level lock. The per-instance `#sqliteMutex`
|
||||
handles all serialization. The pool only manages assignment and lifecycle.
|
||||
|
||||
### Multiple databases per instance
|
||||
|
||||
Currently `SqliteSystem.registerFile()` enforces one main database file per VFS.
|
||||
This constraint must be lifted to allow multiple actors' databases to coexist.
|
||||
|
||||
**Change**: `SqliteSystem` tracks multiple registered files in a `Map<string, KvVfsOptions>`
|
||||
instead of a single `#mainFileName`. The VFS callbacks (`xRead`, `xWrite`, etc.)
|
||||
already receive the file handle and look up the correct options per file.
|
||||
|
||||
Each actor opens its own database file (named by actorId) on the shared VFS.
|
||||
Multiple databases can be open simultaneously on the same WASM instance. The
|
||||
`#sqliteMutex` ensures only one SQLite call executes at a time.
|
||||
|
||||
### PooledSqliteHandle
|
||||
|
||||
The handle returned to actors wraps a reference to the pool and its assigned
|
||||
instance. It exposes the same `open()` interface as `SqliteVfs`.
|
||||
|
||||
```typescript
|
||||
class PooledSqliteHandle {
|
||||
readonly #pool: SqliteVfsPool;
|
||||
readonly #instanceId: number;
|
||||
readonly #actorId: string;
|
||||
|
||||
/** Open a database on this handle's assigned WASM instance. */
|
||||
async open(fileName: string, options: KvVfsOptions): Promise<Database> {
|
||||
const vfs = this.#pool.getInstance(this.#instanceId);
|
||||
return vfs.open(fileName, options);
|
||||
}
|
||||
|
||||
/** Release this handle back to the pool. */
|
||||
async destroy(): Promise<void> {
|
||||
this.#pool.release(this.#actorId);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Integration with drivers
|
||||
|
||||
The `ActorDriver.createSqliteVfs()` method currently returns `new SqliteVfs()`.
|
||||
With pooling:
|
||||
|
||||
```typescript
|
||||
// Before
|
||||
async createSqliteVfs(): Promise<SqliteVfs> {
|
||||
return new SqliteVfs();
|
||||
}
|
||||
|
||||
// After
|
||||
async createSqliteVfs(actorId: string): Promise<PooledSqliteHandle> {
|
||||
return this.#vfsPool.acquire(actorId);
|
||||
}
|
||||
```
|
||||
|
||||
The `PooledSqliteHandle` must satisfy the same interface that actors expect from
|
||||
`SqliteVfs` (specifically the `open()` and `destroy()` methods). Either:
|
||||
- `PooledSqliteHandle` implements the `SqliteVfs` interface (duck typing)
|
||||
- Or extract an interface type that both implement
|
||||
|
||||
The actor instance code in `mod.ts` calls `this.#sqliteVfs = await driver.createSqliteVfs()`.
|
||||
It then passes `this.#sqliteVfs` to the DB provider which calls `.open()`. On
|
||||
cleanup it calls `.destroy()`. The pooled handle supports both.
|
||||
|
||||
### Scale-up and scale-down
|
||||
|
||||
**Scale-up**: new instance created lazily on `acquire()` when all existing
|
||||
instances are at capacity. WASM module is loaded in `#ensureInitialized()` on
|
||||
first `open()` call (existing lazy behavior). Cost: ~16.6 MB + WASM compile time.
|
||||
|
||||
**Scale-down**: when last actor releases from an instance, start a timer
|
||||
(`idleDestroyMs`). If no new actor is assigned before the timer fires, call
|
||||
`sqliteVfs.destroy()` to free the WASM module. This reclaims 16.6 MB.
|
||||
|
||||
If an actor is assigned to an instance that is in the idle-destroy grace period,
|
||||
cancel the timer and reuse the instance.
|
||||
|
||||
### Memory budget examples
|
||||
|
||||
| Actors | actorsPerInstance | Instances | WASM Memory |
|
||||
|--------|-------------------|-----------|-------------|
|
||||
| 50 | 50 | 1 | 17 MB |
|
||||
| 200 | 50 | 4 | 66 MB |
|
||||
| 500 | 50 | 10 | 166 MB |
|
||||
| 200 | 25 | 8 | 133 MB |
|
||||
|
||||
Compare to current: 200 actors = 200 instances = 3,320 MB.
|
||||
|
||||
## Changes required
|
||||
|
||||
### `@rivetkit/sqlite-vfs`
|
||||
|
||||
1. **`SqliteSystem`**: Remove single-main-file constraint. Replace
|
||||
`#mainFileName`/`#mainFileOptions` with a `Map<string, KvVfsOptions>`.
|
||||
Update `registerFile()` to insert into the map. Update VFS callbacks to look
|
||||
up options by file handle.
|
||||
|
||||
2. **`SqliteVfs`**: Allow multiple `open()` calls with different filenames.
|
||||
Each returns an independent `Database` handle. All share the same WASM
|
||||
module and `#sqliteMutex`.
|
||||
|
||||
3. **New `SqliteVfsPool`**: Manages instance lifecycle, actor assignment, and
|
||||
scale-up/scale-down. Exported from the package.
|
||||
|
||||
4. **New `PooledSqliteHandle`**: Returned by `pool.acquire()`. Implements the
|
||||
subset of `SqliteVfs` that actors use (`open`, `destroy`).
|
||||
|
||||
### `rivetkit` (drivers)
|
||||
|
||||
5. **`ActorDriver` interface**: `createSqliteVfs()` signature adds `actorId`
|
||||
parameter so the pool can do sticky assignment.
|
||||
|
||||
6. **File-system driver**: Create `SqliteVfsPool` once, call
|
||||
`pool.acquire(actorId)` in `createSqliteVfs()`.
|
||||
|
||||
7. **Engine driver**: Same change as file-system driver.
|
||||
|
||||
8. **Actor instance (`mod.ts`)**: Pass `actorId` to `driver.createSqliteVfs(actorId)`.
|
||||
No other changes needed — the handle quacks like `SqliteVfs`.
|
||||
|
||||
### Not changed
|
||||
|
||||
- Cloudflare driver (uses D1, no WASM)
|
||||
- KV storage layer (unchanged)
|
||||
- Drizzle integration (unchanged, still receives a `Database` from `open()`)
|
||||
- `#sqliteMutex` behavior (unchanged, already serializes correctly)
|
||||
|
||||
## Risks
|
||||
|
||||
1. **Hot instance**: If one instance has 50 chatty actors, the mutex contention
|
||||
increases latency for all of them. Mitigation: monitor mutex wait time, tune
|
||||
`actorsPerInstance` down if needed.
|
||||
|
||||
2. **WASM memory growth**: SQLite can grow WASM linear memory via
|
||||
`memory.grow()`. If one actor causes growth, all actors on that instance pay
|
||||
the cost. In practice, SQLite's page cache is small and growth is rare.
|
||||
|
||||
3. **Database close ordering**: If actor A crashes without closing its DB, the
|
||||
open file handle leaks inside the VFS. The pool must track open databases
|
||||
and force-close on `release()`.
|
||||
304
foundry/scripts/measure-actor-wakeup.ts
Normal file
304
foundry/scripts/measure-actor-wakeup.ts
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Actor Wake-Up Timing Measurement Script
|
||||
*
|
||||
* 1. Finds a sleeping actor via the Rivet API
|
||||
* 2. Records LOCAL wall-clock time, then sends /health to the gateway to wake it
|
||||
* 3. Records LOCAL wall-clock time when response arrives
|
||||
* 4. Fetches the actor state from the Rivet API to get connectable_ts
|
||||
* 5. Fetches Railway logs for the actor ID to find startup timestamps
|
||||
* 6. Writes a report with all timing data
|
||||
*/
|
||||
|
||||
import { execSync } from "child_process";
|
||||
import { writeFileSync } from "fs";
|
||||
|
||||
const RIVET_API = "https://api.rivet.dev";
|
||||
const NAMESPACE = "sandbox-agent-t2ta-prod-1ved";
|
||||
const TOKEN = "pk_qufWQ7qDoQge0B4iBjSbX1E2ygIfuUKZcFhBJ65jBFLzjHPjuiLIgwbtOv6BJwZP";
|
||||
const REPORT_PATH = "/Users/nathan/sandbox-agent/.agents/notes/wakeup-timing-report.md";
|
||||
|
||||
// Known actor configs to try waking
|
||||
const ACTOR_CONFIGS = [
|
||||
{ name: "auditLog", key: "org/test-wake-1/audit-log", label: "auditLog (test-wake-1)" },
|
||||
{ name: "auditLog", key: "org/test-wake-2/audit-log", label: "auditLog (test-wake-2)" },
|
||||
{ name: "auditLog", key: "org/test-wake-3/audit-log", label: "auditLog (test-wake-3)" },
|
||||
{ name: "task", key: "org/rivet-dev/task/71d7fa2abec273e5/8f5265b4-297e-47ab-b8af-d54c0fe7e98c", label: "task (rivet-dev/71d7...)" },
|
||||
{ name: "task", key: "org/rivet-dev/task/d49a32ea4570b3fa/ccd735aa-06bf-437b-823e-24f8c230743b", label: "task (rivet-dev/d49a...)" },
|
||||
{ name: "organization", key: "org/app", label: "org/app (app shell)" },
|
||||
{ name: "organization", key: "org/rivet-dev", label: "org/rivet-dev" },
|
||||
];
|
||||
|
||||
interface ActorState {
|
||||
actor: {
|
||||
actor_id: string;
|
||||
name: string;
|
||||
key: string;
|
||||
create_ts: number;
|
||||
start_ts: number | null;
|
||||
pending_allocation_ts: number | null;
|
||||
connectable_ts: number | null;
|
||||
sleep_ts: number | null;
|
||||
reschedule_ts: number | null;
|
||||
destroy_ts: number | null;
|
||||
};
|
||||
created: boolean;
|
||||
}
|
||||
|
||||
async function getOrCreateActor(name: string, key: string): Promise<ActorState> {
|
||||
const res = await fetch(`${RIVET_API}/actors?namespace=${NAMESPACE}`, {
|
||||
method: "PUT",
|
||||
headers: {
|
||||
Authorization: `Bearer ${TOKEN}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
name,
|
||||
key,
|
||||
runner_name_selector: "default",
|
||||
input: "Y2FwcA==",
|
||||
crash_policy: "sleep",
|
||||
}),
|
||||
});
|
||||
if (!res.ok) {
|
||||
throw new Error(`getOrCreate failed: ${res.status} ${await res.text()}`);
|
||||
}
|
||||
return res.json() as Promise<ActorState>;
|
||||
}
|
||||
|
||||
async function pingHealth(actorId: string): Promise<{
|
||||
status: number;
|
||||
body: string;
|
||||
durationMs: number;
|
||||
localRequestStartMs: number;
|
||||
localResponseEndMs: number;
|
||||
}> {
|
||||
const localRequestStartMs = Date.now();
|
||||
const start = performance.now();
|
||||
const res = await fetch(`${RIVET_API}/gateway/${actorId}@${TOKEN}/health`, { method: "GET" });
|
||||
const body = await res.text();
|
||||
const durationMs = performance.now() - start;
|
||||
const localResponseEndMs = Date.now();
|
||||
return { status: res.status, body, durationMs, localRequestStartMs, localResponseEndMs };
|
||||
}
|
||||
|
||||
function getRailwayLogs(lines: number = 500): string {
|
||||
try {
|
||||
return execSync(`cd /Users/nathan/sandbox-agent/foundry && railway logs --deployment --lines ${lines}`, { encoding: "utf-8", timeout: 30_000 });
|
||||
} catch (e: any) {
|
||||
return e.stdout || e.message || "Failed to fetch Railway logs";
|
||||
}
|
||||
}
|
||||
|
||||
function extractActorLogs(allLogs: string, actorId: string): string[] {
|
||||
return allLogs.split("\n").filter((line) => line.includes(actorId));
|
||||
}
|
||||
|
||||
function formatTs(ts: number | null): string {
|
||||
if (ts === null) return "null";
|
||||
return `${new Date(ts).toISOString()} (${ts})`;
|
||||
}
|
||||
|
||||
async function measureWakeup() {
|
||||
const report: string[] = [];
|
||||
report.push("# Actor Wake-Up Timing Report");
|
||||
report.push(`**Generated:** ${new Date().toISOString()}`);
|
||||
report.push("");
|
||||
|
||||
// Step 1: Find a sleeping actor
|
||||
console.log("Step 1: Finding a sleeping actor...");
|
||||
let sleepingActor: ActorState | null = null;
|
||||
let actorLabel = "";
|
||||
|
||||
for (const config of ACTOR_CONFIGS) {
|
||||
console.log(` Checking ${config.label}...`);
|
||||
try {
|
||||
const state = await getOrCreateActor(config.name, config.key);
|
||||
console.log(` actor_id=${state.actor.actor_id} sleep_ts=${state.actor.sleep_ts} connectable_ts=${state.actor.connectable_ts}`);
|
||||
if (state.actor.sleep_ts !== null && state.actor.connectable_ts === null) {
|
||||
sleepingActor = state;
|
||||
actorLabel = config.label;
|
||||
console.log(` Found sleeping actor: ${config.label}`);
|
||||
break;
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(` Error: ${e}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (!sleepingActor) {
|
||||
console.log("No sleeping actors found. Waiting 45s for first actor to go back to sleep...");
|
||||
const config = ACTOR_CONFIGS[0]!;
|
||||
const state = await getOrCreateActor(config.name, config.key);
|
||||
if (state.actor.connectable_ts !== null) {
|
||||
console.log(`Actor ${config.label} is awake. Waiting 45s...`);
|
||||
await new Promise((r) => setTimeout(r, 45_000));
|
||||
const recheck = await getOrCreateActor(config.name, config.key);
|
||||
sleepingActor = recheck;
|
||||
actorLabel = config.label;
|
||||
if (recheck.actor.sleep_ts !== null && recheck.actor.connectable_ts === null) {
|
||||
console.log("Actor went back to sleep.");
|
||||
} else {
|
||||
console.log(`Actor still awake. Proceeding anyway.`);
|
||||
}
|
||||
} else {
|
||||
sleepingActor = state;
|
||||
actorLabel = config.label;
|
||||
}
|
||||
}
|
||||
|
||||
const actorId = sleepingActor.actor.actor_id;
|
||||
const wasSleeping = sleepingActor.actor.sleep_ts !== null && sleepingActor.actor.connectable_ts === null;
|
||||
|
||||
report.push(`## Target Actor`);
|
||||
report.push(`- **Label:** ${actorLabel}`);
|
||||
report.push(`- **Actor ID:** ${actorId}`);
|
||||
report.push(`- **Was sleeping:** ${wasSleeping}`);
|
||||
report.push(`- **State before wake:**`);
|
||||
report.push(` - create_ts: ${formatTs(sleepingActor.actor.create_ts)}`);
|
||||
report.push(` - start_ts: ${formatTs(sleepingActor.actor.start_ts)}`);
|
||||
report.push(` - connectable_ts: ${formatTs(sleepingActor.actor.connectable_ts)}`);
|
||||
report.push(` - sleep_ts: ${formatTs(sleepingActor.actor.sleep_ts)}`);
|
||||
report.push(` - pending_allocation_ts: ${formatTs(sleepingActor.actor.pending_allocation_ts)}`);
|
||||
report.push("");
|
||||
|
||||
// Step 2: Ping /health to wake the actor
|
||||
console.log("\nStep 2: Pinging /health to wake actor...");
|
||||
|
||||
const healthResult = await pingHealth(actorId);
|
||||
|
||||
console.log(` LOCAL request start: ${new Date(healthResult.localRequestStartMs).toISOString()} (${healthResult.localRequestStartMs})`);
|
||||
console.log(` LOCAL response end: ${new Date(healthResult.localResponseEndMs).toISOString()} (${healthResult.localResponseEndMs})`);
|
||||
console.log(` Duration: ${healthResult.durationMs.toFixed(0)}ms`);
|
||||
console.log(` Response status: ${healthResult.status}`);
|
||||
console.log(` Response body: ${healthResult.body.substring(0, 300)}`);
|
||||
|
||||
report.push(`## Health Endpoint Timing`);
|
||||
report.push(`- **Endpoint:** GET /gateway/${actorId}@.../health`);
|
||||
report.push(`- **LOCAL request start:** ${formatTs(healthResult.localRequestStartMs)}`);
|
||||
report.push(`- **LOCAL response end:** ${formatTs(healthResult.localResponseEndMs)}`);
|
||||
report.push(`- **Total round-trip:** ${healthResult.durationMs.toFixed(0)}ms`);
|
||||
report.push(`- **HTTP status:** ${healthResult.status}`);
|
||||
report.push(`- **Response:** \`${healthResult.body.substring(0, 300)}\``);
|
||||
report.push("");
|
||||
|
||||
// Step 3: Fetch actor state after wake to get new connectable_ts
|
||||
console.log("\nStep 3: Fetching actor state after wake...");
|
||||
await new Promise((r) => setTimeout(r, 500));
|
||||
|
||||
const afterState = await getOrCreateActor(sleepingActor.actor.name, sleepingActor.actor.key);
|
||||
console.log(` connectable_ts: ${afterState.actor.connectable_ts}`);
|
||||
console.log(` sleep_ts: ${afterState.actor.sleep_ts}`);
|
||||
console.log(` start_ts: ${afterState.actor.start_ts}`);
|
||||
|
||||
report.push(`## Actor State After Wake`);
|
||||
report.push(`- start_ts: ${formatTs(afterState.actor.start_ts)}`);
|
||||
report.push(`- connectable_ts: ${formatTs(afterState.actor.connectable_ts)}`);
|
||||
report.push(`- sleep_ts: ${formatTs(afterState.actor.sleep_ts)}`);
|
||||
report.push(`- pending_allocation_ts: ${formatTs(afterState.actor.pending_allocation_ts)}`);
|
||||
report.push("");
|
||||
|
||||
// Step 4: Compute timing deltas
|
||||
report.push(`## Timing Analysis`);
|
||||
|
||||
const localStart = healthResult.localRequestStartMs;
|
||||
const localEnd = healthResult.localResponseEndMs;
|
||||
|
||||
if (wasSleeping && afterState.actor.connectable_ts) {
|
||||
const sleepTs = sleepingActor.actor.sleep_ts!;
|
||||
const connectableTs = afterState.actor.connectable_ts;
|
||||
|
||||
const requestToConnectable = connectableTs - localStart;
|
||||
const sleepToConnectable = connectableTs - sleepTs;
|
||||
const connectableToResponse = localEnd - connectableTs;
|
||||
|
||||
report.push(`### Key Deltas`);
|
||||
report.push(`| Metric | Value |`);
|
||||
report.push(`|--------|-------|`);
|
||||
report.push(`| LOCAL request start → LOCAL response end (total round-trip) | ${healthResult.durationMs.toFixed(0)}ms |`);
|
||||
report.push(`| LOCAL request start → connectable_ts (network hop to engine + engine wake) | ${requestToConnectable}ms |`);
|
||||
report.push(`| connectable_ts → LOCAL response end (KV reads + /health + network hop back) | ${connectableToResponse}ms |`);
|
||||
report.push(`| sleep_ts → connectable_ts (time actor was asleep before our request) | ${sleepToConnectable}ms |`);
|
||||
report.push("");
|
||||
|
||||
report.push(`### Timeline`);
|
||||
report.push("```");
|
||||
report.push(`${formatTs(sleepTs)} - Actor went to sleep (ENGINE timestamp)`);
|
||||
report.push(`${formatTs(localStart)} - LOCAL: HTTP request sent to gateway`);
|
||||
report.push(`${formatTs(connectableTs)} - ENGINE: connectable_ts set (actor allocated to runner)`);
|
||||
report.push(`${formatTs(localEnd)} - LOCAL: HTTP response received`);
|
||||
report.push("```");
|
||||
report.push("");
|
||||
report.push(`**Note:** LOCAL vs ENGINE timestamps include clock skew + network latency.`);
|
||||
report.push("");
|
||||
} else {
|
||||
report.push(`Actor was not sleeping or connectable_ts not set after wake.`);
|
||||
report.push(`- wasSleeping: ${wasSleeping}`);
|
||||
report.push(`- afterState.connectable_ts: ${afterState.actor.connectable_ts}`);
|
||||
report.push("");
|
||||
}
|
||||
|
||||
// Step 5: Fetch Railway logs
|
||||
console.log("\nStep 4: Fetching Railway logs...");
|
||||
const railwayLogs = getRailwayLogs(500);
|
||||
const actorLogs = extractActorLogs(railwayLogs, actorId);
|
||||
console.log(` Found ${actorLogs.length} log lines mentioning actor ${actorId}`);
|
||||
|
||||
const startupKeywords = ["CommandStartActor", "ActorStateRunning", "starting actor", "kv", "sleep", "wake", "connectable", actorId];
|
||||
|
||||
const relevantLogs = railwayLogs
|
||||
.split("\n")
|
||||
.filter((line) => startupKeywords.some((kw) => line.toLowerCase().includes(kw.toLowerCase())))
|
||||
.slice(-50);
|
||||
|
||||
report.push(`## Railway Logs`);
|
||||
report.push(`### Lines mentioning actor ID (${actorId})`);
|
||||
if (actorLogs.length > 0) {
|
||||
report.push("```");
|
||||
for (const line of actorLogs.slice(-30)) {
|
||||
report.push(line);
|
||||
}
|
||||
report.push("```");
|
||||
} else {
|
||||
report.push("*No log lines found mentioning the actor ID directly.*");
|
||||
}
|
||||
report.push("");
|
||||
|
||||
report.push(`### Startup-related log lines (last 50)`);
|
||||
if (relevantLogs.length > 0) {
|
||||
report.push("```");
|
||||
for (const line of relevantLogs) {
|
||||
report.push(line);
|
||||
}
|
||||
report.push("```");
|
||||
} else {
|
||||
report.push("*No startup-related log lines found.*");
|
||||
}
|
||||
report.push("");
|
||||
|
||||
// Step 6: Poll actor state
|
||||
console.log("\nStep 5: Polling actor state over next 5 seconds...");
|
||||
report.push(`## Actor State Polling (post-wake)`);
|
||||
report.push(`| Time | connectable_ts | sleep_ts |`);
|
||||
report.push(`|------|---------------|----------|`);
|
||||
|
||||
for (let i = 0; i < 5; i++) {
|
||||
const pollState = await getOrCreateActor(sleepingActor.actor.name, sleepingActor.actor.key);
|
||||
const now = new Date().toISOString();
|
||||
report.push(`| ${now} | ${formatTs(pollState.actor.connectable_ts)} | ${formatTs(pollState.actor.sleep_ts)} |`);
|
||||
await new Promise((r) => setTimeout(r, 1000));
|
||||
}
|
||||
report.push("");
|
||||
|
||||
// Write report
|
||||
const reportContent = report.join("\n");
|
||||
writeFileSync(REPORT_PATH, reportContent);
|
||||
console.log(`\nReport written to: ${REPORT_PATH}`);
|
||||
console.log("\n--- Report Preview ---");
|
||||
console.log(reportContent);
|
||||
}
|
||||
|
||||
measureWakeup().catch((err) => {
|
||||
console.error("Fatal error:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
179
foundry/scripts/mem-monitor.sh
Executable file
179
foundry/scripts/mem-monitor.sh
Executable file
|
|
@ -0,0 +1,179 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Memory monitor for Foundry backend.
|
||||
# Polls /debug/memory and actor counts every N seconds, writes TSV + heap
|
||||
# snapshots to a timestamped output directory.
|
||||
#
|
||||
# Usage:
|
||||
# ./foundry/scripts/mem-monitor.sh [interval_seconds] [backend_url]
|
||||
#
|
||||
# Defaults: interval=5s, backend=http://127.0.0.1:7741
|
||||
# Output: foundry/.foundry/mem-monitor/<run-timestamp>/
|
||||
#
|
||||
set -euo pipefail
|
||||
|
||||
INTERVAL="${1:-5}"
|
||||
BACKEND="${2:-http://127.0.0.1:7741}"
|
||||
RIVETKIT="${3:-http://127.0.0.1:6420}"
|
||||
|
||||
RUN_TS="$(date +%Y%m%dT%H%M%S)"
|
||||
OUT_DIR="foundry/.foundry/mem-monitor/$RUN_TS"
|
||||
mkdir -p "$OUT_DIR"
|
||||
|
||||
MEMORY_TSV="$OUT_DIR/memory.tsv"
|
||||
ACTORS_TSV="$OUT_DIR/actors.tsv"
|
||||
EVENTS_LOG="$OUT_DIR/events.log"
|
||||
HEAP_DIR="$OUT_DIR/heaps"
|
||||
mkdir -p "$HEAP_DIR"
|
||||
|
||||
# Column headers
|
||||
printf "timestamp\telapsed_s\trss_mb\theap_used_mb\theap_total_mb\texternal_mb\tnon_heap_mb\n" > "$MEMORY_TSV"
|
||||
printf "timestamp\telapsed_s\torganization\ttask\ttask_sandbox\tuser\tgithub_data\taudit_log\ttotal\n" > "$ACTORS_TSV"
|
||||
|
||||
echo "=== Foundry Memory Monitor ==="
|
||||
echo " Interval: ${INTERVAL}s"
|
||||
echo " Backend: $BACKEND"
|
||||
echo " RivetKit: $RIVETKIT"
|
||||
echo " Output: $OUT_DIR"
|
||||
echo ""
|
||||
|
||||
START_EPOCH="$(date +%s)"
|
||||
TICK=0
|
||||
PREV_RSS=0
|
||||
|
||||
# Record baseline heap snapshot
|
||||
echo "[$(date +%H:%M:%S)] Recording baseline heap snapshot..."
|
||||
baseline_resp=$(curl -sf "${BACKEND}/debug/memory?gc=1&heap=1" 2>/dev/null || echo '{}')
|
||||
baseline_path=$(echo "$baseline_resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('heapSnapshotPath',''))" 2>/dev/null || true)
|
||||
if [[ -n "$baseline_path" ]]; then
|
||||
docker cp "foundry-backend-1:${baseline_path}" "$HEAP_DIR/baseline.json" 2>/dev/null && \
|
||||
echo "[$(date +%H:%M:%S)] Baseline heap snapshot saved to $HEAP_DIR/baseline.json" || true
|
||||
fi
|
||||
|
||||
# Analyze WASM instances in a heap snapshot file
|
||||
analyze_heap() {
|
||||
local heap_file="$1"
|
||||
python3 << PYEOF
|
||||
import json
|
||||
with open("$heap_file") as f:
|
||||
snap = json.load(f)
|
||||
strings = snap["strings"]
|
||||
nodes = snap["nodes"]
|
||||
fpn = len(snap["snapshot"]["meta"]["node_fields"])
|
||||
total = len(nodes) // fpn
|
||||
wasm_inst = 0; sqlite_vfs = 0; big_ab = 0; big_ab_bytes = 0
|
||||
for i in range(total):
|
||||
b = i * fpn
|
||||
name = strings[nodes[b+1]]
|
||||
size = nodes[b+3]
|
||||
if name == "WebAssembly.Instance": wasm_inst += 1
|
||||
if name == "SqliteVfs": sqlite_vfs += 1
|
||||
if name == "ArrayBuffer" and size > 10*1024*1024:
|
||||
big_ab += 1; big_ab_bytes += size
|
||||
print(f"wasm_instances={wasm_inst} sqlite_vfs={sqlite_vfs} big_arraybuffers={big_ab} wasm_heap_mb={big_ab_bytes/1024/1024:.1f}")
|
||||
PYEOF
|
||||
}
|
||||
|
||||
if [[ -f "$HEAP_DIR/baseline.json" ]]; then
|
||||
baseline_wasm=$(analyze_heap "$HEAP_DIR/baseline.json")
|
||||
echo "[$(date +%H:%M:%S)] Baseline WASM: $baseline_wasm"
|
||||
echo "$(date +%H:%M:%S) BASELINE wasm: $baseline_wasm" >> "$EVENTS_LOG"
|
||||
fi
|
||||
|
||||
# Record baseline actor counts
|
||||
get_actor_counts() {
|
||||
local counts=""
|
||||
local total=0
|
||||
for name in organization task taskSandbox user github-data audit-log; do
|
||||
# Try without namespace (file-system driver), then with namespace (engine driver)
|
||||
c=$(curl -sf "${RIVETKIT}/actors?name=$name" 2>/dev/null \
|
||||
| python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('actors',d) if isinstance(d,dict) else d))" 2>/dev/null)
|
||||
if [[ -z "$c" || "$c" == "0" ]]; then
|
||||
c=$(curl -sf "${RIVETKIT}/actors?name=$name&namespace=default" 2>/dev/null \
|
||||
| python3 -c "import json,sys; d=json.load(sys.stdin); print(len(d.get('actors',d) if isinstance(d,dict) else d))" 2>/dev/null || echo "0")
|
||||
fi
|
||||
counts="${counts}\t${c}"
|
||||
total=$((total + c))
|
||||
done
|
||||
counts="${counts}\t${total}"
|
||||
echo -e "$counts"
|
||||
}
|
||||
|
||||
baseline_actors=$(get_actor_counts)
|
||||
echo "[$(date +%H:%M:%S)] Baseline actors: $baseline_actors"
|
||||
echo "$(date +%H:%M:%S) BASELINE actors:$baseline_actors" >> "$EVENTS_LOG"
|
||||
|
||||
# Print baseline memory
|
||||
baseline_mem=$(curl -sf "${BACKEND}/debug/memory?gc=1" 2>/dev/null || echo '{}')
|
||||
baseline_rss=$(echo "$baseline_mem" | python3 -c "import json,sys; print(json.load(sys.stdin).get('rssMb',0))" 2>/dev/null || echo "?")
|
||||
echo "[$(date +%H:%M:%S)] Baseline RSS (after GC): ${baseline_rss} MB"
|
||||
echo "$(date +%H:%M:%S) BASELINE rss=${baseline_rss}MB" >> "$EVENTS_LOG"
|
||||
echo ""
|
||||
echo "[$(date +%H:%M:%S)] Monitoring started. Press Ctrl+C to stop."
|
||||
echo ""
|
||||
|
||||
# Spike detection state
|
||||
PEAK_RSS=0
|
||||
SPIKE_HEAP_TAKEN=0
|
||||
SPIKE_THRESHOLD_MB=100 # delta from baseline to trigger heap snapshot
|
||||
|
||||
while true; do
|
||||
NOW="$(date +%H:%M:%S)"
|
||||
ELAPSED=$(( $(date +%s) - START_EPOCH ))
|
||||
TICK=$((TICK + 1))
|
||||
|
||||
# Memory poll (no GC — we want to see real usage)
|
||||
mem_json=$(curl -sf "${BACKEND}/debug/memory" 2>/dev/null || echo '{}')
|
||||
rss=$(echo "$mem_json" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('rssMb',0))" 2>/dev/null || echo 0)
|
||||
heap_used=$(echo "$mem_json" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('heapUsedMb',0))" 2>/dev/null || echo 0)
|
||||
heap_total=$(echo "$mem_json" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('heapTotalMb',0))" 2>/dev/null || echo 0)
|
||||
external=$(echo "$mem_json" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('externalMb',0))" 2>/dev/null || echo 0)
|
||||
non_heap=$(echo "$mem_json" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('nonHeapMb',0))" 2>/dev/null || echo 0)
|
||||
|
||||
printf "%s\t%d\t%s\t%s\t%s\t%s\t%s\n" "$NOW" "$ELAPSED" "$rss" "$heap_used" "$heap_total" "$external" "$non_heap" >> "$MEMORY_TSV"
|
||||
|
||||
delta=$((rss - PREV_RSS))
|
||||
PREV_RSS=$rss
|
||||
|
||||
# Track peak
|
||||
if [[ "$rss" -gt "$PEAK_RSS" ]]; then
|
||||
PEAK_RSS=$rss
|
||||
fi
|
||||
|
||||
# Print live status
|
||||
printf "\r[%s] +%4ds RSS: %4s MB (Δ%+d) heap: %4s MB ext: %4s MB peak: %4s MB" \
|
||||
"$NOW" "$ELAPSED" "$rss" "$delta" "$heap_used" "$external" "$PEAK_RSS"
|
||||
|
||||
# Auto-capture heap snapshot on spike
|
||||
spike_delta=$((rss - baseline_rss))
|
||||
if [[ "$spike_delta" -gt "$SPIKE_THRESHOLD_MB" && "$SPIKE_HEAP_TAKEN" -eq 0 ]]; then
|
||||
SPIKE_HEAP_TAKEN=1
|
||||
echo ""
|
||||
echo "[${NOW}] SPIKE DETECTED: RSS=${rss}MB (+${spike_delta}MB from baseline). Capturing heap snapshot..."
|
||||
spike_resp=$(curl -sf "${BACKEND}/debug/memory?heap=1" 2>/dev/null || echo '{}')
|
||||
spike_path=$(echo "$spike_resp" | python3 -c "import json,sys; print(json.load(sys.stdin).get('heapSnapshotPath',''))" 2>/dev/null || true)
|
||||
if [[ -n "$spike_path" ]]; then
|
||||
docker cp "foundry-backend-1:${spike_path}" "$HEAP_DIR/spike-${NOW}.json" 2>/dev/null && \
|
||||
echo "[${NOW}] Spike heap snapshot saved to $HEAP_DIR/spike-${NOW}.json" || true
|
||||
spike_wasm=$(analyze_heap "$HEAP_DIR/spike-${NOW}.json" 2>/dev/null || echo "analysis failed")
|
||||
echo "[${NOW}] Spike WASM: $spike_wasm"
|
||||
echo "${NOW} SPIKE rss=${rss}MB delta=+${spike_delta}MB wasm: $spike_wasm" >> "$EVENTS_LOG"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Reset spike detection when RSS drops back near baseline
|
||||
if [[ "$spike_delta" -lt 50 && "$SPIKE_HEAP_TAKEN" -eq 1 ]]; then
|
||||
SPIKE_HEAP_TAKEN=0
|
||||
echo ""
|
||||
echo "[${NOW}] RSS returned near baseline (${rss}MB). Spike detection re-armed."
|
||||
echo "${NOW} SPIKE_RESET rss=${rss}MB" >> "$EVENTS_LOG"
|
||||
fi
|
||||
|
||||
# Actor counts every 6th tick (every 30s at default interval)
|
||||
if [[ $((TICK % 6)) -eq 0 ]]; then
|
||||
actor_counts=$(get_actor_counts)
|
||||
printf "%s\t%d%s\n" "$NOW" "$ELAPSED" "$actor_counts" >> "$ACTORS_TSV"
|
||||
fi
|
||||
|
||||
sleep "$INTERVAL"
|
||||
done
|
||||
Loading…
Add table
Add a link
Reference in a new issue