mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 20:03:11 +00:00
307 lines
11 KiB
TypeScript
307 lines
11 KiB
TypeScript
import { describe, expect, it } from "vitest";
|
|
import type { TaskWorkbenchSnapshot, WorkbenchAgentTab, WorkbenchTask, WorkbenchModelId, WorkbenchTranscriptEvent } from "@sandbox-agent/foundry-shared";
|
|
import { createBackendClient } from "../../src/backend-client.js";
|
|
|
|
// GitHub repository used by the load test when HF_E2E_GITHUB_REPO is unset or blank.
const DEFAULT_E2E_GITHUB_REPO = "rivet-dev/sandbox-agent-testing";
|
|
|
|
/**
 * Reads an environment variable that must be present.
 *
 * @param name - Name of the environment variable to look up.
 * @returns The variable's value with surrounding whitespace removed.
 * @throws Error when the variable is unset or contains only whitespace.
 */
function requiredEnv(name: string): string {
  const trimmed = (process.env[name] ?? "").trim();
  if (trimmed.length > 0) {
    return trimmed;
  }
  throw new Error(`Missing required env var: ${name}`);
}
|
|
|
|
/**
 * Resolves a workbench model id from an environment variable.
 *
 * The trimmed value is accepted only when it is one of the model ids this
 * test recognises; anything else (including an unset variable) yields the
 * fallback.
 *
 * @param name - Name of the environment variable to read.
 * @param fallback - Model id returned when the variable is unset or unrecognised.
 * @returns The recognised model id from the environment, or `fallback`.
 */
function workbenchModelEnv(name: string, fallback: WorkbenchModelId): WorkbenchModelId {
  const requested = process.env[name]?.trim();
  if (requested === "claude-sonnet-4" || requested === "claude-opus-4" || requested === "gpt-4o" || requested === "o3") {
    return requested;
  }
  return fallback;
}
|
|
|
|
/**
 * Reads a strictly positive integer from an environment variable.
 *
 * The value is parsed with base-10 `parseInt`, so a numeric prefix followed by
 * other characters (e.g. "12px") is accepted as its leading integer.
 *
 * @param name - Name of the environment variable to read.
 * @param fallback - Value returned when the variable is unset, blank,
 *   unparsable, non-finite, or not greater than zero.
 * @returns The parsed positive integer, or `fallback`.
 */
function intEnv(name: string, fallback: number): number {
  const text = process.env[name]?.trim();
  if (text) {
    const parsed = Number.parseInt(text, 10);
    if (Number.isFinite(parsed) && parsed > 0) {
      return parsed;
    }
  }
  return fallback;
}
|
|
|
|
/**
 * Resolves after roughly `ms` milliseconds, using `setTimeout`.
 *
 * @param ms - Delay in milliseconds.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(() => wake(), ms);
  });
}
|
|
|
|
/**
 * Repeatedly evaluates `fn` until `isDone` accepts its result.
 *
 * Each round fetches a value, returns it if accepted, otherwise checks the
 * deadline and then waits `intervalMs` before trying again. The deadline is
 * only checked after a rejected value, so `fn` always runs at least once.
 *
 * @param label - Description of what is being awaited; used in the timeout error.
 * @param timeoutMs - Maximum time to keep polling, measured from the first call.
 * @param intervalMs - Pause between attempts.
 * @param fn - Produces the value to test.
 * @param isDone - Returns true when the value is acceptable.
 * @returns The first value accepted by `isDone`.
 * @throws Error `timed out waiting for <label>` once the deadline has passed.
 */
async function poll<T>(label: string, timeoutMs: number, intervalMs: number, fn: () => Promise<T>, isDone: (value: T) => boolean): Promise<T> {
  const deadline = Date.now() + timeoutMs;

  while (true) {
    const current = await fn();
    if (isDone(current)) {
      return current;
    }
    if (Date.now() > deadline) {
      throw new Error(`timed out waiting for ${label}`);
    }
    await sleep(intervalMs);
  }
}
|
|
|
|
/**
 * Looks up a task in a workbench snapshot by id.
 *
 * @param snapshot - Snapshot whose `tasks` list is searched.
 * @param taskId - Id of the task to find.
 * @returns The first task whose `id` equals `taskId`.
 * @throws Error when no task in the snapshot has that id.
 */
function findTask(snapshot: TaskWorkbenchSnapshot, taskId: string): WorkbenchTask {
  const match = snapshot.tasks.find(({ id }) => id === taskId);
  if (match) {
    return match;
  }
  throw new Error(`task ${taskId} missing from snapshot`);
}
|
|
|
|
/**
 * Looks up an agent tab on a task by id.
 *
 * @param task - Task whose `tabs` list is searched.
 * @param tabId - Id of the tab to find.
 * @returns The first tab whose `id` equals `tabId`.
 * @throws Error naming both the tab and the task when the tab is absent.
 */
function findTab(task: WorkbenchTask, tabId: string): WorkbenchAgentTab {
  const match = task.tabs.find(({ id }) => id === tabId);
  if (match) {
    return match;
  }
  throw new Error(`tab ${tabId} missing from task ${task.id}`);
}
|
|
|
|
/**
 * Pulls a display string out of a transcript event's payload.
 *
 * Non-object payloads are stringified directly (null/undefined become "").
 * For object payloads the first matching source wins:
 *  1. `params.update.content.text` when it is a string — returned as-is, untrimmed.
 *  2. `params.text` when it is a non-blank string — trimmed.
 *  3. `params.prompt` when it is an array — each item's trimmed string `text`,
 *     blanks dropped, joined with newlines (may be an empty string).
 *  4. `result.text` when it is a non-blank string — trimmed.
 *  5. `method` when it is a string, otherwise the JSON-serialised payload.
 *
 * @param event - Transcript event whose `payload` is inspected.
 * @returns The extracted text.
 */
function extractEventText(event: WorkbenchTranscriptEvent): string {
  const body = event.payload;
  if (!body || typeof body !== "object") {
    return String(body ?? "");
  }

  // Narrows a value to a property bag when it is a non-null object.
  const asBag = (value: unknown): Record<string, unknown> | undefined =>
    value && typeof value === "object" ? (value as Record<string, unknown>) : undefined;

  const envelope = body as {
    method?: unknown;
    params?: unknown;
    result?: unknown;
  };

  const paramBag = asBag(envelope.params);
  if (paramBag) {
    // Streaming chunk: kept verbatim, including whitespace and empty strings.
    const chunk = asBag(asBag(paramBag.update)?.content)?.text;
    if (typeof chunk === "string") {
      return chunk;
    }

    const direct = paramBag.text;
    if (typeof direct === "string") {
      const trimmed = direct.trim();
      if (trimmed) {
        return trimmed;
      }
    }

    const prompt = paramBag.prompt;
    if (Array.isArray(prompt)) {
      const lines: string[] = [];
      for (const part of prompt) {
        const partText: unknown = part?.text;
        if (typeof partText === "string") {
          const trimmed = partText.trim();
          if (trimmed) {
            lines.push(trimmed);
          }
        }
      }
      return lines.join("\n");
    }
  }

  const resultBag = asBag(envelope.result);
  if (resultBag) {
    const resultText = resultBag.text;
    if (typeof resultText === "string") {
      const trimmed = resultText.trim();
      if (trimmed) {
        return trimmed;
      }
    }
  }

  const method = envelope.method;
  return typeof method === "string" ? method : JSON.stringify(body);
}
|
|
|
|
/**
 * Checks whether the agent's side of a transcript contains a given string.
 *
 * The text of every event whose `sender` is "agent" is concatenated in order
 * with no separator before searching, so a match may span several events.
 *
 * @param transcript - Events to scan.
 * @param expectedText - Substring to look for.
 * @returns True when the concatenated agent text includes `expectedText`.
 */
function transcriptIncludesAgentText(transcript: WorkbenchTranscriptEvent[], expectedText: string): boolean {
  let agentText = "";
  for (const entry of transcript) {
    if (entry.sender === "agent") {
      agentText += extractEventText(entry);
    }
  }
  return agentText.includes(expectedText);
}
|
|
|
|
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values - Numbers to average.
 * @returns The mean, or 0 for an empty list (the divisor is clamped to at least 1).
 */
function average(values: number[]): number {
  let total = 0;
  for (const value of values) {
    total += value;
  }
  const divisor = values.length > 1 ? values.length : 1;
  return total / divisor;
}
|
|
|
|
/**
 * Times repeated `getWorkbench` reads and sizes up the last snapshot returned.
 *
 * Reads are issued one after another. If no read happens (`iterations` is not
 * positive) an empty snapshot for the workspace is measured instead.
 *
 * @param client - Backend client used to fetch the workbench.
 * @param workspaceId - Workspace whose workbench is read.
 * @param iterations - Number of sequential reads to time.
 * @returns Rounded average and maximum read time in milliseconds, the UTF-8
 *   byte length of the JSON-serialised final snapshot, and its task, tab and
 *   transcript-event counts.
 */
async function measureWorkbenchSnapshot(
  client: ReturnType<typeof createBackendClient>,
  workspaceId: string,
  iterations: number,
): Promise<{
  avgMs: number;
  maxMs: number;
  payloadBytes: number;
  taskCount: number;
  tabCount: number;
  transcriptEventCount: number;
}> {
  const timings: number[] = [];
  let slowest = 0;
  let latest: TaskWorkbenchSnapshot | null = null;

  for (let remaining = iterations; remaining > 0; remaining -= 1) {
    const mark = performance.now();
    latest = await client.getWorkbench(workspaceId);
    const elapsed = performance.now() - mark;
    timings.push(elapsed);
    slowest = Math.max(slowest, elapsed);
  }

  // Fall back to an empty snapshot so the size/count maths below still works.
  const measured = latest ?? {
    workspaceId,
    repos: [],
    projects: [],
    tasks: [],
  };

  let tabCount = 0;
  let transcriptEventCount = 0;
  for (const task of measured.tasks) {
    tabCount += task.tabs.length;
    for (const tab of task.tabs) {
      transcriptEventCount += tab.transcript.length;
    }
  }

  return {
    avgMs: Math.round(average(timings)),
    maxMs: Math.round(slowest),
    payloadBytes: Buffer.byteLength(JSON.stringify(measured), "utf8"),
    taskCount: measured.tasks.length,
    tabCount,
    transcriptEventCount,
  };
}
|
|
|
|
/**
 * End-to-end load profile for the workbench API, run against a live backend.
 *
 * Configuration (environment variables, all trimmed):
 *  - HF_E2E_BACKEND_ENDPOINT      backend URL (default http://127.0.0.1:7741/api/rivet)
 *  - HF_E2E_WORKSPACE             workspace id (default "default")
 *  - HF_E2E_GITHUB_REPO           repo to add (default DEFAULT_E2E_GITHUB_REPO)
 *  - GITHUB_TOKEN                 required; only its presence is checked here
 *  - HF_E2E_MODEL                 model id (default "gpt-4o")
 *  - HF_LOAD_TASK_COUNT           number of tasks to create (default 3)
 *  - HF_LOAD_EXTRA_SESSION_COUNT  extra sessions per task (default 2)
 *  - HF_LOAD_POLL_INTERVAL_MS     polling interval (default 2000)
 *
 * Everything runs sequentially: for each task it creates the task, waits for
 * the initial agent reply, opens the extra sessions one at a time (sending a
 * message and waiting for the reply in each), then measures snapshot reads.
 * Per-task snapshot metrics and a final summary are logged via console.info.
 */
describe("e2e(client): workbench load", () => {
  it("runs a simple sequential load profile against the real backend", { timeout: 30 * 60_000 }, async () => {
    const endpoint = process.env.HF_E2E_BACKEND_ENDPOINT?.trim() || "http://127.0.0.1:7741/api/rivet";
    const workspaceId = process.env.HF_E2E_WORKSPACE?.trim() || "default";
    const repoRemote = process.env.HF_E2E_GITHUB_REPO?.trim() || DEFAULT_E2E_GITHUB_REPO;
    // Fail fast when the token is missing; the value itself is not used by this test.
    requiredEnv("GITHUB_TOKEN");
    const model = workbenchModelEnv("HF_E2E_MODEL", "gpt-4o");
    const taskCount = intEnv("HF_LOAD_TASK_COUNT", 3);
    const extraSessionCount = intEnv("HF_LOAD_EXTRA_SESSION_COUNT", 2);
    const pollIntervalMs = intEnv("HF_LOAD_POLL_INTERVAL_MS", 2_000);

    const client = createBackendClient({
      endpoint,
      defaultWorkspaceId: workspaceId,
    });

    const repo = await client.addRepo(workspaceId, repoRemote);
    const createTaskLatencies: number[] = [];
    const provisionLatencies: number[] = [];
    const createSessionLatencies: number[] = [];
    const messageRoundTripLatencies: number[] = [];
    const snapshotSeries: Array<{
      taskCount: number;
      avgMs: number;
      maxMs: number;
      payloadBytes: number;
      tabCount: number;
      transcriptEventCount: number;
    }> = [];

    // Baseline snapshot cost before this run adds any tasks.
    snapshotSeries.push(await measureWorkbenchSnapshot(client, workspaceId, 2));

    for (let taskIndex = 0; taskIndex < taskCount; taskIndex += 1) {
      // The timestamp suffix keeps titles, branches and reply markers distinct between runs.
      const runId = `load-${taskIndex}-${Date.now().toString(36)}`;
      const initialReply = `LOAD_INIT_${runId}`;

      const createStartedAt = performance.now();
      const created = await client.createWorkbenchTask(workspaceId, {
        repoId: repo.repoId,
        title: `Workbench Load ${runId}`,
        branch: `load/${runId}`,
        model,
        task: `Reply with exactly: ${initialReply}`,
      });
      createTaskLatencies.push(performance.now() - createStartedAt);

      // Provisioning is done once the task and its first tab are idle and the
      // agent has echoed the initial marker.
      const provisionStartedAt = performance.now();
      const provisioned = await poll(
        `task ${runId} provisioning`,
        12 * 60_000,
        pollIntervalMs,
        async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
        (task) => {
          const tab = task.tabs[0];
          return Boolean(tab && task.status === "idle" && tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, initialReply));
        },
      );
      provisionLatencies.push(performance.now() - provisionStartedAt);

      expect(provisioned.tabs.length).toBeGreaterThan(0);
      const primaryTab = provisioned.tabs[0]!;
      expect(transcriptIncludesAgentText(primaryTab.transcript, initialReply)).toBe(true);

      for (let sessionIndex = 0; sessionIndex < extraSessionCount; sessionIndex += 1) {
        const expectedReply = `LOAD_REPLY_${runId}_${sessionIndex}`;
        const createSessionStartedAt = performance.now();
        const createdSession = await client.createWorkbenchSession(workspaceId, {
          taskId: created.taskId,
          model,
        });
        createSessionLatencies.push(performance.now() - createSessionStartedAt);

        // Start the clock before sending so the round-trip figure covers the
        // send call itself as well as the wait for the agent's reply.
        const messageStartedAt = performance.now();
        await client.sendWorkbenchMessage(workspaceId, {
          taskId: created.taskId,
          tabId: createdSession.tabId,
          text: `Run pwd in the repo, then reply with exactly: ${expectedReply}`,
          attachments: [],
        });

        const withReply = await poll(
          `task ${runId} session ${sessionIndex} reply`,
          10 * 60_000,
          pollIntervalMs,
          async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
          (task) => {
            const tab = findTab(task, createdSession.tabId);
            return tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, expectedReply);
          },
        );
        messageRoundTripLatencies.push(performance.now() - messageStartedAt);

        expect(transcriptIncludesAgentText(findTab(withReply, createdSession.tabId).transcript, expectedReply)).toBe(true);
      }

      // Record how snapshot reads scale after each additional task.
      const snapshotMetrics = await measureWorkbenchSnapshot(client, workspaceId, 3);
      snapshotSeries.push(snapshotMetrics);
      console.info(
        "[workbench-load-snapshot]",
        JSON.stringify({
          taskIndex: taskIndex + 1,
          ...snapshotMetrics,
        }),
      );
    }

    // The baseline push above guarantees the series is non-empty.
    const firstSnapshot = snapshotSeries[0]!;
    const lastSnapshot = snapshotSeries[snapshotSeries.length - 1]!;
    const summary = {
      taskCount,
      extraSessionCount,
      createTaskAvgMs: Math.round(average(createTaskLatencies)),
      provisionAvgMs: Math.round(average(provisionLatencies)),
      createSessionAvgMs: Math.round(average(createSessionLatencies)),
      messageRoundTripAvgMs: Math.round(average(messageRoundTripLatencies)),
      snapshotReadBaselineAvgMs: firstSnapshot.avgMs,
      snapshotReadFinalAvgMs: lastSnapshot.avgMs,
      snapshotReadFinalMaxMs: lastSnapshot.maxMs,
      snapshotPayloadBaselineBytes: firstSnapshot.payloadBytes,
      snapshotPayloadFinalBytes: lastSnapshot.payloadBytes,
      snapshotTabFinalCount: lastSnapshot.tabCount,
      snapshotTranscriptFinalCount: lastSnapshot.transcriptEventCount,
    };

    console.info("[workbench-load-summary]", JSON.stringify(summary));

    // Every planned step must have produced exactly one latency sample.
    expect(createTaskLatencies.length).toBe(taskCount);
    expect(provisionLatencies.length).toBe(taskCount);
    expect(createSessionLatencies.length).toBe(taskCount * extraSessionCount);
    expect(messageRoundTripLatencies.length).toBe(taskCount * extraSessionCount);
  });
});
|