mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 07:04:48 +00:00
Surface aggregate status (error, provisioning, running, ready, no sandbox) as a colored pill in the transcript panel header. Integrates task runtime status, session status, and sandbox availability via the sandboxProcesses interest topic so the pill accurately reflects unreachable sandboxes. Includes mock tasks demonstrating error, provisioning, and running states, unit tests for deriveHeaderStatus, and workspace-dashboard integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
323 lines
11 KiB
TypeScript
323 lines
11 KiB
TypeScript
import { describe, expect, it } from "vitest";
|
|
import {
|
|
createFoundryLogger,
|
|
type TaskWorkbenchSnapshot,
|
|
type WorkbenchAgentTab,
|
|
type WorkbenchTask,
|
|
type WorkbenchModelId,
|
|
type WorkbenchTranscriptEvent,
|
|
} from "@sandbox-agent/foundry-shared";
|
|
import { createBackendClient } from "../../src/backend-client.js";
|
|
|
|
// Opt-in gate: the load e2e only runs when HF_ENABLE_DAEMON_WORKBENCH_LOAD_E2E=1,
// so a normal `vitest` run skips it entirely.
const RUN_WORKBENCH_LOAD_E2E = process.env.HF_ENABLE_DAEMON_WORKBENCH_LOAD_E2E === "1";

// Structured logger scoped to this suite; bindings tag every line emitted below.
const logger = createFoundryLogger({
  service: "foundry-client-e2e",
  bindings: {
    suite: "workbench-load",
  },
});
|
|
|
|
function requiredEnv(name: string): string {
|
|
const value = process.env[name]?.trim();
|
|
if (!value) {
|
|
throw new Error(`Missing required env var: ${name}`);
|
|
}
|
|
return value;
|
|
}
|
|
|
|
function workbenchModelEnv(name: string, fallback: WorkbenchModelId): WorkbenchModelId {
|
|
const value = process.env[name]?.trim();
|
|
switch (value) {
|
|
case "claude-sonnet-4":
|
|
case "claude-opus-4":
|
|
case "gpt-5.3-codex":
|
|
case "gpt-5.4":
|
|
case "gpt-5.2-codex":
|
|
case "gpt-5.1-codex-max":
|
|
case "gpt-5.2":
|
|
case "gpt-5.1-codex-mini":
|
|
return value;
|
|
default:
|
|
return fallback;
|
|
}
|
|
}
|
|
|
|
function intEnv(name: string, fallback: number): number {
|
|
const raw = process.env[name]?.trim();
|
|
if (!raw) {
|
|
return fallback;
|
|
}
|
|
const value = Number.parseInt(raw, 10);
|
|
return Number.isFinite(value) && value > 0 ? value : fallback;
|
|
}
|
|
|
|
async function sleep(ms: number): Promise<void> {
|
|
await new Promise((resolve) => setTimeout(resolve, ms));
|
|
}
|
|
|
|
async function poll<T>(label: string, timeoutMs: number, intervalMs: number, fn: () => Promise<T>, isDone: (value: T) => boolean): Promise<T> {
|
|
const startedAt = Date.now();
|
|
let lastValue: T;
|
|
|
|
for (;;) {
|
|
lastValue = await fn();
|
|
if (isDone(lastValue)) {
|
|
return lastValue;
|
|
}
|
|
if (Date.now() - startedAt > timeoutMs) {
|
|
throw new Error(`timed out waiting for ${label}`);
|
|
}
|
|
await sleep(intervalMs);
|
|
}
|
|
}
|
|
|
|
function findTask(snapshot: TaskWorkbenchSnapshot, taskId: string): WorkbenchTask {
|
|
const task = snapshot.tasks.find((candidate) => candidate.id === taskId);
|
|
if (!task) {
|
|
throw new Error(`task ${taskId} missing from snapshot`);
|
|
}
|
|
return task;
|
|
}
|
|
|
|
function findTab(task: WorkbenchTask, tabId: string): WorkbenchAgentTab {
|
|
const tab = task.tabs.find((candidate) => candidate.id === tabId);
|
|
if (!tab) {
|
|
throw new Error(`tab ${tabId} missing from task ${task.id}`);
|
|
}
|
|
return tab;
|
|
}
|
|
|
|
function extractEventText(event: WorkbenchTranscriptEvent): string {
|
|
const payload = event.payload;
|
|
if (!payload || typeof payload !== "object") {
|
|
return String(payload ?? "");
|
|
}
|
|
|
|
const envelope = payload as {
|
|
method?: unknown;
|
|
params?: unknown;
|
|
result?: unknown;
|
|
};
|
|
|
|
const params = envelope.params;
|
|
if (params && typeof params === "object") {
|
|
const update = (params as { update?: unknown }).update;
|
|
if (update && typeof update === "object") {
|
|
const content = (update as { content?: unknown }).content;
|
|
if (content && typeof content === "object") {
|
|
const chunkText = (content as { text?: unknown }).text;
|
|
if (typeof chunkText === "string") {
|
|
return chunkText;
|
|
}
|
|
}
|
|
}
|
|
|
|
const text = (params as { text?: unknown }).text;
|
|
if (typeof text === "string" && text.trim()) {
|
|
return text.trim();
|
|
}
|
|
|
|
const prompt = (params as { prompt?: Array<{ text?: unknown }> }).prompt;
|
|
if (Array.isArray(prompt)) {
|
|
return prompt
|
|
.map((item) => (typeof item?.text === "string" ? item.text.trim() : ""))
|
|
.filter(Boolean)
|
|
.join("\n");
|
|
}
|
|
}
|
|
|
|
const result = envelope.result;
|
|
if (result && typeof result === "object") {
|
|
const text = (result as { text?: unknown }).text;
|
|
if (typeof text === "string" && text.trim()) {
|
|
return text.trim();
|
|
}
|
|
}
|
|
|
|
return typeof envelope.method === "string" ? envelope.method : JSON.stringify(payload);
|
|
}
|
|
|
|
function transcriptIncludesAgentText(transcript: WorkbenchTranscriptEvent[], expectedText: string): boolean {
|
|
return transcript
|
|
.filter((event) => event.sender === "agent")
|
|
.map((event) => extractEventText(event))
|
|
.join("")
|
|
.includes(expectedText);
|
|
}
|
|
|
|
function average(values: number[]): number {
|
|
return values.reduce((sum, value) => sum + value, 0) / Math.max(values.length, 1);
|
|
}
|
|
|
|
async function measureWorkbenchSnapshot(
|
|
client: ReturnType<typeof createBackendClient>,
|
|
workspaceId: string,
|
|
iterations: number,
|
|
): Promise<{
|
|
avgMs: number;
|
|
maxMs: number;
|
|
payloadBytes: number;
|
|
taskCount: number;
|
|
tabCount: number;
|
|
transcriptEventCount: number;
|
|
}> {
|
|
const durations: number[] = [];
|
|
let snapshot: TaskWorkbenchSnapshot | null = null;
|
|
|
|
for (let index = 0; index < iterations; index += 1) {
|
|
const startedAt = performance.now();
|
|
snapshot = await client.getWorkbench(workspaceId);
|
|
durations.push(performance.now() - startedAt);
|
|
}
|
|
|
|
const finalSnapshot = snapshot ?? {
|
|
workspaceId,
|
|
repos: [],
|
|
projects: [],
|
|
tasks: [],
|
|
};
|
|
const payloadBytes = Buffer.byteLength(JSON.stringify(finalSnapshot), "utf8");
|
|
const tabCount = finalSnapshot.tasks.reduce((sum, task) => sum + task.tabs.length, 0);
|
|
const transcriptEventCount = finalSnapshot.tasks.reduce((sum, task) => sum + task.tabs.reduce((tabSum, tab) => tabSum + tab.transcript.length, 0), 0);
|
|
|
|
return {
|
|
avgMs: Math.round(average(durations)),
|
|
maxMs: Math.round(Math.max(...durations, 0)),
|
|
payloadBytes,
|
|
taskCount: finalSnapshot.tasks.length,
|
|
tabCount,
|
|
transcriptEventCount,
|
|
};
|
|
}
|
|
|
|
describe("e2e(client): workbench load", () => {
  // Sequential load profile against a real backend: create N tasks, fan out
  // extra sessions on each, and sample workbench-snapshot read cost as load
  // grows. Skipped unless the opt-in env flag is set; 30-minute budget
  // because each task waits on real sandbox provisioning.
  it.skipIf(!RUN_WORKBENCH_LOAD_E2E)("runs a simple sequential load profile against the real backend", { timeout: 30 * 60_000 }, async () => {
    // Connection + load-shape knobs, all overridable from the environment.
    const endpoint = process.env.HF_E2E_BACKEND_ENDPOINT?.trim() || "http://127.0.0.1:7741/v1/rivet";
    const workspaceId = process.env.HF_E2E_WORKSPACE?.trim() || "default";
    const repoRemote = requiredEnv("HF_E2E_GITHUB_REPO");
    const model = workbenchModelEnv("HF_E2E_MODEL", "gpt-5.3-codex");
    const taskCount = intEnv("HF_LOAD_TASK_COUNT", 3);
    const extraSessionCount = intEnv("HF_LOAD_EXTRA_SESSION_COUNT", 2);
    const pollIntervalMs = intEnv("HF_LOAD_POLL_INTERVAL_MS", 2_000);

    const client = createBackendClient({
      endpoint,
      defaultWorkspaceId: workspaceId,
    });

    const repo = await client.addRepo(workspaceId, repoRemote);
    // Latency accumulators: one sample per operation performed below.
    const createTaskLatencies: number[] = [];
    const provisionLatencies: number[] = [];
    const createSessionLatencies: number[] = [];
    const messageRoundTripLatencies: number[] = [];
    // Snapshot-read metrics captured at baseline and after each task.
    const snapshotSeries: Array<{
      taskCount: number;
      avgMs: number;
      maxMs: number;
      payloadBytes: number;
      tabCount: number;
      transcriptEventCount: number;
    }> = [];

    // Baseline snapshot-read cost before any load is applied.
    snapshotSeries.push(await measureWorkbenchSnapshot(client, workspaceId, 2));

    for (let taskIndex = 0; taskIndex < taskCount; taskIndex += 1) {
      // Per-run sentinel strings let us detect the agent's exact replies
      // in the transcript without ambiguity across runs.
      const runId = `load-${taskIndex}-${Date.now().toString(36)}`;
      const initialReply = `LOAD_INIT_${runId}`;

      const createStartedAt = performance.now();
      const created = await client.createWorkbenchTask(workspaceId, {
        repoId: repo.repoId,
        title: `Workbench Load ${runId}`,
        branch: `load/${runId}`,
        model,
        task: `Reply with exactly: ${initialReply}`,
      });
      createTaskLatencies.push(performance.now() - createStartedAt);

      // Wait (up to 12 min) until the task and its first tab are idle and
      // the agent has echoed the sentinel — i.e. provisioning completed.
      const provisionStartedAt = performance.now();
      const provisioned = await poll(
        `task ${runId} provisioning`,
        12 * 60_000,
        pollIntervalMs,
        async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
        (task) => {
          const tab = task.tabs[0];
          return Boolean(tab && task.status === "idle" && tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, initialReply));
        },
      );
      provisionLatencies.push(performance.now() - provisionStartedAt);

      expect(provisioned.tabs.length).toBeGreaterThan(0);
      const primaryTab = provisioned.tabs[0]!;
      expect(transcriptIncludesAgentText(primaryTab.transcript, initialReply)).toBe(true);

      // Fan out extra sessions on the same task and round-trip one message each.
      for (let sessionIndex = 0; sessionIndex < extraSessionCount; sessionIndex += 1) {
        const expectedReply = `LOAD_REPLY_${runId}_${sessionIndex}`;
        const createSessionStartedAt = performance.now();
        const createdSession = await client.createWorkbenchSession(workspaceId, {
          taskId: created.taskId,
          model,
        });
        createSessionLatencies.push(performance.now() - createSessionStartedAt);

        await client.sendWorkbenchMessage(workspaceId, {
          taskId: created.taskId,
          tabId: createdSession.tabId,
          text: `Run pwd in the repo, then reply with exactly: ${expectedReply}`,
          attachments: [],
        });

        // Round-trip: wait (up to 10 min) for the session tab to go idle
        // with the expected sentinel reply in its transcript.
        const messageStartedAt = performance.now();
        const withReply = await poll(
          `task ${runId} session ${sessionIndex} reply`,
          10 * 60_000,
          pollIntervalMs,
          async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
          (task) => {
            const tab = findTab(task, createdSession.tabId);
            return tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, expectedReply);
          },
        );
        messageRoundTripLatencies.push(performance.now() - messageStartedAt);

        expect(transcriptIncludesAgentText(findTab(withReply, createdSession.tabId).transcript, expectedReply)).toBe(true);
      }

      // Re-measure snapshot cost after each task to track growth under load.
      const snapshotMetrics = await measureWorkbenchSnapshot(client, workspaceId, 3);
      snapshotSeries.push(snapshotMetrics);
      logger.info(
        {
          taskIndex: taskIndex + 1,
          ...snapshotMetrics,
        },
        "workbench_load_snapshot",
      );
    }

    // Summarize baseline-vs-final snapshot cost and per-operation averages.
    const firstSnapshot = snapshotSeries[0]!;
    const lastSnapshot = snapshotSeries[snapshotSeries.length - 1]!;
    const summary = {
      taskCount,
      extraSessionCount,
      createTaskAvgMs: Math.round(average(createTaskLatencies)),
      provisionAvgMs: Math.round(average(provisionLatencies)),
      createSessionAvgMs: Math.round(average(createSessionLatencies)),
      messageRoundTripAvgMs: Math.round(average(messageRoundTripLatencies)),
      snapshotReadBaselineAvgMs: firstSnapshot.avgMs,
      snapshotReadFinalAvgMs: lastSnapshot.avgMs,
      snapshotReadFinalMaxMs: lastSnapshot.maxMs,
      snapshotPayloadBaselineBytes: firstSnapshot.payloadBytes,
      snapshotPayloadFinalBytes: lastSnapshot.payloadBytes,
      snapshotTabFinalCount: lastSnapshot.tabCount,
      snapshotTranscriptFinalCount: lastSnapshot.transcriptEventCount,
    };

    logger.info(summary, "workbench_load_summary");

    // Sanity: every planned operation produced exactly one latency sample.
    expect(createTaskLatencies.length).toBe(taskCount);
    expect(provisionLatencies.length).toBe(taskCount);
    expect(createSessionLatencies.length).toBe(taskCount * extraSessionCount);
    expect(messageRoundTripLatencies.length).toBe(taskCount * extraSessionCount);
  });
});
|