// sandbox-agent/foundry/packages/client/test/e2e/workbench-load-e2e.test.ts

import { describe, expect, it } from "vitest";
import {
  createFoundryLogger,
  type TaskWorkbenchSnapshot,
  type WorkbenchAgentTab,
  type WorkbenchTask,
  type WorkbenchModelId,
  type WorkbenchTranscriptEvent,
} from "@sandbox-agent/foundry-shared";
import { createBackendClient } from "../../src/backend-client.js";

// Logger for this suite, created with the "foundry-client-e2e" service name
// and a `suite` binding of "workbench-load".
const logger = createFoundryLogger({
  service: "foundry-client-e2e",
  bindings: { suite: "workbench-load" },
});

// Opt-in switch: true only when HF_ENABLE_DAEMON_WORKBENCH_LOAD_E2E is exactly "1".
const RUN_WORKBENCH_LOAD_E2E = process.env["HF_ENABLE_DAEMON_WORKBENCH_LOAD_E2E"] === "1";

/**
 * Reads an environment variable that must be present.
 *
 * @param name - Name of the environment variable to look up.
 * @returns The variable's value with surrounding whitespace removed.
 * @throws Error when the variable is unset, empty, or whitespace-only.
 */
function requiredEnv(name: string): string {
  const trimmed = (process.env[name] ?? "").trim();
  if (trimmed.length > 0) {
    return trimmed;
  }
  throw new Error(`Missing required env var: ${name}`);
}

/**
 * Reads a model id from the environment, accepting only the ids listed below.
 *
 * @param name - Name of the environment variable to look up.
 * @param fallback - Model id returned when the variable is unset or not one of
 *   the recognised ids.
 * @returns The trimmed value when it is "claude-sonnet-4", "claude-opus-4",
 *   "gpt-4o" or "o3"; otherwise `fallback`.
 */
function workbenchModelEnv(name: string, fallback: WorkbenchModelId): WorkbenchModelId {
  const requested = process.env[name]?.trim();
  // Literal comparisons (rather than a lookup table) keep the value narrowed
  // to the matching string literal types without a cast.
  if (requested === "claude-sonnet-4" || requested === "claude-opus-4" || requested === "gpt-4o" || requested === "o3") {
    return requested;
  }
  return fallback;
}

/**
 * Reads a positive integer from the environment.
 *
 * The value is parsed with `Number.parseInt(…, 10)`, so a numeric prefix is
 * accepted (e.g. "12px" yields 12).
 *
 * @param name - Name of the environment variable to look up.
 * @param fallback - Value returned when the variable is unset, blank, not
 *   parseable, or not strictly greater than zero.
 * @returns The parsed integer, or `fallback`.
 */
function intEnv(name: string, fallback: number): number {
  const text = process.env[name]?.trim();
  if (text === undefined || text === "") {
    return fallback;
  }

  const parsed = Number.parseInt(text, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback;
  }
  return parsed;
}

/**
 * Resolves after a `setTimeout` of the given delay.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Repeatedly evaluates `fn` until `isDone` accepts its result.
 *
 * `fn` is always called at least once. The timeout is only checked after an
 * unsuccessful attempt, so an attempt that is already in flight is never cut
 * short. Errors thrown by `fn` or `isDone` propagate to the caller.
 *
 * @param label - Description used in the timeout error message.
 * @param timeoutMs - Elapsed time after which an unsuccessful attempt aborts.
 * @param intervalMs - Pause between attempts.
 * @param fn - Produces the value to inspect.
 * @param isDone - Returns true once the value is acceptable.
 * @returns The first value accepted by `isDone`.
 * @throws Error `timed out waiting for <label>` once the timeout has elapsed.
 */
async function poll<T>(label: string, timeoutMs: number, intervalMs: number, fn: () => Promise<T>, isDone: (value: T) => boolean): Promise<T> {
  const began = Date.now();

  while (true) {
    const current = await fn();
    if (isDone(current)) {
      return current;
    }

    const elapsedMs = Date.now() - began;
    if (elapsedMs > timeoutMs) {
      throw new Error(`timed out waiting for ${label}`);
    }

    await sleep(intervalMs);
  }
}

/**
 * Looks up a task in a workbench snapshot by id.
 *
 * @param snapshot - Snapshot whose `tasks` are searched.
 * @param taskId - Id to match against each task's `id`.
 * @returns The first task with a matching id.
 * @throws Error when no task in the snapshot has that id.
 */
function findTask(snapshot: TaskWorkbenchSnapshot, taskId: string): WorkbenchTask {
  for (const candidate of snapshot.tasks) {
    if (candidate.id === taskId) {
      return candidate;
    }
  }
  throw new Error(`task ${taskId} missing from snapshot`);
}

/**
 * Looks up a tab on a task by id.
 *
 * @param task - Task whose `tabs` are searched.
 * @param tabId - Id to match against each tab's `id`.
 * @returns The first tab with a matching id.
 * @throws Error when the task has no tab with that id.
 */
function findTab(task: WorkbenchTask, tabId: string): WorkbenchAgentTab {
  for (const candidate of task.tabs) {
    if (candidate.id === tabId) {
      return candidate;
    }
  }
  throw new Error(`tab ${tabId} missing from task ${task.id}`);
}

/**
 * Derives a display string from a transcript event's payload.
 *
 * Sources are tried in this order, and the first that applies wins:
 *  1. A non-object payload is stringified (`null`/`undefined` become "").
 *  2. `params.update.content.text` when it is a string (returned untrimmed,
 *     even when empty).
 *  3. `params.text` when it is a non-blank string (trimmed).
 *  4. `params.prompt` when it is an array: the trimmed, non-empty `text`
 *     entries joined by newlines (possibly the empty string).
 *  5. `result.text` when it is a non-blank string (trimmed).
 *  6. `method` when it is a string, otherwise the payload as JSON.
 *
 * @param event - Transcript event whose `payload` is inspected.
 * @returns The extracted text.
 */
function extractEventText(event: WorkbenchTranscriptEvent): string {
  // Narrow an unknown value to a property bag, or null when it is not an object.
  const asRecord = (value: unknown): Record<string, unknown> | null =>
    value !== null && typeof value === "object" ? (value as Record<string, unknown>) : null;

  // Trimmed string when the value is a string with non-whitespace content.
  const nonBlank = (value: unknown): string | null => {
    if (typeof value !== "string") {
      return null;
    }
    const trimmed = value.trim();
    return trimmed ? trimmed : null;
  };

  const rawPayload = event.payload;
  const envelope = asRecord(rawPayload);
  if (envelope === null) {
    return String(rawPayload ?? "");
  }

  const params = asRecord(envelope.params);
  if (params !== null) {
    // Chunk text is returned verbatim, whitespace included.
    const chunk = asRecord(asRecord(params.update)?.content)?.text;
    if (typeof chunk === "string") {
      return chunk;
    }

    const paramsText = nonBlank(params.text);
    if (paramsText !== null) {
      return paramsText;
    }

    const prompt = params.prompt;
    if (Array.isArray(prompt)) {
      const lines: string[] = [];
      for (const part of prompt) {
        const partText = (part as { text?: unknown } | null | undefined)?.text;
        const line = typeof partText === "string" ? partText.trim() : "";
        if (line) {
          lines.push(line);
        }
      }
      return lines.join("\n");
    }
  }

  const resultText = nonBlank(asRecord(envelope.result)?.text);
  if (resultText !== null) {
    return resultText;
  }

  const method = envelope.method;
  return typeof method === "string" ? method : JSON.stringify(rawPayload);
}

/**
 * Checks whether the agent's side of a transcript contains a given string.
 *
 * The text of every event whose `sender` is "agent" is concatenated with no
 * separator before searching, so a match may span consecutive events.
 *
 * @param transcript - Events to scan, in order.
 * @param expectedText - Substring to look for.
 * @returns True when the concatenated agent text includes `expectedText`.
 */
function transcriptIncludesAgentText(transcript: WorkbenchTranscriptEvent[], expectedText: string): boolean {
  const agentParts: string[] = [];
  transcript.forEach((event) => {
    if (event.sender === "agent") {
      agentParts.push(extractEventText(event));
    }
  });
  return agentParts.join("").includes(expectedText);
}

/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values - Numbers to average.
 * @returns The mean, or 0 for an empty list.
 */
function average(values: number[]): number {
  let total = 0;
  for (const value of values) {
    total += value;
  }
  // An empty list divides by 1 so the result is 0 rather than NaN.
  const divisor = values.length > 0 ? values.length : 1;
  return total / divisor;
}

/**
 * Times repeated `getWorkbench` reads and summarises the last snapshot.
 *
 * The reads are issued one after another. Size and count figures describe the
 * snapshot returned by the final read; when no read happens (`iterations` is
 * not positive) an empty snapshot stands in for it.
 *
 * @param client - Backend client used to fetch the workbench.
 * @param workspaceId - Workspace whose workbench is read.
 * @param iterations - Number of sequential reads to time.
 * @returns Rounded average and maximum read time in milliseconds, the UTF-8
 *   byte length of the final snapshot serialised as JSON, and its task, tab
 *   and transcript-event counts.
 */
async function measureWorkbenchSnapshot(
  client: ReturnType<typeof createBackendClient>,
  workspaceId: string,
  iterations: number,
): Promise<{
  avgMs: number;
  maxMs: number;
  payloadBytes: number;
  taskCount: number;
  tabCount: number;
  transcriptEventCount: number;
}> {
  const readTimesMs: number[] = [];
  let latest: TaskWorkbenchSnapshot | null = null;

  for (let attempt = 0; attempt < iterations; attempt += 1) {
    const readStartedAt = performance.now();
    latest = await client.getWorkbench(workspaceId);
    const readFinishedAt = performance.now();
    readTimesMs.push(readFinishedAt - readStartedAt);
  }

  // Placeholder used only when the loop above never ran.
  const finalSnapshot = latest ?? {
    workspaceId,
    repos: [],
    projects: [],
    tasks: [],
  };

  const serialized = JSON.stringify(finalSnapshot);
  const payloadBytes = Buffer.byteLength(serialized, "utf8");

  let tabCount = 0;
  let transcriptEventCount = 0;
  finalSnapshot.tasks.forEach((task) => {
    tabCount += task.tabs.length;
    task.tabs.forEach((tab) => {
      transcriptEventCount += tab.transcript.length;
    });
  });

  const avgMs = Math.round(average(readTimesMs));
  // The trailing 0 keeps Math.max from returning -Infinity with no samples.
  const maxMs = Math.round(Math.max(...readTimesMs, 0));

  return {
    avgMs,
    maxMs,
    payloadBytes,
    taskCount: finalSnapshot.tasks.length,
    tabCount,
    transcriptEventCount,
  };
}

/**
 * Sequential load profile for the workbench API against a live backend.
 *
 * Skipped unless RUN_WORKBENCH_LOAD_E2E is true. Configuration comes from the
 * environment:
 *  - HF_E2E_BACKEND_ENDPOINT       backend URL (default http://127.0.0.1:7741/v1/rivet)
 *  - HF_E2E_WORKSPACE              workspace id (default "default")
 *  - HF_E2E_GITHUB_REPO            repository remote to add (required)
 *  - HF_E2E_MODEL                  model id (default "gpt-4o")
 *  - HF_LOAD_TASK_COUNT            number of tasks to create (default 3)
 *  - HF_LOAD_EXTRA_SESSION_COUNT   extra sessions per task (default 2)
 *  - HF_LOAD_POLL_INTERVAL_MS      delay between workbench polls (default 2000)
 *
 * For each task the test creates it, waits for the initial agent reply, then
 * opens the extra sessions one at a time, sending a message to each and
 * waiting for the reply. Snapshot read metrics are sampled once before any
 * task is created and again after each task completes. Latency figures are
 * logged; only the number of collected samples is asserted.
 */
describe("e2e(client): workbench load", () => {
  it.skipIf(!RUN_WORKBENCH_LOAD_E2E)("runs a simple sequential load profile against the real backend", { timeout: 30 * 60_000 }, async () => {
    const endpoint = process.env.HF_E2E_BACKEND_ENDPOINT?.trim() || "http://127.0.0.1:7741/v1/rivet";
    const workspaceId = process.env.HF_E2E_WORKSPACE?.trim() || "default";
    const repoRemote = requiredEnv("HF_E2E_GITHUB_REPO");
    const model = workbenchModelEnv("HF_E2E_MODEL", "gpt-4o");
    const taskCount = intEnv("HF_LOAD_TASK_COUNT", 3);
    const extraSessionCount = intEnv("HF_LOAD_EXTRA_SESSION_COUNT", 2);
    const pollIntervalMs = intEnv("HF_LOAD_POLL_INTERVAL_MS", 2_000);

    const client = createBackendClient({
      endpoint,
      defaultWorkspaceId: workspaceId,
    });

    const repo = await client.addRepo(workspaceId, repoRemote);
    const createTaskLatencies: number[] = [];
    const provisionLatencies: number[] = [];
    const createSessionLatencies: number[] = [];
    const messageRoundTripLatencies: number[] = [];
    const snapshotSeries: Array<{
      taskCount: number;
      avgMs: number;
      maxMs: number;
      payloadBytes: number;
      tabCount: number;
      transcriptEventCount: number;
    }> = [];

    // Baseline snapshot cost before this run adds any tasks.
    snapshotSeries.push(await measureWorkbenchSnapshot(client, workspaceId, 2));

    for (let taskIndex = 0; taskIndex < taskCount; taskIndex += 1) {
      // The timestamp suffix keeps titles, branches and reply markers distinct across runs.
      const runId = `load-${taskIndex}-${Date.now().toString(36)}`;
      const initialReply = `LOAD_INIT_${runId}`;

      const createStartedAt = performance.now();
      const created = await client.createWorkbenchTask(workspaceId, {
        repoId: repo.repoId,
        title: `Workbench Load ${runId}`,
        branch: `load/${runId}`,
        model,
        task: `Reply with exactly: ${initialReply}`,
      });
      createTaskLatencies.push(performance.now() - createStartedAt);

      // Provisioning is complete once the task and its first tab are idle and
      // the agent transcript contains the requested marker.
      const provisionStartedAt = performance.now();
      const provisioned = await poll(
        `task ${runId} provisioning`,
        12 * 60_000,
        pollIntervalMs,
        async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
        (task) => {
          const tab = task.tabs[0];
          return Boolean(tab && task.status === "idle" && tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, initialReply));
        },
      );
      provisionLatencies.push(performance.now() - provisionStartedAt);

      expect(provisioned.tabs.length).toBeGreaterThan(0);
      const primaryTab = provisioned.tabs[0]!;
      expect(transcriptIncludesAgentText(primaryTab.transcript, initialReply)).toBe(true);

      for (let sessionIndex = 0; sessionIndex < extraSessionCount; sessionIndex += 1) {
        const expectedReply = `LOAD_REPLY_${runId}_${sessionIndex}`;
        const createSessionStartedAt = performance.now();
        const createdSession = await client.createWorkbenchSession(workspaceId, {
          taskId: created.taskId,
          model,
        });
        createSessionLatencies.push(performance.now() - createSessionStartedAt);

        // Start the clock before sending so the round trip covers the send
        // call itself as well as the wait for the agent's reply.
        const messageStartedAt = performance.now();
        await client.sendWorkbenchMessage(workspaceId, {
          taskId: created.taskId,
          tabId: createdSession.tabId,
          text: `Run pwd in the repo, then reply with exactly: ${expectedReply}`,
          attachments: [],
        });

        const withReply = await poll(
          `task ${runId} session ${sessionIndex} reply`,
          10 * 60_000,
          pollIntervalMs,
          async () => findTask(await client.getWorkbench(workspaceId), created.taskId),
          (task) => {
            const tab = findTab(task, createdSession.tabId);
            return tab.status === "idle" && transcriptIncludesAgentText(tab.transcript, expectedReply);
          },
        );
        messageRoundTripLatencies.push(performance.now() - messageStartedAt);

        expect(transcriptIncludesAgentText(findTab(withReply, createdSession.tabId).transcript, expectedReply)).toBe(true);
      }

      // Re-sample snapshot cost now that this task and its sessions exist.
      const snapshotMetrics = await measureWorkbenchSnapshot(client, workspaceId, 3);
      snapshotSeries.push(snapshotMetrics);
      logger.info(
        {
          taskIndex: taskIndex + 1,
          ...snapshotMetrics,
        },
        "workbench_load_snapshot",
      );
    }

    // snapshotSeries always holds at least the baseline sample pushed above.
    const firstSnapshot = snapshotSeries[0]!;
    const lastSnapshot = snapshotSeries[snapshotSeries.length - 1]!;
    const summary = {
      taskCount,
      extraSessionCount,
      createTaskAvgMs: Math.round(average(createTaskLatencies)),
      provisionAvgMs: Math.round(average(provisionLatencies)),
      createSessionAvgMs: Math.round(average(createSessionLatencies)),
      messageRoundTripAvgMs: Math.round(average(messageRoundTripLatencies)),
      snapshotReadBaselineAvgMs: firstSnapshot.avgMs,
      snapshotReadFinalAvgMs: lastSnapshot.avgMs,
      snapshotReadFinalMaxMs: lastSnapshot.maxMs,
      snapshotPayloadBaselineBytes: firstSnapshot.payloadBytes,
      snapshotPayloadFinalBytes: lastSnapshot.payloadBytes,
      snapshotTabFinalCount: lastSnapshot.tabCount,
      snapshotTranscriptFinalCount: lastSnapshot.transcriptEventCount,
    };

    logger.info(summary, "workbench_load_summary");

    // Every step must have produced exactly one latency sample.
    expect(createTaskLatencies.length).toBe(taskCount);
    expect(provisionLatencies.length).toBe(taskCount);
    expect(createSessionLatencies.length).toBe(taskCount * extraSessionCount);
    expect(messageRoundTripLatencies.length).toBe(taskCount * extraSessionCount);
  });
});