feat: desktop computer-use APIs with neko-based streaming

Add desktop runtime management (Xvfb, openbox, dbus), screen capture,
mouse/keyboard input, and video streaming via neko binary extracted
from the m1k1o/neko container. Includes Docker test rig, TypeScript SDK
desktop support, and inspector Desktop tab.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nathan Flurry 2026-03-16 17:56:39 -07:00
parent 3895e34bdb
commit 33821d8660
66 changed files with 13190 additions and 1135 deletions

View file

@ -0,0 +1,244 @@
import { execFileSync } from "node:child_process";
import { mkdtempSync, mkdirSync, rmSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
import { fileURLToPath } from "node:url";
// Directory containing this module, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Repository root: four directory levels above this module's directory.
const REPO_ROOT = resolve(__dirname, "../../../..");
/** Port the sandbox-agent server is told to listen on inside the container. */
const CONTAINER_PORT = 3000;

/** PATH handed to the container when no custom entries are merged in. */
const DEFAULT_PATH = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";

/** Image tag used when SANDBOX_AGENT_TEST_IMAGE is not set. */
const DEFAULT_IMAGE_TAG = "sandbox-agent-test:dev";

/** The individual DEFAULT_PATH directories; these are never bind-mounted. */
const STANDARD_PATHS = new Set(DEFAULT_PATH.split(":"));

/** Image tag remembered across calls so the image is only built once. */
let cachedImage: string | undefined;

/** Per-process sequence number mixed into generated container names. */
let containerCounter = 0;
/**
 * Handle returned by startDockerSandboxAgent for one running sandbox-agent container.
 */
export type DockerSandboxAgentHandle = {
// Host-side base URL of the server: http://127.0.0.1:<host port mapped to the container port>.
baseUrl: string;
// Always the empty string; the server is launched with --no-token.
token: string;
// Force-removes the container (docker rm -f); a failure of that command is ignored.
dispose: () => Promise<void>;
};
/**
 * Optional settings accepted by startDockerSandboxAgent.
 */
export type DockerSandboxAgentOptions = {
// Extra environment variables for the container. PATH is handled according to pathMode;
// for keys ending in _URL or _URI, http://127.0.0.1 and http://localhost are rewritten
// to http://host.docker.internal.
env?: Record<string, string>;
// "merge" (the default) prepends the env.PATH entries located under the layout root to the
// default PATH; "replace" uses env.PATH verbatim (empty string when env.PATH is absent).
pathMode?: "merge" | "replace";
// Maximum time in milliseconds to wait for the health endpoint; defaults to 30_000.
timeoutMs?: number;
};
/**
 * Host directories that make up one test sandbox, as created by createDockerTestLayout.
 */
type TestLayout = {
// Unique temporary directory that contains every other directory of the layout.
rootDir: string;
// <rootDir>/home; passed to the container as HOME and USERPROFILE.
homeDir: string;
// <rootDir>/xdg-data; passed to the container as XDG_DATA_HOME.
xdgDataHome: string;
// <rootDir>/xdg-state; passed to the container as XDG_STATE_HOME.
xdgStateHome: string;
// <rootDir>/appdata/Roaming; passed to the container as APPDATA.
appDataDir: string;
// <rootDir>/appdata/Local; passed to the container as LOCALAPPDATA.
localAppDataDir: string;
// <xdgDataHome>/sandbox-agent/bin.
installDir: string;
};
/**
 * Creates a fresh, uniquely named scratch tree under `<repo>/.context` and
 * returns the paths of all directories in it.
 *
 * Every directory named in the returned layout exists when this function
 * returns.
 *
 * @returns The paths of the newly created layout.
 */
export function createDockerTestLayout(): TestLayout {
  // The parent of the temp directory must exist before mkdtempSync is called.
  mkdirSync(resolve(REPO_ROOT, ".context"), { recursive: true });

  const rootDir = mkdtempSync(join(REPO_ROOT, ".context", "docker-test-"));
  const xdgDataHome = join(rootDir, "xdg-data");
  const layout: TestLayout = {
    rootDir,
    homeDir: join(rootDir, "home"),
    xdgDataHome,
    xdgStateHome: join(rootDir, "xdg-state"),
    appDataDir: join(rootDir, "appdata", "Roaming"),
    localAppDataDir: join(rootDir, "appdata", "Local"),
    installDir: join(xdgDataHome, "sandbox-agent", "bin"),
  };

  const toCreate = [layout.homeDir, layout.xdgDataHome, layout.xdgStateHome, layout.appDataDir, layout.localAppDataDir, layout.installDir];
  toCreate.forEach((dir) => {
    mkdirSync(dir, { recursive: true });
  });

  return layout;
}
/**
 * Deletes the directory tree of a test layout.
 *
 * A plain recursive delete is tried first. If that fails and the platform
 * exposes numeric user/group ids, a throw-away root container chowns the tree
 * back to the current user and the delete is retried.
 *
 * @param layout Layout previously returned by createDockerTestLayout.
 * @throws The error from the first delete attempt when the tree could not be
 *   removed (the error from the container fallback is intentionally dropped).
 */
export function disposeDockerTestLayout(layout: TestLayout): void {
  try {
    rmSync(layout.rootDir, { recursive: true, force: true });
  } catch (error) {
    if (typeof process.getuid === "function" && typeof process.getgid === "function") {
      try {
        execFileSync(
          "docker",
          [
            "run",
            "--rm",
            "--user",
            "0:0",
            "--entrypoint",
            "sh",
            "-v",
            `${layout.rootDir}:${layout.rootDir}`,
            ensureImage(),
            "-c",
            // The owner and the path are passed as positional parameters ($1, $2)
            // rather than spliced into the script, so a path containing quotes or
            // other shell metacharacters cannot break or alter the command.
            'chown -R "$1" "$2"',
            "sh",
            `${process.getuid()}:${process.getgid()}`,
            layout.rootDir,
          ],
          { stdio: "pipe" },
        );
        rmSync(layout.rootDir, { recursive: true, force: true });
        return;
      } catch {
        // Best effort only: fall through and report the original failure.
      }
    }
    throw error;
  }
}
/**
 * Starts a detached sandbox-agent container for the given layout and waits
 * until its health endpoint responds successfully.
 *
 * The container port is published on a host port chosen by Docker and bound to
 * 127.0.0.1. If anything fails after the container was started, the container
 * is force-removed before the error is rethrown.
 *
 * @param layout Host directories to expose to the container.
 * @param options Extra environment, PATH handling and health timeout.
 * @returns The host URL of the server, an empty token, and a dispose callback.
 * @throws When a docker command fails, no host port can be parsed from
 *   `docker port`, or the health check times out.
 */
export async function startDockerSandboxAgent(layout: TestLayout, options: DockerSandboxAgentOptions = {}): Promise<DockerSandboxAgentHandle> {
  const image = ensureImage();
  const containerId = uniqueContainerId();
  const env = buildEnv(layout, options.env ?? {}, options.pathMode ?? "merge");
  const mounts = buildMounts(layout.rootDir, env);

  const dockerArgs: string[] = ["run", "-d", "--rm", "--name", containerId, "-p", `127.0.0.1::${CONTAINER_PORT}`];
  if (typeof process.getuid === "function" && typeof process.getgid === "function") {
    // Run as the invoking user where numeric ids are available.
    dockerArgs.push("--user", `${process.getuid()}:${process.getgid()}`);
  }
  if (process.platform === "linux") {
    dockerArgs.push("--add-host", "host.docker.internal:host-gateway");
  }
  // Each directory is mounted at the same path inside the container.
  dockerArgs.push(...mounts.flatMap((dir) => ["-v", `${dir}:${dir}`]));
  dockerArgs.push(...Object.entries(env).flatMap(([name, value]) => ["-e", `${name}=${value}`]));
  dockerArgs.push(image, "server", "--host", "0.0.0.0", "--port", String(CONTAINER_PORT), "--no-token");
  execFileSync("docker", dockerArgs, { stdio: "pipe" });

  // Force-removes the container; a failure of the docker command is ignored.
  const removeContainer = (): void => {
    try {
      execFileSync("docker", ["rm", "-f", containerId], { stdio: "pipe" });
    } catch {}
  };

  try {
    const mapping = execFileSync("docker", ["port", containerId, `${CONTAINER_PORT}/tcp`], {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
    }).trim();
    // The host port is whatever follows the last ":" of the reported mapping.
    const hostPort = mapping.split(":").pop()?.trim();
    if (!hostPort) {
      throw new Error(`missing mapped host port in ${mapping}`);
    }

    const baseUrl = `http://127.0.0.1:${hostPort}`;
    await waitForHealth(baseUrl, options.timeoutMs ?? 30_000);

    return {
      baseUrl,
      token: "",
      dispose: async () => {
        removeContainer();
      },
    };
  } catch (error) {
    removeContainer();
    throw error;
  }
}
/**
 * Builds the test image once per process and returns its tag.
 *
 * The tag comes from SANDBOX_AGENT_TEST_IMAGE when set, otherwise from
 * DEFAULT_IMAGE_TAG. The tag is cached only after `docker build` succeeded, so
 * a failed build is retried on the next call instead of handing out the tag of
 * an image that was never built.
 *
 * @returns The tag of the built image.
 * @throws When `docker build` fails.
 */
function ensureImage(): string {
  if (cachedImage) {
    return cachedImage;
  }

  const tag = process.env.SANDBOX_AGENT_TEST_IMAGE ?? DEFAULT_IMAGE_TAG;
  execFileSync("docker", ["build", "--tag", tag, "--file", resolve(REPO_ROOT, "docker/test-agent/Dockerfile"), REPO_ROOT], {
    cwd: REPO_ROOT,
    stdio: ["ignore", "ignore", "pipe"],
  });

  // Cache only after a successful build.
  cachedImage = tag;
  return tag;
}
/**
 * Computes the environment passed to the container.
 *
 * Starts from the layout-derived defaults, resolves PATH according to
 * `pathMode`, then copies every other variable from `extraEnv` after passing
 * it through rewriteLocalhostUrl.
 *
 * @param layout Host directories backing HOME, XDG_* and APPDATA variables.
 * @param extraEnv Caller-supplied variables; these override the defaults.
 * @param pathMode "replace" uses `extraEnv.PATH` verbatim (or "" when absent);
 *   "merge" prepends the `extraEnv.PATH` entries located under the layout root
 *   to DEFAULT_PATH.
 * @returns The environment for the container.
 */
function buildEnv(layout: TestLayout, extraEnv: Record<string, string>, pathMode: "merge" | "replace"): Record<string, string> {
  const env: Record<string, string> = {
    HOME: layout.homeDir,
    USERPROFILE: layout.homeDir,
    XDG_DATA_HOME: layout.xdgDataHome,
    XDG_STATE_HOME: layout.xdgStateHome,
    APPDATA: layout.appDataDir,
    LOCALAPPDATA: layout.localAppDataDir,
    PATH: DEFAULT_PATH,
  };

  const requestedPath = extraEnv.PATH ?? "";
  // Keep only absolute entries that live under the layout root; the Set drops duplicates.
  const customPathEntries = new Set(
    requestedPath
      .split(":")
      .filter((entry) => entry !== "" && entry !== DEFAULT_PATH && entry.startsWith("/"))
      .filter((entry) => entry.startsWith(layout.rootDir)),
  );

  if (pathMode === "replace") {
    env.PATH = requestedPath;
  } else if (customPathEntries.size > 0) {
    env.PATH = `${Array.from(customPathEntries).join(":")}:${DEFAULT_PATH}`;
  }

  // PATH was handled above; every other variable is copied over.
  Object.entries(extraEnv)
    .filter(([name]) => name !== "PATH")
    .forEach(([name, value]) => {
      env[name] = rewriteLocalhostUrl(name, value);
    });

  return env;
}
/**
 * Lists the host directories to bind-mount into the container.
 *
 * The result always starts with `rootDir`, followed by the absolute values of
 * the known directory variables and by every absolute PATH entry that is not
 * one of STANDARD_PATHS. Duplicates are dropped; first-seen order is kept.
 *
 * @param rootDir Root of the test layout.
 * @param env Environment computed for the container.
 * @returns The unique directories to mount.
 */
function buildMounts(rootDir: string, env: Record<string, string>): string[] {
  const directoryKeys = ["HOME", "USERPROFILE", "XDG_DATA_HOME", "XDG_STATE_HOME", "APPDATA", "LOCALAPPDATA", "SANDBOX_AGENT_DESKTOP_FAKE_STATE_DIR"];
  const fromVariables: Array<string | undefined> = directoryKeys.map((key) => env[key]);
  const fromPath = (env.PATH ?? "").split(":").filter((dir) => !STANDARD_PATHS.has(dir));

  const mounts = new Set<string>([rootDir]);
  for (const candidate of [...fromVariables, ...fromPath]) {
    // Unset variables and relative or empty entries are skipped.
    if (candidate?.startsWith("/")) {
      mounts.add(candidate);
    }
  }
  return Array.from(mounts);
}
/**
 * Polls `${baseUrl}/v1/health` until it answers with a successful status.
 *
 * Each request is aborted once the overall deadline passes, so a connection
 * that is accepted but never answered cannot keep the caller waiting beyond
 * `timeoutMs`. The pause between attempts is at most 200 ms and never extends
 * past the deadline.
 *
 * @param baseUrl Base URL of the server, without a trailing slash.
 * @param timeoutMs Total time budget in milliseconds.
 * @throws Error when no successful response arrived within `timeoutMs`.
 */
async function waitForHealth(baseUrl: string, timeoutMs: number): Promise<void> {
  const pollIntervalMs = 200;
  // Largest delay setTimeout honours; larger values would fire almost immediately.
  const maxTimerMs = 2_147_483_647;
  const deadline = Date.now() + timeoutMs;

  for (let remaining = timeoutMs; remaining > 0; remaining = deadline - Date.now()) {
    const controller = new AbortController();
    const abortTimer = setTimeout(() => controller.abort(), Math.min(remaining, maxTimerMs));
    try {
      const response = await fetch(`${baseUrl}/v1/health`, { signal: controller.signal });
      if (response.ok) {
        return;
      }
    } catch {
      // Server not reachable yet, or this attempt was aborted at the deadline.
    } finally {
      clearTimeout(abortTimer);
    }

    const pause = Math.min(pollIntervalMs, deadline - Date.now());
    if (pause > 0) {
      await new Promise<void>((wake) => setTimeout(wake, pause));
    }
  }

  throw new Error(`timed out waiting for sandbox-agent health at ${baseUrl}`);
}
/**
 * Produces a container name built from the process id, the current time and a
 * per-process sequence number (the latter two in base 36).
 *
 * @returns A name of the form `sandbox-agent-ts-<pid>-<time>-<sequence>`.
 */
function uniqueContainerId(): string {
  const sequence = ++containerCounter;
  const parts = ["sandbox-agent-ts", String(process.pid), Date.now().toString(36), sequence.toString(36)];
  return parts.join("-");
}
/**
 * Redirects loopback HTTP URLs in an environment value to the Docker host.
 *
 * Only variables whose name ends in `_URL` or `_URI` are touched. Every
 * `http://127.0.0.1` or `http://localhost` origin in the value is replaced by
 * `http://host.docker.internal`. A host that merely starts with one of those
 * names (for example `localhost.example.com` or `127.0.0.10`) is left alone.
 *
 * @param key Environment variable name.
 * @param value Environment variable value.
 * @returns The rewritten value, or `value` unchanged when nothing applies.
 */
function rewriteLocalhostUrl(key: string, value: string): string {
  if (!key.endsWith("_URL") && !key.endsWith("_URI")) {
    return value;
  }
  // The negative lookahead requires the host name to end right after the match.
  return value.replace(/http:\/\/(?:127\.0\.0\.1|localhost)(?![A-Za-z0-9.-])/g, "http://host.docker.internal");
}

View file

@ -1,9 +1,6 @@
import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { existsSync } from "node:fs";
import { mkdtempSync, rmSync } from "node:fs";
import { dirname, resolve } from "node:path";
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
import { tmpdir } from "node:os";
import {
InMemorySessionPersistDriver,
@ -14,36 +11,11 @@ import {
type SessionPersistDriver,
type SessionRecord,
} from "../src/index.ts";
import { spawnSandboxAgent, isNodeRuntime, type SandboxAgentSpawnHandle } from "../src/spawn.ts";
import { isNodeRuntime } from "../src/spawn.ts";
import { createDockerTestLayout, disposeDockerTestLayout, startDockerSandboxAgent, type DockerSandboxAgentHandle } from "./helpers/docker.ts";
import { prepareMockAgentDataHome } from "./helpers/mock-agent.ts";
import WebSocket from "ws";
const __dirname = dirname(fileURLToPath(import.meta.url));
/**
 * Locates the sandbox-agent executable.
 *
 * A non-empty SANDBOX_AGENT_BIN wins; otherwise the cargo debug build and then
 * the release build are checked on disk.
 *
 * @returns The path to use, or null when no candidate exists.
 */
function findBinary(): string | null {
  const fromEnv = process.env.SANDBOX_AGENT_BIN;
  if (fromEnv) {
    return fromEnv;
  }

  const candidates = [resolve(__dirname, "../../../target/debug/sandbox-agent"), resolve(__dirname, "../../../target/release/sandbox-agent")];
  return candidates.find((candidate) => existsSync(candidate)) ?? null;
}
// Resolve the binary once at module load; loading this file fails if none is found.
const BINARY_PATH = findBinary();
if (!BINARY_PATH) {
throw new Error("sandbox-agent binary not found. Build it (cargo build -p sandbox-agent) or set SANDBOX_AGENT_BIN.");
}
// Publish the resolved path through SANDBOX_AGENT_BIN when the variable was not already set.
if (!process.env.SANDBOX_AGENT_BIN) {
process.env.SANDBOX_AGENT_BIN = BINARY_PATH;
}
/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @param ms Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@ -110,6 +82,15 @@ async function waitForAsync<T>(fn: () => Promise<T | undefined | null>, timeoutM
throw new Error("timed out waiting for condition");
}
/**
 * Awaits `promise`, failing if it has not settled within `timeoutMs`.
 *
 * The timer is cancelled as soon as the race is decided, so a call that
 * finishes early leaves no pending timer behind.
 *
 * @param promise Operation to wait for.
 * @param label Name of the operation, used in the timeout message.
 * @param timeoutMs Time budget in milliseconds (default 15 000).
 * @returns The value `promise` resolved with.
 * @throws Error `"<label> timed out after <timeoutMs>ms"` on timeout; otherwise
 *   whatever `promise` rejected with.
 */
async function withTimeout<T>(promise: Promise<T>, label: string, timeoutMs = 15_000): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const expiry = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });

  try {
    return await Promise.race([promise, expiry]);
  } finally {
    clearTimeout(timer);
  }
}
function buildTarArchive(entries: Array<{ name: string; content: string }>): Uint8Array {
const blocks: Buffer[] = [];
@ -174,34 +155,77 @@ function decodeProcessLogData(data: string, encoding: string): string {
function nodeCommand(source: string): { command: string; args: string[] } {
return {
command: process.execPath,
command: "node",
args: ["-e", source],
};
}
/**
 * Re-issues `outgoing` against `baseUrl`, keeping its method, headers, abort
 * signal and (for methods other than GET and HEAD) its body stream.
 *
 * @param defaultFetch Fetch implementation that performs the request.
 * @param baseUrl Origin the request is redirected to.
 * @param outgoing Request originally produced by the SDK.
 * @param parsed Parsed URL of `outgoing`; only its path and query are reused.
 * @returns The response produced by `defaultFetch`.
 */
function forwardRequest(defaultFetch: typeof fetch, baseUrl: string, outgoing: Request, parsed: URL): Promise<Response> {
  const { method, signal } = outgoing;
  const carriesBody = !(method === "GET" || method === "HEAD");

  const init: RequestInit & { duplex?: "half" } = {
    method,
    headers: new Headers(outgoing.headers),
    signal,
    // The body is forwarded as a stream, hence the explicit duplex mode.
    ...(carriesBody ? { body: outgoing.body, duplex: "half" as const } : {}),
  };

  const target = new URL(`${parsed.pathname}${parsed.search}`, baseUrl);
  return defaultFetch(target, init);
}
/**
 * Starts an xterm window on `display` and waits until xdotool can find and
 * activate it, so that subsequent keyboard input has a focused target.
 *
 * If the window cannot be activated in time, the xterm process is killed and
 * deleted (best effort) before the error is rethrown; otherwise it would be
 * orphaned, because its id never reaches the caller.
 *
 * @param sdk Connected SDK client used to spawn and run processes.
 * @param display X display to use, passed to the processes as DISPLAY.
 * @returns The id of the xterm process; the caller is responsible for stopping it.
 * @throws When the process cannot be created or activation does not succeed
 *   within the 10 000 ms passed to waitForAsync.
 */
async function launchDesktopFocusWindow(sdk: SandboxAgent, display: string): Promise<string> {
  const windowProcess = await sdk.createProcess({
    command: "xterm",
    args: ["-geometry", "80x24+40+40", "-title", "Sandbox Desktop Test", "-e", "sh", "-lc", "sleep 60"],
    env: { DISPLAY: display },
  });

  try {
    await waitForAsync(
      async () => {
        // The script exits with 3 while the window is not visible yet.
        const result = await sdk.runProcess({
          command: "sh",
          args: [
            "-lc",
            'wid="$(xdotool search --onlyvisible --name \'Sandbox Desktop Test\' 2>/dev/null | head -n 1 || true)"; if [ -z "$wid" ]; then exit 3; fi; xdotool windowactivate "$wid"',
          ],
          env: { DISPLAY: display },
          timeoutMs: 5_000,
        });
        return result.exitCode === 0 ? true : undefined;
      },
      10_000,
      200,
    );
  } catch (error) {
    // Do not leave the xterm behind when activation failed.
    await sdk.killProcess(windowProcess.id, { waitMs: 5_000 }).catch(() => {});
    await sdk.deleteProcess(windowProcess.id).catch(() => {});
    throw error;
  }

  return windowProcess.id;
}
describe("Integration: TypeScript SDK flat session API", () => {
let handle: SandboxAgentSpawnHandle;
let handle: DockerSandboxAgentHandle;
let baseUrl: string;
let token: string;
let dataHome: string;
let layout: ReturnType<typeof createDockerTestLayout>;
beforeAll(async () => {
dataHome = mkdtempSync(join(tmpdir(), "sdk-integration-"));
const agentEnv = prepareMockAgentDataHome(dataHome);
beforeEach(async () => {
layout = createDockerTestLayout();
prepareMockAgentDataHome(layout.xdgDataHome);
handle = await spawnSandboxAgent({
enabled: true,
log: "silent",
handle = await startDockerSandboxAgent(layout, {
timeoutMs: 30000,
env: agentEnv,
});
baseUrl = handle.baseUrl;
token = handle.token;
});
afterAll(async () => {
await handle.dispose();
rmSync(dataHome, { recursive: true, force: true });
afterEach(async () => {
await handle?.dispose?.();
if (layout) {
disposeDockerTestLayout(layout);
}
});
it("detects Node.js runtime", () => {
@ -280,11 +304,12 @@ describe("Integration: TypeScript SDK flat session API", () => {
token,
});
const directory = mkdtempSync(join(tmpdir(), "sdk-fs-"));
const directory = join(layout.rootDir, "fs-test");
const nestedDir = join(directory, "nested");
const filePath = join(directory, "notes.txt");
const movedPath = join(directory, "notes-moved.txt");
const uploadDir = join(directory, "uploaded");
mkdirSync(directory, { recursive: true });
try {
const listedAgents = await sdk.listAgents({ config: true, noCache: true });
@ -341,25 +366,30 @@ describe("Integration: TypeScript SDK flat session API", () => {
const parsed = new URL(outgoing.url);
seenPaths.push(parsed.pathname);
const forwardedUrl = new URL(`${parsed.pathname}${parsed.search}`, baseUrl);
const forwarded = new Request(forwardedUrl.toString(), outgoing);
return defaultFetch(forwarded);
return forwardRequest(defaultFetch, baseUrl, outgoing, parsed);
};
const sdk = await SandboxAgent.connect({
token,
fetch: customFetch,
});
let sessionId: string | undefined;
await sdk.getHealth();
const session = await sdk.createSession({ agent: "mock" });
const prompt = await session.prompt([{ type: "text", text: "custom fetch integration test" }]);
expect(prompt.stopReason).toBe("end_turn");
try {
await withTimeout(sdk.getHealth(), "custom fetch getHealth");
const session = await withTimeout(sdk.createSession({ agent: "mock" }), "custom fetch createSession");
sessionId = session.id;
expect(session.agent).toBe("mock");
await withTimeout(sdk.destroySession(session.id), "custom fetch destroySession");
expect(seenPaths).toContain("/v1/health");
expect(seenPaths.some((path) => path.startsWith("/v1/acp/"))).toBe(true);
await sdk.dispose();
expect(seenPaths).toContain("/v1/health");
expect(seenPaths.some((path) => path.startsWith("/v1/acp/"))).toBe(true);
} finally {
if (sessionId) {
await sdk.destroySession(sessionId).catch(() => {});
}
await withTimeout(sdk.dispose(), "custom fetch dispose");
}
}, 60_000);
it("requires baseUrl when fetch is not provided", async () => {
@ -386,9 +416,7 @@ describe("Integration: TypeScript SDK flat session API", () => {
}
}
const forwardedUrl = new URL(`${parsed.pathname}${parsed.search}`, baseUrl);
const forwarded = new Request(forwardedUrl.toString(), outgoing);
return defaultFetch(forwarded);
return forwardRequest(defaultFetch, baseUrl, outgoing, parsed);
};
const sdk = await SandboxAgent.connect({
@ -710,7 +738,9 @@ describe("Integration: TypeScript SDK flat session API", () => {
token,
});
const directory = mkdtempSync(join(tmpdir(), "sdk-config-"));
const directory = join(layout.rootDir, "config-test");
mkdirSync(directory, { recursive: true });
const mcpConfig = {
type: "local" as const,
@ -957,4 +987,98 @@ describe("Integration: TypeScript SDK flat session API", () => {
await sdk.dispose();
}
});
// End-to-end walk through the desktop helpers of the SDK: start the desktop, capture
// screenshots, drive the mouse, send keyboard input to a focused window, then stop it.
// The steps are order-dependent: each mouse assertion relies on the previous pointer position.
it("covers desktop status, screenshot, display, mouse, and keyboard helpers", async () => {
const sdk = await SandboxAgent.connect({
baseUrl,
token,
});
// Set once the helper window exists so the finally block knows what to clean up.
let focusWindowProcessId: string | undefined;
try {
// The desktop is expected to be inactive before it is started.
const initialStatus = await sdk.getDesktopStatus();
expect(initialStatus.state).toBe("inactive");
const started = await sdk.startDesktop({
width: 1440,
height: 900,
dpi: 96,
});
expect(started.state).toBe("active");
expect(started.display?.startsWith(":")).toBe(true);
expect(started.missingDependencies).toEqual([]);
// The reported display and resolution must match what was requested at start.
const displayInfo = await sdk.getDesktopDisplayInfo();
expect(displayInfo.display).toBe(started.display);
expect(displayInfo.resolution.width).toBe(1440);
expect(displayInfo.resolution.height).toBe(900);
// Both screenshot variants are checked only for the 8-byte PNG signature.
const screenshot = await sdk.takeDesktopScreenshot();
expect(Buffer.from(screenshot.subarray(0, 8)).equals(Buffer.from("\x89PNG\r\n\x1a\n", "binary"))).toBe(true);
const region = await sdk.takeDesktopRegionScreenshot({
x: 10,
y: 20,
width: 40,
height: 50,
});
expect(Buffer.from(region.subarray(0, 8)).equals(Buffer.from("\x89PNG\r\n\x1a\n", "binary"))).toBe(true);
// Mouse helpers: each call is expected to report the resulting pointer position.
const moved = await sdk.moveDesktopMouse({ x: 40, y: 50 });
expect(moved.x).toBe(40);
expect(moved.y).toBe(50);
const dragged = await sdk.dragDesktopMouse({
startX: 40,
startY: 50,
endX: 80,
endY: 90,
button: "left",
});
expect(dragged.x).toBe(80);
expect(dragged.y).toBe(90);
const clicked = await sdk.clickDesktop({
x: 80,
y: 90,
button: "left",
clickCount: 1,
});
expect(clicked.x).toBe(80);
expect(clicked.y).toBe(90);
const scrolled = await sdk.scrollDesktop({
x: 80,
y: 90,
deltaY: -2,
});
expect(scrolled.x).toBe(80);
expect(scrolled.y).toBe(90);
// The pointer should still be where the last action left it.
const position = await sdk.getDesktopMousePosition();
expect(position.x).toBe(80);
expect(position.y).toBe(90);
// Open and focus a window before sending keyboard input.
focusWindowProcessId = await launchDesktopFocusWindow(sdk, started.display!);
const typed = await sdk.typeDesktopText({
text: "hello desktop",
delayMs: 5,
});
expect(typed.ok).toBe(true);
const pressed = await sdk.pressDesktopKey({ key: "ctrl+l" });
expect(pressed.ok).toBe(true);
const stopped = await sdk.stopDesktop();
expect(stopped.state).toBe("inactive");
} finally {
// Best-effort cleanup; errors are ignored so they do not mask a test failure.
if (focusWindowProcessId) {
await sdk.killProcess(focusWindowProcessId, { waitMs: 5_000 }).catch(() => {});
await sdk.deleteProcess(focusWindowProcessId).catch(() => {});
}
await sdk.stopDesktop().catch(() => {});
await sdk.dispose();
}
});
});