feat: desktop computer-use APIs with neko-based streaming

Add desktop runtime management (Xvfb, openbox, dbus), screen capture,
mouse/keyboard input, and video streaming via the neko binary extracted
from the m1k1o/neko container. Includes a Docker test rig, TypeScript SDK
desktop support, and an inspector Desktop tab.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nathan Flurry 2026-03-16 17:56:39 -07:00
parent 3895e34bdb
commit 33821d8660
66 changed files with 13190 additions and 1135 deletions

View file

@ -2889,6 +2889,94 @@
gap: 20px;
}
.desktop-panel {
display: flex;
flex-direction: column;
gap: 16px;
}
.desktop-state-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 12px;
margin-bottom: 12px;
}
.desktop-start-controls {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 10px;
}
.desktop-input-group {
display: flex;
flex-direction: column;
gap: 4px;
}
.desktop-chip-list {
display: flex;
flex-wrap: wrap;
gap: 8px;
}
.desktop-command {
margin-top: 6px;
padding: 8px 10px;
border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--surface);
overflow-x: auto;
}
.desktop-diagnostic-block + .desktop-diagnostic-block {
margin-top: 14px;
}
.desktop-process-list {
display: flex;
flex-direction: column;
gap: 10px;
margin-top: 8px;
}
.desktop-process-item {
padding: 10px;
border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--surface);
display: flex;
flex-direction: column;
gap: 4px;
}
.desktop-screenshot-empty {
padding: 18px;
border: 1px dashed var(--border);
border-radius: var(--radius);
color: var(--muted);
background: var(--surface);
text-align: center;
}
/* Dark framed backdrop behind the desktop screenshot.
   Background layers are painted front-to-back in the order listed, so the
   faint radial highlight has to come first; listed second it would sit
   underneath the almost opaque base gradient and be effectively invisible. */
.desktop-screenshot-frame {
  border-radius: calc(var(--radius) + 2px);
  overflow: hidden;
  border: 1px solid var(--border);
  background:
    radial-gradient(circle at top right, rgba(56, 189, 248, 0.12), transparent 40%),
    linear-gradient(135deg, rgba(15, 23, 42, 0.9), rgba(30, 41, 59, 0.92));
  padding: 10px;
}
.desktop-screenshot-image {
display: block;
width: 100%;
height: auto;
border-radius: var(--radius);
background: rgba(0, 0, 0, 0.24);
}
.processes-section {
display: flex;
flex-direction: column;
@ -3551,6 +3639,11 @@
grid-template-columns: 1fr;
}
.desktop-state-grid,
.desktop-start-controls {
grid-template-columns: 1fr;
}
.session-sidebar {
display: none;
}

View file

@ -18,6 +18,7 @@
"@types/react-dom": "^19.1.6",
"@vitejs/plugin-react": "^4.3.1",
"fake-indexeddb": "^6.2.4",
"jsdom": "^26.1.0",
"typescript": "^5.7.3",
"vite": "^5.4.7",
"vitest": "^3.0.0"

View file

@ -1,4 +1,4 @@
import { ChevronLeft, ChevronRight, Cloud, Play, PlayCircle, Server, Terminal, Wrench } from "lucide-react";
import { ChevronLeft, ChevronRight, Cloud, Monitor, Play, PlayCircle, Server, Terminal, Wrench } from "lucide-react";
import type { AgentInfo, SandboxAgent, SessionEvent } from "sandbox-agent";
type AgentModeInfo = { id: string; name: string; description: string };
@ -9,9 +9,10 @@ import ProcessesTab from "./ProcessesTab";
import ProcessRunTab from "./ProcessRunTab";
import SkillsTab from "./SkillsTab";
import RequestLogTab from "./RequestLogTab";
import DesktopTab from "./DesktopTab";
import type { RequestLog } from "../../types/requestLog";
export type DebugTab = "log" | "events" | "agents" | "mcp" | "skills" | "processes" | "run-process";
export type DebugTab = "log" | "events" | "agents" | "desktop" | "mcp" | "skills" | "processes" | "run-process";
const DebugPanel = ({
debugTab,
@ -75,6 +76,10 @@ const DebugPanel = ({
<Cloud className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Agents
</button>
<button className={`debug-tab ${debugTab === "desktop" ? "active" : ""}`} onClick={() => onDebugTabChange("desktop")}>
<Monitor className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Desktop
</button>
<button className={`debug-tab ${debugTab === "mcp" ? "active" : ""}`} onClick={() => onDebugTabChange("mcp")}>
<Server className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
MCP
@ -112,6 +117,8 @@ const DebugPanel = ({
/>
)}
{debugTab === "desktop" && <DesktopTab getClient={getClient} />}
{debugTab === "mcp" && <McpTab getClient={getClient} />}
{debugTab === "processes" && <ProcessesTab getClient={getClient} />}

View file

@ -0,0 +1,142 @@
// @vitest-environment jsdom
import { act } from "react";
import { createRoot, type Root } from "react-dom/client";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { SandboxAgent } from "sandbox-agent";
import {
createDockerTestLayout,
disposeDockerTestLayout,
startDockerSandboxAgent,
type DockerSandboxAgentHandle,
} from "../../../../../../sdks/typescript/tests/helpers/docker.ts";
import DesktopTab from "./DesktopTab";
type DockerTestLayout = ReturnType<typeof createDockerTestLayout>;
/** Returns a promise that settles (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Polls `fn` until it yields a value other than `undefined`/`null` and returns that value.
 *
 * The condition is always evaluated before the deadline is consulted, so it runs
 * at least once (even with `timeoutMs <= 0`) and gets one last chance after the
 * final sleep instead of timing out on a condition that has just become true.
 *
 * @param fn Probe to evaluate; `undefined`/`null` mean "not ready yet". Falsy values such as `0` or `""` count as ready.
 * @param timeoutMs Maximum time to keep polling, in milliseconds.
 * @param stepMs Delay between two probes, in milliseconds.
 * @returns The first non-nullish value produced by `fn`.
 * @throws Error("timed out waiting for condition") when the deadline passes without a value.
 */
async function waitFor<T>(fn: () => T | undefined | null, timeoutMs = 20_000, stepMs = 50): Promise<T> {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    const value = fn();
    if (value !== undefined && value !== null) {
      return value;
    }
    if (Date.now() >= deadline) {
      break;
    }
    await sleep(stepMs);
  }
  throw new Error("timed out waiting for condition");
}
/**
 * Looks up the first `<button>` under `container` whose text content includes `label`.
 * Returns `undefined` when no button matches.
 */
function findButton(container: HTMLElement, label: string): HTMLButtonElement | undefined {
  const buttons = Array.from(container.querySelectorAll("button"));
  for (const candidate of buttons) {
    if (candidate.textContent?.includes(label)) {
      return candidate as HTMLButtonElement;
    }
  }
  return undefined;
}
describe.sequential("DesktopTab", () => {
let container: HTMLDivElement;
let root: Root;
let layout: DockerTestLayout | undefined;
let handle: DockerSandboxAgentHandle | undefined;
let client: SandboxAgent | undefined;
// Per-test setup: flag the global scope as a React act() environment and mount
// a fresh root on a detached-then-attached <div>.
beforeEach(() => {
  const reactGlobals = globalThis as { IS_REACT_ACT_ENVIRONMENT?: boolean };
  reactGlobals.IS_REACT_ACT_ENVIRONMENT = true;
  const host = document.createElement("div");
  document.body.appendChild(host);
  container = host;
  root = createRoot(host);
});
// Per-test teardown. Every step runs even when an earlier one throws, so a
// failing unmount or container shutdown cannot leak the remaining resources or
// leave stale fixtures behind for the next test. The first failure is rethrown
// once everything has been attempted.
afterEach(async () => {
  // Snapshot the shared fixtures so the closures below see narrowed, stable values.
  const activeClient = client;
  const activeHandle = handle;
  const activeLayout = layout;

  const cleanupSteps: Array<() => void | Promise<void>> = [
    async () => {
      await act(async () => {
        root.unmount();
      });
    },
    async () => {
      if (!activeClient) return;
      // Best effort: the desktop may never have been started and the client may already be closed.
      await activeClient.stopDesktop().catch(() => {});
      await activeClient.dispose().catch(() => {});
    },
    async () => {
      if (activeHandle) {
        await activeHandle.dispose();
      }
    },
    () => {
      if (activeLayout) {
        disposeDockerTestLayout(activeLayout);
      }
    },
    () => {
      container.remove();
    },
  ];

  const failures: unknown[] = [];
  for (const step of cleanupSteps) {
    try {
      await step();
    } catch (cleanupError) {
      failures.push(cleanupError);
    }
  }

  delete (globalThis as { IS_REACT_ACT_ENVIRONMENT?: boolean }).IS_REACT_ACT_ENVIRONMENT;
  client = undefined;
  handle = undefined;
  layout = undefined;

  if (failures.length > 0) {
    throw failures[0];
  }
});
/**
 * Creates a Docker test layout, starts a sandbox agent in it and connects an SDK client.
 * The layout, handle and client are also stored in the suite-level variables
 * so that the teardown hook can dispose them.
 *
 * With `pathMode: "replace"` the agent's PATH is set to the layout root directory only.
 */
async function connectDesktopClient(options?: { pathMode?: "merge" | "replace" }): Promise<SandboxAgent> {
  const pathMode = options?.pathMode;

  const createdLayout = createDockerTestLayout();
  layout = createdLayout;

  const startedHandle = await startDockerSandboxAgent(createdLayout, {
    timeoutMs: 30_000,
    pathMode,
    env: pathMode === "replace" ? { PATH: createdLayout.rootDir } : undefined,
  });
  handle = startedHandle;

  const connected = await SandboxAgent.connect({
    baseUrl: startedHandle.baseUrl,
    token: startedHandle.token,
  });
  client = connected;
  return connected;
}
// Verifies the "install required" view: state badge text, install command and
// the list of missing dependencies.
it("renders install remediation when desktop deps are missing", async () => {
// "replace" makes connectDesktopClient start the agent with PATH set to the
// layout root only — presumably so the desktop binaries cannot be found (confirm
// against the Docker helper).
const connectedClient = await connectDesktopClient({ pathMode: "replace" });
await act(async () => {
root.render(<DesktopTab getClient={() => connectedClient} />);
});
// The status is fetched asynchronously after mount; poll the rendered text
// until the state shows up.
await waitFor(() => {
const text = container.textContent ?? "";
return text.includes("install_required") ? text : undefined;
});
expect(container.textContent).toContain("install_required");
expect(container.textContent).toContain("sandbox-agent install desktop --yes");
expect(container.textContent).toContain("Xvfb");
});
// End-to-end happy path: inactive -> Start -> screenshot rendered -> Stop -> inactive.
it("starts desktop, refreshes screenshot, and stops desktop", async () => {
  // The runtime state is read from the first `.pill` on the page (the badge in
  // the "Desktop Runtime" card header) and compared exactly. A substring check
  // on the whole page text cannot distinguish the states, because "inactive"
  // itself contains "active".
  const readState = (): string | undefined => container.querySelector(".pill")?.textContent?.trim();

  const connectedClient = await connectDesktopClient();
  await act(async () => {
    root.render(<DesktopTab getClient={() => connectedClient} />);
  });

  await waitFor(() => (readState() === "inactive" ? true : undefined));

  const startButton = await waitFor(() => findButton(container, "Start Desktop"));
  await act(async () => {
    startButton.dispatchEvent(new MouseEvent("click", { bubbles: true }));
  });

  // The component captures a screenshot once the runtime reports "active".
  await waitFor(() => {
    const pending = container.querySelector("img[alt='Desktop screenshot']") as HTMLImageElement | null;
    return pending?.src ? pending : undefined;
  });
  const screenshot = container.querySelector("img[alt='Desktop screenshot']") as HTMLImageElement | null;
  expect(screenshot).toBeTruthy();
  expect(screenshot?.src.startsWith("blob:") || screenshot?.src.startsWith("data:image/png")).toBe(true);
  expect(readState()).toBe("active");

  const stopButton = await waitFor(() => findButton(container, "Stop Desktop"));
  await act(async () => {
    stopButton.dispatchEvent(new MouseEvent("click", { bubbles: true }));
  });

  await waitFor(() => (readState() === "inactive" ? true : undefined));
  expect(readState()).toBe("inactive");
});
});

View file

@ -0,0 +1,340 @@
import { Loader2, Monitor, Play, RefreshCw, Square, Camera } from "lucide-react";
import { useCallback, useEffect, useMemo, useState } from "react";
import { SandboxAgentError } from "sandbox-agent";
import type { DesktopStatusResponse, SandboxAgent } from "sandbox-agent";
const MIN_SPIN_MS = 350;
/**
 * Picks the most specific human-readable message for a caught value:
 * the problem detail of a `SandboxAgentError` if present, otherwise the
 * `Error` message, otherwise `fallback`.
 */
const extractErrorMessage = (error: unknown, fallback: string): string => {
  const detail = error instanceof SandboxAgentError ? error.problem?.detail : undefined;
  if (detail) {
    return detail;
  }
  return error instanceof Error ? error.message : fallback;
};
/**
 * Formats the runtime start timestamp for display.
 * Missing or empty values become "Not started"; strings that do not parse as a
 * date are shown unchanged; everything else is rendered in the local format.
 */
const formatStartedAt = (value: string | null | undefined): string => {
  if (value === null || value === undefined || value === "") return "Not started";
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) return value;
  return parsed.toLocaleString();
};
/**
 * Turns raw PNG bytes into a URL usable as an `<img src>`.
 * Uses an object URL when `URL.createObjectURL` is available and falls back to
 * a `data:` URL read through `FileReader` otherwise.
 */
const createScreenshotUrl = async (bytes: Uint8Array): Promise<string> => {
  // Copy into a fresh, exactly sized buffer so the Blob never includes bytes
  // outside the view (the input may be a window onto a larger buffer).
  const copy = Uint8Array.from(bytes);
  const image = new Blob([copy.buffer], { type: "image/png" });

  if (typeof URL.createObjectURL !== "function") {
    return await new Promise<string>((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => {
        const { result } = reader;
        if (typeof result !== "string") {
          reject(new Error("Unable to read screenshot blob."));
          return;
        }
        resolve(result);
      };
      reader.onerror = () => {
        reject(reader.error ?? new Error("Unable to read screenshot blob."));
      };
      reader.readAsDataURL(image);
    });
  }

  return URL.createObjectURL(image);
};
const DesktopTab = ({ getClient }: { getClient: () => SandboxAgent }) => {
const [status, setStatus] = useState<DesktopStatusResponse | null>(null);
const [loading, setLoading] = useState(false);
const [refreshing, setRefreshing] = useState(false);
const [acting, setActing] = useState<"start" | "stop" | null>(null);
const [error, setError] = useState<string | null>(null);
const [width, setWidth] = useState("1440");
const [height, setHeight] = useState("900");
const [dpi, setDpi] = useState("96");
const [screenshotUrl, setScreenshotUrl] = useState<string | null>(null);
const [screenshotLoading, setScreenshotLoading] = useState(false);
const [screenshotError, setScreenshotError] = useState<string | null>(null);
// Clears the displayed screenshot and, when it is a blob: object URL, releases it.
const revokeScreenshotUrl = useCallback(() => {
  setScreenshotUrl((previousUrl) => {
    if (previousUrl === null || !previousUrl.startsWith("blob:")) {
      return null;
    }
    if (typeof URL.revokeObjectURL === "function") {
      URL.revokeObjectURL(previousUrl);
    }
    return null;
  });
}, []);
// Fetches the desktop status and stores it in state.
// `mode` only selects which busy flag is raised ("initial" -> loading,
// "refresh" -> refreshing); both flags are lowered when the request settles.
// Resolves to the new status, or null when the request failed (the error banner is set).
const loadStatus = useCallback(
  async (mode: "initial" | "refresh" = "initial") => {
    const raiseBusyFlag = mode === "initial" ? setLoading : setRefreshing;
    raiseBusyFlag(true);
    setError(null);
    try {
      const latest = await getClient().getDesktopStatus();
      setStatus(latest);
      return latest;
    } catch (loadError) {
      const message = extractErrorMessage(loadError, "Unable to load desktop status.");
      setError(message);
      return null;
    } finally {
      setLoading(false);
      setRefreshing(false);
    }
  },
  [getClient],
);
// Captures a new screenshot and swaps it in for the current one.
// The previous URL is released both on success (before the new one is set)
// and on failure (the stale image is removed and an error banner is shown).
const refreshScreenshot = useCallback(async () => {
  setScreenshotLoading(true);
  setScreenshotError(null);
  try {
    const png = await getClient().takeDesktopScreenshot();
    revokeScreenshotUrl();
    const nextUrl = await createScreenshotUrl(png);
    setScreenshotUrl(nextUrl);
  } catch (captureError) {
    revokeScreenshotUrl();
    const message = extractErrorMessage(captureError, "Unable to capture desktop screenshot.");
    setScreenshotError(message);
  } finally {
    setScreenshotLoading(false);
  }
}, [getClient, revokeScreenshotUrl]);
useEffect(() => {
void loadStatus();
}, [loadStatus]);
useEffect(() => {
if (status?.state === "active") {
void refreshScreenshot();
} else {
revokeScreenshotUrl();
}
}, [refreshScreenshot, revokeScreenshotUrl, status?.state]);
useEffect(() => {
return () => {
revokeScreenshotUrl();
};
}, [revokeScreenshotUrl]);
// Starts the desktop runtime with the dimensions typed into the form and, once
// it reports "active", captures a first screenshot. The spinner is kept visible
// for at least MIN_SPIN_MS.
const handleStart = async () => {
  // Fields that do not parse as an integer are sent as `undefined` instead of NaN.
  const toInteger = (raw: string): number | undefined => {
    const parsed = Number.parseInt(raw, 10);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  setActing("start");
  setError(null);
  const startedAt = Date.now();
  try {
    const next = await getClient().startDesktop({
      width: toInteger(width),
      height: toInteger(height),
      dpi: toInteger(dpi),
    });
    setStatus(next);
    if (next.state === "active") {
      await refreshScreenshot();
    }
  } catch (startError) {
    const message = extractErrorMessage(startError, "Unable to start desktop runtime.");
    // loadStatus() resets the error banner before it fetches, so refresh first
    // and publish the start failure afterwards; setting it before the refresh
    // would wipe the message before the user ever sees it. If the refresh
    // itself fails, the start failure (the root cause) still takes precedence.
    await loadStatus("refresh");
    setError(message);
  } finally {
    const elapsedMs = Date.now() - startedAt;
    if (elapsedMs < MIN_SPIN_MS) {
      await new Promise((resolve) => window.setTimeout(resolve, MIN_SPIN_MS - elapsedMs));
    }
    setActing(null);
  }
};
// Stops the desktop runtime and drops the current screenshot. The spinner is
// kept visible for at least MIN_SPIN_MS.
const handleStop = async () => {
  setActing("stop");
  setError(null);
  const startedAt = Date.now();
  try {
    const next = await getClient().stopDesktop();
    setStatus(next);
    revokeScreenshotUrl();
  } catch (stopError) {
    const message = extractErrorMessage(stopError, "Unable to stop desktop runtime.");
    // loadStatus() resets the error banner before it fetches, so refresh first
    // and publish the stop failure afterwards; setting it before the refresh
    // would wipe the message before the user ever sees it. If the refresh
    // itself fails, the stop failure (the root cause) still takes precedence.
    await loadStatus("refresh");
    setError(message);
  } finally {
    const elapsedMs = Date.now() - startedAt;
    if (elapsedMs < MIN_SPIN_MS) {
      await new Promise((resolve) => window.setTimeout(resolve, MIN_SPIN_MS - elapsedMs));
    }
    setActing(null);
  }
};
const canRefreshScreenshot = status?.state === "active";
// Display string for the reported resolution: "W x H", with " @ N DPI" appended
// when a DPI is reported, or "Unknown" when no resolution is available.
const resolutionLabel = useMemo(() => {
  const current = status?.resolution;
  if (!current) {
    return "Unknown";
  }
  const size = `${current.width} x ${current.height}`;
  return current.dpi ? `${size} @ ${current.dpi} DPI` : size;
}, [status?.resolution]);
return (
<div className="desktop-panel">
<div className="inline-row" style={{ marginBottom: 16 }}>
<button className="button secondary small" onClick={() => void loadStatus("refresh")} disabled={loading || refreshing}>
<RefreshCw className={`button-icon ${loading || refreshing ? "spinner-icon" : ""}`} />
Refresh Status
</button>
<button className="button secondary small" onClick={() => void refreshScreenshot()} disabled={!canRefreshScreenshot || screenshotLoading}>
{screenshotLoading ? <Loader2 className="button-icon spinner-icon" /> : <Camera className="button-icon" />}
Refresh Screenshot
</button>
</div>
{error && <div className="banner error">{error}</div>}
{screenshotError && <div className="banner error">{screenshotError}</div>}
<div className="card">
<div className="card-header">
<span className="card-title">
<Monitor size={14} style={{ marginRight: 6 }} />
Desktop Runtime
</span>
<span
className={`pill ${
status?.state === "active" ? "success" : status?.state === "install_required" ? "warning" : status?.state === "failed" ? "danger" : ""
}`}
>
{status?.state ?? "unknown"}
</span>
</div>
<div className="desktop-state-grid">
<div>
<div className="card-meta">Display</div>
<div className="mono">{status?.display ?? "Not assigned"}</div>
</div>
<div>
<div className="card-meta">Resolution</div>
<div className="mono">{resolutionLabel}</div>
</div>
<div>
<div className="card-meta">Started</div>
<div>{formatStartedAt(status?.startedAt)}</div>
</div>
</div>
<div className="desktop-start-controls">
<div className="desktop-input-group">
<label className="label">Width</label>
<input className="setup-input mono" value={width} onChange={(event) => setWidth(event.target.value)} inputMode="numeric" />
</div>
<div className="desktop-input-group">
<label className="label">Height</label>
<input className="setup-input mono" value={height} onChange={(event) => setHeight(event.target.value)} inputMode="numeric" />
</div>
<div className="desktop-input-group">
<label className="label">DPI</label>
<input className="setup-input mono" value={dpi} onChange={(event) => setDpi(event.target.value)} inputMode="numeric" />
</div>
</div>
<div className="card-actions">
<button className="button success small" onClick={() => void handleStart()} disabled={acting === "start"}>
{acting === "start" ? <Loader2 className="button-icon spinner-icon" /> : <Play className="button-icon" />}
Start Desktop
</button>
<button className="button danger small" onClick={() => void handleStop()} disabled={acting === "stop"}>
{acting === "stop" ? <Loader2 className="button-icon spinner-icon" /> : <Square className="button-icon" />}
Stop Desktop
</button>
</div>
</div>
{status?.missingDependencies && status.missingDependencies.length > 0 && (
<div className="card">
<div className="card-header">
<span className="card-title">Missing Dependencies</span>
</div>
<div className="desktop-chip-list">
{status.missingDependencies.map((dependency) => (
<span key={dependency} className="pill warning">
{dependency}
</span>
))}
</div>
{status.installCommand && (
<>
<div className="card-meta" style={{ marginTop: 12 }}>
Install command
</div>
<div className="mono desktop-command">{status.installCommand}</div>
</>
)}
</div>
)}
{(status?.lastError || status?.runtimeLogPath || (status?.processes?.length ?? 0) > 0) && (
<div className="card">
<div className="card-header">
<span className="card-title">Diagnostics</span>
</div>
{status?.lastError && (
<div className="desktop-diagnostic-block">
<div className="card-meta">Last error</div>
<div className="mono">{status.lastError.code}</div>
<div>{status.lastError.message}</div>
</div>
)}
{status?.runtimeLogPath && (
<div className="desktop-diagnostic-block">
<div className="card-meta">Runtime log</div>
<div className="mono">{status.runtimeLogPath}</div>
</div>
)}
{status?.processes && status.processes.length > 0 && (
<div className="desktop-diagnostic-block">
<div className="card-meta">Processes</div>
<div className="desktop-process-list">
{status.processes.map((process) => (
<div key={`${process.name}-${process.pid ?? "none"}`} className="desktop-process-item">
<div>
<strong>{process.name}</strong>
<span className={`pill ${process.running ? "success" : "danger"}`} style={{ marginLeft: 8 }}>
{process.running ? "running" : "stopped"}
</span>
</div>
<div className="mono">{process.pid ? `pid ${process.pid}` : "no pid"}</div>
{process.logPath && <div className="mono">{process.logPath}</div>}
</div>
))}
</div>
</div>
)}
</div>
)}
<div className="card">
<div className="card-header">
<span className="card-title">Latest Screenshot</span>
{status?.state === "active" ? <span className="card-meta">Manual refresh only</span> : null}
</div>
{loading ? <div className="card-meta">Loading...</div> : null}
{!loading && !screenshotUrl && (
<div className="desktop-screenshot-empty">
{status?.state === "active" ? "No screenshot loaded yet." : "Start the desktop runtime to capture a screenshot."}
</div>
)}
{screenshotUrl && (
<div className="desktop-screenshot-frame">
<img src={screenshotUrl} alt="Desktop screenshot" className="desktop-screenshot-image" />
</div>
)}
</div>
</div>
);
};
export default DesktopTab;