feat: desktop computer-use APIs with neko-based streaming

Add desktop runtime management (Xvfb, openbox, dbus), screen capture,
mouse/keyboard input, and video streaming via the neko binary extracted
from the m1k1o/neko container. Includes a Docker test rig, TypeScript SDK
desktop support, and an inspector Desktop tab.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nathan Flurry 2026-03-16 17:56:39 -07:00
parent 3895e34bdb
commit 33821d8660
66 changed files with 13190 additions and 1135 deletions

View file

@ -23,12 +23,35 @@ import {
type SetSessionModeRequest,
} from "acp-http-client";
import type { SandboxProvider } from "./providers/types.ts";
import { DesktopStreamSession, type DesktopStreamConnectOptions } from "./desktop-stream.ts";
import {
type AcpServerListResponse,
type AgentInfo,
type AgentInstallRequest,
type AgentInstallResponse,
type AgentListResponse,
type DesktopActionResponse,
type DesktopDisplayInfoResponse,
type DesktopKeyboardDownRequest,
type DesktopKeyboardPressRequest,
type DesktopKeyboardTypeRequest,
type DesktopMouseClickRequest,
type DesktopMouseDownRequest,
type DesktopMouseDragRequest,
type DesktopMouseMoveRequest,
type DesktopMousePositionResponse,
type DesktopMouseScrollRequest,
type DesktopMouseUpRequest,
type DesktopKeyboardUpRequest,
type DesktopRecordingInfo,
type DesktopRecordingListResponse,
type DesktopRecordingStartRequest,
type DesktopRegionScreenshotQuery,
type DesktopScreenshotQuery,
type DesktopStartRequest,
type DesktopStatusResponse,
type DesktopStreamStatusResponse,
type DesktopWindowListResponse,
type FsActionResponse,
type FsDeleteQuery,
type FsEntriesQuery,
@ -53,7 +76,9 @@ import {
type ProcessInfo,
type ProcessInputRequest,
type ProcessInputResponse,
type ProcessListQuery,
type ProcessListResponse,
type ProcessOwner,
type ProcessLogEntry,
type ProcessLogsQuery,
type ProcessLogsResponse,
@ -201,6 +226,7 @@ export interface ProcessTerminalConnectOptions extends ProcessTerminalWebSocketU
}
export type ProcessTerminalSessionOptions = ProcessTerminalConnectOptions;
export type DesktopStreamSessionOptions = DesktopStreamConnectOptions;
export class SandboxAgentError extends Error {
readonly status: number;
@ -1533,6 +1559,148 @@ export class SandboxAgent {
return this.requestHealth();
}
/**
 * Sends `POST {API_PREFIX}/desktop/start` with `request` as the body.
 *
 * @param request - Start options; defaults to an empty object.
 * @returns The value resolved by `requestJson`, typed as `DesktopStatusResponse`.
 */
async startDesktop(request: DesktopStartRequest = {}): Promise<DesktopStatusResponse> {
  const endpoint = `${API_PREFIX}/desktop/start`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/stop` without a body.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopStatusResponse`.
 */
async stopDesktop(): Promise<DesktopStatusResponse> {
  const endpoint = `${API_PREFIX}/desktop/stop`;
  return this.requestJson("POST", endpoint);
}
/**
 * Sends `GET {API_PREFIX}/desktop/status`.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopStatusResponse`.
 */
async getDesktopStatus(): Promise<DesktopStatusResponse> {
  const endpoint = `${API_PREFIX}/desktop/status`;
  return this.requestJson("GET", endpoint);
}
/**
 * Sends `GET {API_PREFIX}/desktop/display/info`.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopDisplayInfoResponse`.
 */
async getDesktopDisplayInfo(): Promise<DesktopDisplayInfoResponse> {
  const endpoint = `${API_PREFIX}/desktop/display/info`;
  return this.requestJson("GET", endpoint);
}
/**
 * Sends `GET {API_PREFIX}/desktop/screenshot` with an `image/*` accept value
 * and returns the raw response body.
 *
 * @param query - Query options forwarded to `requestRaw`; defaults to an empty object.
 * @returns The response body bytes as a `Uint8Array`.
 */
async takeDesktopScreenshot(query: DesktopScreenshotQuery = {}): Promise<Uint8Array> {
  const endpoint = `${API_PREFIX}/desktop/screenshot`;
  const response = await this.requestRaw("GET", endpoint, { query, accept: "image/*" });
  return new Uint8Array(await response.arrayBuffer());
}
/**
 * Sends `GET {API_PREFIX}/desktop/screenshot/region` with an `image/*` accept
 * value and returns the raw response body.
 *
 * @param query - Query options forwarded to `requestRaw` (required).
 * @returns The response body bytes as a `Uint8Array`.
 */
async takeDesktopRegionScreenshot(query: DesktopRegionScreenshotQuery): Promise<Uint8Array> {
  const endpoint = `${API_PREFIX}/desktop/screenshot/region`;
  const response = await this.requestRaw("GET", endpoint, { query, accept: "image/*" });
  return new Uint8Array(await response.arrayBuffer());
}
/**
 * Sends `GET {API_PREFIX}/desktop/mouse/position`.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async getDesktopMousePosition(): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/position`;
  return this.requestJson("GET", endpoint);
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/move` with `request` as the body.
 *
 * @param request - Move parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async moveDesktopMouse(request: DesktopMouseMoveRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/move`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/click` with `request` as the body.
 *
 * @param request - Click parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async clickDesktop(request: DesktopMouseClickRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/click`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/down` with `request` as the body.
 *
 * @param request - Button-press parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async mouseDownDesktop(request: DesktopMouseDownRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/down`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/up` with `request` as the body.
 *
 * @param request - Button-release parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async mouseUpDesktop(request: DesktopMouseUpRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/up`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/drag` with `request` as the body.
 *
 * @param request - Drag parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async dragDesktopMouse(request: DesktopMouseDragRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/drag`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/mouse/scroll` with `request` as the body.
 *
 * @param request - Scroll parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopMousePositionResponse`.
 */
async scrollDesktop(request: DesktopMouseScrollRequest): Promise<DesktopMousePositionResponse> {
  const endpoint = `${API_PREFIX}/desktop/mouse/scroll`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/keyboard/type` with `request` as the body.
 *
 * @param request - Typing parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopActionResponse`.
 */
async typeDesktopText(request: DesktopKeyboardTypeRequest): Promise<DesktopActionResponse> {
  const endpoint = `${API_PREFIX}/desktop/keyboard/type`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/keyboard/press` with `request` as the body.
 *
 * @param request - Key-press parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopActionResponse`.
 */
async pressDesktopKey(request: DesktopKeyboardPressRequest): Promise<DesktopActionResponse> {
  const endpoint = `${API_PREFIX}/desktop/keyboard/press`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/keyboard/down` with `request` as the body.
 *
 * @param request - Key-down parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopActionResponse`.
 */
async keyDownDesktop(request: DesktopKeyboardDownRequest): Promise<DesktopActionResponse> {
  const endpoint = `${API_PREFIX}/desktop/keyboard/down`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/keyboard/up` with `request` as the body.
 *
 * @param request - Key-up parameters forwarded unchanged.
 * @returns The value resolved by `requestJson`, typed as `DesktopActionResponse`.
 */
async keyUpDesktop(request: DesktopKeyboardUpRequest): Promise<DesktopActionResponse> {
  const endpoint = `${API_PREFIX}/desktop/keyboard/up`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `GET {API_PREFIX}/desktop/windows`.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopWindowListResponse`.
 */
async listDesktopWindows(): Promise<DesktopWindowListResponse> {
  const endpoint = `${API_PREFIX}/desktop/windows`;
  return this.requestJson("GET", endpoint);
}
/**
 * Sends `POST {API_PREFIX}/desktop/recording/start` with `request` as the body.
 *
 * @param request - Recording options; defaults to an empty object.
 * @returns The value resolved by `requestJson`, typed as `DesktopRecordingInfo`.
 */
async startDesktopRecording(request: DesktopRecordingStartRequest = {}): Promise<DesktopRecordingInfo> {
  const endpoint = `${API_PREFIX}/desktop/recording/start`;
  return this.requestJson("POST", endpoint, { body: request });
}
/**
 * Sends `POST {API_PREFIX}/desktop/recording/stop` without a body.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopRecordingInfo`.
 */
async stopDesktopRecording(): Promise<DesktopRecordingInfo> {
  const endpoint = `${API_PREFIX}/desktop/recording/stop`;
  return this.requestJson("POST", endpoint);
}
/**
 * Sends `GET {API_PREFIX}/desktop/recordings`.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopRecordingListResponse`.
 */
async listDesktopRecordings(): Promise<DesktopRecordingListResponse> {
  const endpoint = `${API_PREFIX}/desktop/recordings`;
  return this.requestJson("GET", endpoint);
}
/**
 * Sends `GET {API_PREFIX}/desktop/recordings/{id}`.
 *
 * @param id - Recording identifier; URI-component-encoded before being placed in the path.
 * @returns The value resolved by `requestJson`, typed as `DesktopRecordingInfo`.
 */
async getDesktopRecording(id: string): Promise<DesktopRecordingInfo> {
  const recordingSegment = encodeURIComponent(id);
  return this.requestJson("GET", `${API_PREFIX}/desktop/recordings/${recordingSegment}`);
}
/**
 * Sends `GET {API_PREFIX}/desktop/recordings/{id}/download` with a `video/mp4`
 * accept value and returns the raw response body.
 *
 * Note: the whole body is buffered in memory via `arrayBuffer()`.
 *
 * @param id - Recording identifier; URI-component-encoded before being placed in the path.
 * @returns The response body bytes as a `Uint8Array`.
 */
async downloadDesktopRecording(id: string): Promise<Uint8Array> {
  const recordingSegment = encodeURIComponent(id);
  const endpoint = `${API_PREFIX}/desktop/recordings/${recordingSegment}/download`;
  const response = await this.requestRaw("GET", endpoint, { accept: "video/mp4" });
  return new Uint8Array(await response.arrayBuffer());
}
/**
 * Sends `DELETE {API_PREFIX}/desktop/recordings/{id}` and discards the response.
 *
 * @param id - Recording identifier; URI-component-encoded before being placed in the path.
 */
async deleteDesktopRecording(id: string): Promise<void> {
  const recordingSegment = encodeURIComponent(id);
  const endpoint = `${API_PREFIX}/desktop/recordings/${recordingSegment}`;
  await this.requestRaw("DELETE", endpoint);
}
/**
 * Sends `POST {API_PREFIX}/desktop/stream/start` without a body.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopStreamStatusResponse`.
 */
async startDesktopStream(): Promise<DesktopStreamStatusResponse> {
  const endpoint = `${API_PREFIX}/desktop/stream/start`;
  return this.requestJson("POST", endpoint);
}
/**
 * Sends `POST {API_PREFIX}/desktop/stream/stop` without a body.
 *
 * @returns The value resolved by `requestJson`, typed as `DesktopStreamStatusResponse`.
 */
async stopDesktopStream(): Promise<DesktopStreamStatusResponse> {
  const endpoint = `${API_PREFIX}/desktop/stream/stop`;
  return this.requestJson("POST", endpoint);
}
async listAgents(options?: AgentQueryOptions): Promise<AgentListResponse> {
return this.requestJson("GET", `${API_PREFIX}/agents`, {
query: toAgentQuery(options),
@ -1665,8 +1833,10 @@ export class SandboxAgent {
});
}
async listProcesses(): Promise<ProcessListResponse> {
return this.requestJson("GET", `${API_PREFIX}/processes`);
/**
 * Sends `GET {API_PREFIX}/processes`, forwarding `query` to `requestJson`.
 *
 * @param query - Optional list filter; passed through as-is (may be `undefined`).
 * @returns The value resolved by `requestJson`, typed as `ProcessListResponse`.
 */
async listProcesses(query?: ProcessListQuery): Promise<ProcessListResponse> {
  const endpoint = `${API_PREFIX}/processes`;
  return this.requestJson("GET", endpoint, { query });
}
async getProcess(id: string): Promise<ProcessInfo> {
@ -1754,6 +1924,32 @@ export class SandboxAgent {
return new ProcessTerminalSession(this.connectProcessTerminalWebSocket(id, options));
}
/**
 * Builds the WebSocket URL for `{API_PREFIX}/desktop/stream/ws`.
 *
 * The token is handed to `buildUrl` under the `access_token` key (presumably a
 * query parameter — confirm against `buildUrl`), and the result is converted
 * with `toWebSocketUrl`.
 *
 * @param options - `accessToken` overrides the client's own `token` when not nullish.
 * @returns The WebSocket URL string.
 */
buildDesktopStreamWebSocketUrl(options: ProcessTerminalWebSocketUrlOptions = {}): string {
  const accessToken = options.accessToken ?? this.token;
  const httpUrl = this.buildUrl(`${API_PREFIX}/desktop/stream/ws`, {
    access_token: accessToken,
  });
  return toWebSocketUrl(httpUrl);
}
/**
 * Opens a raw WebSocket to the desktop stream endpoint.
 *
 * @param options - `WebSocket` selects the constructor (falls back to
 *   `globalThis.WebSocket`), `accessToken` is forwarded to
 *   `buildDesktopStreamWebSocketUrl`, and `protocols` is passed as the
 *   constructor's second argument.
 * @returns The newly constructed socket.
 * @throws Error when neither `options.WebSocket` nor a global `WebSocket` is available.
 */
connectDesktopStreamWebSocket(options: DesktopStreamConnectOptions = {}): WebSocket {
  const SocketImpl = options.WebSocket ?? globalThis.WebSocket;
  if (!SocketImpl) {
    throw new Error("WebSocket API is not available; provide a WebSocket implementation.");
  }

  const url = this.buildDesktopStreamWebSocketUrl({ accessToken: options.accessToken });
  return new SocketImpl(url, options.protocols);
}
/**
 * Opens the desktop stream socket via `connectDesktopStreamWebSocket` and wraps
 * it in a `DesktopStreamSession`.
 *
 * @param options - Forwarded unchanged to `connectDesktopStreamWebSocket`.
 * @returns A session bound to the new socket.
 */
connectDesktopStream(options: DesktopStreamSessionOptions = {}): DesktopStreamSession {
  const socket = this.connectDesktopStreamWebSocket(options);
  return new DesktopStreamSession(socket);
}
private async getLiveConnection(agent: string): Promise<LiveAcpConnection> {
await this.awaitHealthy();

View file

@ -0,0 +1,236 @@
import type { DesktopMouseButton } from "./types.ts";
// Numeric values compared against `socket.readyState` below. They match the
// standard WebSocket CONNECTING (0), OPEN (1) and CLOSED (3) states; literals
// are used instead of `WebSocket.OPEN` etc., presumably so a caller-supplied
// WebSocket implementation does not need to expose the static constants.
const WS_READY_STATE_CONNECTING = 0;
const WS_READY_STATE_OPEN = 1;
const WS_READY_STATE_CLOSED = 3;
/**
 * Text-frame status message with `type: "ready"`, delivered to `onReady`
 * listeners. `width` and `height` are numbers taken verbatim from the frame
 * (presumably the stream dimensions in pixels — confirm against the server).
 */
export interface DesktopStreamReadyStatus {
type: "ready";
width: number;
height: number;
}
/**
 * Text-frame status message with `type: "error"`, delivered to `onError`
 * listeners together with locally raised `Error` instances.
 */
export interface DesktopStreamErrorStatus {
type: "error";
message: string;
}
/** Union of the status messages recognised in text frames, discriminated by `type`. */
export type DesktopStreamStatusMessage = DesktopStreamReadyStatus | DesktopStreamErrorStatus;
/**
 * Options accepted when opening a desktop stream connection. All fields are
 * optional.
 */
export interface DesktopStreamConnectOptions {
// Token to use for this connection instead of the client's default.
accessToken?: string;
// WebSocket constructor to use instead of the global one.
WebSocket?: typeof WebSocket;
// Subprotocol(s) handed to the WebSocket constructor.
protocols?: string | string[];
}
/**
 * Frames the client sends over the stream socket, discriminated by `type`.
 * `DesktopStreamSession` serialises each frame with `JSON.stringify` before
 * sending, so optional fields left `undefined` are omitted from the payload.
 */
type DesktopStreamClientFrame =
| {
// Pointer move to the given coordinates.
type: "moveMouse";
x: number;
y: number;
}
| {
// Button press/release; coordinates and button are all optional.
type: "mouseDown" | "mouseUp";
x?: number;
y?: number;
button?: DesktopMouseButton;
}
| {
// Scroll at a position; deltas are optional.
type: "scroll";
x: number;
y: number;
deltaX?: number;
deltaY?: number;
}
| {
// Key press/release identified by a string key name.
type: "keyDown" | "keyUp";
key: string;
}
| {
// Sent once by `DesktopStreamSession.close()` before the socket is closed.
type: "close";
};
/**
 * Event-oriented wrapper around a desktop stream WebSocket.
 *
 * Incoming text frames are parsed as status messages (`ready` / `error`),
 * every other frame is decoded to bytes and handed to frame listeners.
 * Outgoing input events are JSON-encoded and sent only while the socket is
 * open; otherwise they are silently dropped.
 */
export class DesktopStreamSession {
  /** Underlying socket; the constructor switches its `binaryType` to `"arraybuffer"`. */
  readonly socket: WebSocket;
  /** Resolves once the socket emits its `close` event. */
  readonly closed: Promise<void>;

  private readonly readyListeners = new Set<(status: DesktopStreamReadyStatus) => void>();
  private readonly frameListeners = new Set<(frame: Uint8Array) => void>();
  private readonly errorListeners = new Set<(error: DesktopStreamErrorStatus | Error) => void>();
  private readonly closeListeners = new Set<() => void>();
  // Guards against sending the "close" control frame more than once.
  private closeSignalSent = false;
  // Assigned synchronously inside the `closed` promise executor.
  private closedResolve!: () => void;

  /**
   * @param socket - Socket to wrap; message, error and close handlers are attached immediately.
   */
  constructor(socket: WebSocket) {
    this.socket = socket;
    this.socket.binaryType = "arraybuffer";
    this.closed = new Promise<void>((settle) => {
      this.closedResolve = settle;
    });

    socket.addEventListener("message", (event) => {
      // Fire-and-forget: handleMessage reports its own failures via emitError.
      void this.handleMessage(event.data);
    });
    socket.addEventListener("error", () => {
      this.emitError(new Error("Desktop stream websocket connection failed."));
    });
    socket.addEventListener("close", () => {
      // Resolve `closed` first, then notify listeners.
      this.closedResolve();
      this.closeListeners.forEach((notify) => notify());
    });
  }

  /** Registers a listener for `ready` status frames; returns an unsubscribe function. */
  onReady(listener: (status: DesktopStreamReadyStatus) => void): () => void {
    return this.subscribe(this.readyListeners, listener);
  }

  /** Registers a listener for decoded binary frames; returns an unsubscribe function. */
  onFrame(listener: (frame: Uint8Array) => void): () => void {
    return this.subscribe(this.frameListeners, listener);
  }

  /** Registers a listener for error status frames and local errors; returns an unsubscribe function. */
  onError(listener: (error: DesktopStreamErrorStatus | Error) => void): () => void {
    return this.subscribe(this.errorListeners, listener);
  }

  /** Registers a listener for the socket `close` event; returns an unsubscribe function. */
  onClose(listener: () => void): () => void {
    return this.subscribe(this.closeListeners, listener);
  }

  /** Sends a `moveMouse` frame. */
  moveMouse(x: number, y: number): void {
    this.sendFrame({ type: "moveMouse", x, y });
  }

  /** Sends a `mouseDown` frame; all arguments are optional. */
  mouseDown(button?: DesktopMouseButton, x?: number, y?: number): void {
    this.sendFrame({ type: "mouseDown", button, x, y });
  }

  /** Sends a `mouseUp` frame; all arguments are optional. */
  mouseUp(button?: DesktopMouseButton, x?: number, y?: number): void {
    this.sendFrame({ type: "mouseUp", button, x, y });
  }

  /** Sends a `scroll` frame at the given position with optional deltas. */
  scroll(x: number, y: number, deltaX?: number, deltaY?: number): void {
    this.sendFrame({ type: "scroll", x, y, deltaX, deltaY });
  }

  /** Sends a `keyDown` frame. */
  keyDown(key: string): void {
    this.sendFrame({ type: "keyDown", key });
  }

  /** Sends a `keyUp` frame. */
  keyUp(key: string): void {
    this.sendFrame({ type: "keyUp", key });
  }

  /**
   * Closes the session.
   *
   * - While connecting: defers until the socket opens, then closes.
   * - While open: sends the `close` control frame (once) and closes the socket.
   * - Already closed: no-op. Any other state: closes the socket directly.
   */
  close(): void {
    switch (this.socket.readyState) {
      case WS_READY_STATE_CONNECTING:
        this.socket.addEventListener(
          "open",
          () => {
            this.close();
          },
          { once: true },
        );
        return;
      case WS_READY_STATE_OPEN:
        if (!this.closeSignalSent) {
          this.closeSignalSent = true;
          this.sendFrame({ type: "close" });
        }
        this.socket.close();
        return;
      case WS_READY_STATE_CLOSED:
        return;
      default:
        this.socket.close();
    }
  }

  /** Adds `listener` to `registry` and returns a function that removes it again. */
  private subscribe<T>(registry: Set<T>, listener: T): () => void {
    registry.add(listener);
    return () => {
      registry.delete(listener);
    };
  }

  /**
   * Dispatches one incoming frame. Any exception (parse failure, decode
   * failure, throwing listener) is converted to an `Error` and routed to the
   * error listeners.
   */
  private async handleMessage(data: unknown): Promise<void> {
    try {
      if (typeof data !== "string") {
        const bytes = await decodeBinaryFrame(data);
        this.frameListeners.forEach((deliver) => deliver(bytes));
        return;
      }

      const status = parseStatusFrame(data);
      if (!status) {
        this.emitError(new Error("Received invalid desktop stream control frame."));
        return;
      }
      if (status.type === "ready") {
        this.readyListeners.forEach((deliver) => deliver(status));
        return;
      }
      this.emitError(status);
    } catch (caught) {
      this.emitError(caught instanceof Error ? caught : new Error(String(caught)));
    }
  }

  /** JSON-encodes and sends `frame` if the socket is open; otherwise drops it. */
  private sendFrame(frame: DesktopStreamClientFrame): void {
    if (this.socket.readyState === WS_READY_STATE_OPEN) {
      this.socket.send(JSON.stringify(frame));
    }
  }

  /** Notifies every registered error listener. */
  private emitError(error: DesktopStreamErrorStatus | Error): void {
    this.errorListeners.forEach((report) => report(error));
  }
}
/**
 * Parses a text frame into a status message.
 *
 * The payload is untrusted wire data, so it is validated rather than cast:
 * malformed JSON, non-object JSON values (`null`, numbers, strings, ...) and
 * objects that do not match a known status shape all yield `null` instead of
 * throwing. Only the recognised fields are copied into the result.
 *
 * @param payload - Raw text received from the socket.
 * @returns A `ready` or `error` status message, or `null` when the payload is not a valid status frame.
 */
function parseStatusFrame(payload: string): DesktopStreamStatusMessage | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Not JSON at all: report it as an invalid frame, not a SyntaxError.
    return null;
  }

  // JSON.parse may legitimately return null or a primitive; property access
  // on null would throw, so reject anything that is not an object up front.
  if (typeof parsed !== "object" || parsed === null) {
    return null;
  }
  const value = parsed as Record<string, unknown>;

  if (value.type === "ready" && typeof value.width === "number" && typeof value.height === "number") {
    return {
      type: "ready",
      width: value.width,
      height: value.height,
    };
  }
  if (value.type === "error" && typeof value.message === "string") {
    return {
      type: "error",
      message: value.message,
    };
  }
  return null;
}
/**
 * Normalises a non-text WebSocket payload to a `Uint8Array`.
 *
 * Accepts an `ArrayBuffer`, any `ArrayBufferView` (the returned array shares
 * the view's underlying buffer and covers exactly the view's byte range), or a
 * `Blob` when the runtime defines one.
 *
 * @param data - Payload taken from a message event.
 * @returns The payload bytes.
 * @throws Error when `data` is none of the supported types.
 */
async function decodeBinaryFrame(data: unknown): Promise<Uint8Array> {
  if (data instanceof ArrayBuffer) {
    return new Uint8Array(data);
  }

  if (ArrayBuffer.isView(data)) {
    const { buffer, byteOffset, byteLength } = data;
    return new Uint8Array(buffer, byteOffset, byteLength);
  }

  // `Blob` is not defined in every runtime, so probe before using instanceof.
  const blobAvailable = typeof Blob !== "undefined";
  if (blobAvailable && data instanceof Blob) {
    const contents = await data.arrayBuffer();
    return new Uint8Array(contents);
  }

  throw new Error("Unsupported desktop stream binary frame type.");
}

File diff suppressed because it is too large Load diff

View file

@ -14,10 +14,18 @@ export {
export { AcpRpcError } from "acp-http-client";
export { buildInspectorUrl } from "./inspector.ts";
export { DesktopStreamSession } from "./desktop-stream.ts";
export type {
DesktopStreamConnectOptions,
DesktopStreamErrorStatus,
DesktopStreamReadyStatus,
DesktopStreamStatusMessage,
} from "./desktop-stream.ts";
export type {
SandboxAgentHealthWaitOptions,
AgentQueryOptions,
DesktopStreamSessionOptions,
ProcessLogFollowQuery,
ProcessLogListener,
ProcessLogSubscription,
@ -50,6 +58,37 @@ export type {
AgentInstallRequest,
AgentInstallResponse,
AgentListResponse,
DesktopActionResponse,
DesktopDisplayInfoResponse,
DesktopErrorInfo,
DesktopKeyboardDownRequest,
DesktopKeyboardUpRequest,
DesktopKeyModifiers,
DesktopKeyboardPressRequest,
DesktopKeyboardTypeRequest,
DesktopMouseButton,
DesktopMouseClickRequest,
DesktopMouseDownRequest,
DesktopMouseDragRequest,
DesktopMouseMoveRequest,
DesktopMousePositionResponse,
DesktopMouseScrollRequest,
DesktopMouseUpRequest,
DesktopProcessInfo,
DesktopRecordingInfo,
DesktopRecordingListResponse,
DesktopRecordingStartRequest,
DesktopRecordingStatus,
DesktopRegionScreenshotQuery,
DesktopResolution,
DesktopScreenshotFormat,
DesktopScreenshotQuery,
DesktopStartRequest,
DesktopState,
DesktopStatusResponse,
DesktopStreamStatusResponse,
DesktopWindowInfo,
DesktopWindowListResponse,
FsActionResponse,
FsDeleteQuery,
FsEntriesQuery,
@ -74,10 +113,12 @@ export type {
ProcessInfo,
ProcessInputRequest,
ProcessInputResponse,
ProcessListQuery,
ProcessListResponse,
ProcessLogEntry,
ProcessLogsQuery,
ProcessLogsResponse,
ProcessOwner,
ProcessLogsStream,
ProcessRunRequest,
ProcessRunResponse,

View file

@ -4,6 +4,38 @@ import type { components, operations } from "./generated/openapi.ts";
export type ProblemDetails = components["schemas"]["ProblemDetails"];
export type HealthResponse = JsonResponse<operations["get_v1_health"], 200>;
// Desktop API types. Each alias is derived from the generated OpenAPI
// `components` / `operations` definitions rather than declared by hand.

// --- Runtime state and lifecycle ---
export type DesktopState = components["schemas"]["DesktopState"];
export type DesktopResolution = components["schemas"]["DesktopResolution"];
export type DesktopErrorInfo = components["schemas"]["DesktopErrorInfo"];
export type DesktopProcessInfo = components["schemas"]["DesktopProcessInfo"];
export type DesktopStatusResponse = JsonResponse<operations["get_v1_desktop_status"], 200>;
export type DesktopStartRequest = JsonRequestBody<operations["post_v1_desktop_start"]>;

// --- Screenshots ---
export type DesktopScreenshotFormat = components["schemas"]["DesktopScreenshotFormat"];
// Falls back to an always-empty object type when the operation's query params
// resolve to `never` (presumably so callers can still pass `{}` — confirm
// against the `QueryParams` helper).
export type DesktopScreenshotQuery =
QueryParams<operations["get_v1_desktop_screenshot"]> extends never ? Record<string, never> : QueryParams<operations["get_v1_desktop_screenshot"]>;
export type DesktopRegionScreenshotQuery = QueryParams<operations["get_v1_desktop_screenshot_region"]>;

// --- Mouse ---
export type DesktopMousePositionResponse = JsonResponse<operations["get_v1_desktop_mouse_position"], 200>;
export type DesktopMouseButton = components["schemas"]["DesktopMouseButton"];
export type DesktopMouseMoveRequest = JsonRequestBody<operations["post_v1_desktop_mouse_move"]>;
export type DesktopMouseClickRequest = JsonRequestBody<operations["post_v1_desktop_mouse_click"]>;
export type DesktopMouseDownRequest = JsonRequestBody<operations["post_v1_desktop_mouse_down"]>;
export type DesktopMouseUpRequest = JsonRequestBody<operations["post_v1_desktop_mouse_up"]>;
export type DesktopMouseDragRequest = JsonRequestBody<operations["post_v1_desktop_mouse_drag"]>;
export type DesktopMouseScrollRequest = JsonRequestBody<operations["post_v1_desktop_mouse_scroll"]>;

// --- Keyboard ---
export type DesktopKeyboardTypeRequest = JsonRequestBody<operations["post_v1_desktop_keyboard_type"]>;
export type DesktopKeyModifiers = components["schemas"]["DesktopKeyModifiers"];
export type DesktopKeyboardPressRequest = JsonRequestBody<operations["post_v1_desktop_keyboard_press"]>;
export type DesktopKeyboardDownRequest = JsonRequestBody<operations["post_v1_desktop_keyboard_down"]>;
export type DesktopKeyboardUpRequest = JsonRequestBody<operations["post_v1_desktop_keyboard_up"]>;
// Derived from the keyboard/type operation only, but the client also uses it
// as the result type for keyboard press/down/up.
// NOTE(review): assumes those endpoints share the same 200 schema — verify against the OpenAPI spec.
export type DesktopActionResponse = JsonResponse<operations["post_v1_desktop_keyboard_type"], 200>;

// --- Display and windows ---
export type DesktopDisplayInfoResponse = JsonResponse<operations["get_v1_desktop_display_info"], 200>;
export type DesktopWindowInfo = components["schemas"]["DesktopWindowInfo"];
export type DesktopWindowListResponse = JsonResponse<operations["get_v1_desktop_windows"], 200>;

// --- Recording ---
export type DesktopRecordingStartRequest = JsonRequestBody<operations["post_v1_desktop_recording_start"]>;
export type DesktopRecordingStatus = components["schemas"]["DesktopRecordingStatus"];
// Derived from recording/start; the client reuses it for recording stop and
// single-recording lookup.
// NOTE(review): assumes identical 200 schemas — verify against the OpenAPI spec.
export type DesktopRecordingInfo = JsonResponse<operations["post_v1_desktop_recording_start"], 200>;
export type DesktopRecordingListResponse = JsonResponse<operations["get_v1_desktop_recordings"], 200>;

// --- Streaming ---
// Derived from stream/start; the client reuses it for stream/stop.
// NOTE(review): assumes identical 200 schemas — verify against the OpenAPI spec.
export type DesktopStreamStatusResponse = JsonResponse<operations["post_v1_desktop_stream_start"], 200>;
export type AgentListResponse = JsonResponse<operations["get_v1_agents"], 200>;
export type AgentInfo = components["schemas"]["AgentInfo"];
export type AgentQuery = QueryParams<operations["get_v1_agents"]>;
@ -37,11 +69,13 @@ export type ProcessCreateRequest = JsonRequestBody<operations["post_v1_processes
export type ProcessInfo = components["schemas"]["ProcessInfo"];
export type ProcessInputRequest = JsonRequestBody<operations["post_v1_process_input"]>;
export type ProcessInputResponse = JsonResponse<operations["post_v1_process_input"], 200>;
export type ProcessListQuery = QueryParams<operations["get_v1_processes"]>;
export type ProcessListResponse = JsonResponse<operations["get_v1_processes"], 200>;
export type ProcessLogEntry = components["schemas"]["ProcessLogEntry"];
export type ProcessLogsQuery = QueryParams<operations["get_v1_process_logs"]>;
export type ProcessLogsResponse = JsonResponse<operations["get_v1_process_logs"], 200>;
export type ProcessLogsStream = components["schemas"]["ProcessLogsStream"];
export type ProcessOwner = components["schemas"]["ProcessOwner"];
export type ProcessRunRequest = JsonRequestBody<operations["post_v1_processes_run"]>;
export type ProcessRunResponse = JsonResponse<operations["post_v1_processes_run"], 200>;
export type ProcessSignalQuery = QueryParams<operations["post_v1_process_stop"]>;