diff --git a/CLAUDE.md b/CLAUDE.md index 6266297..91695e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,6 +33,12 @@ - `docs/agent-capabilities.mdx` lists models/modes/thought levels per agent. Update it when adding a new agent or changing `fallback_config_options`. If its "Last updated" date is >2 weeks old, re-run `cd scripts/agent-configs && npx tsx dump.ts` and update the doc to match. Source data: `scripts/agent-configs/resources/*.json` and hardcoded entries in `server/packages/sandbox-agent/src/router/support.rs` (`fallback_config_options`). - Some agent models are gated by subscription (e.g. Claude `opus`). The live report only shows models available to the current credentials. The static doc and JSON resource files should list all known models regardless of subscription tier. +## .context Directory + +- The `.context/` directory is gitignored and used for temporary workspace files (plans, screenshots, build stamps, etc.). +- Never commit files from `.context/` to git. +- Clean up temporary files (screenshots, attachments, build artifacts) from `.context/` when they are no longer needed. + ## Docker Test Image - Docker-backed Rust and TypeScript tests build `docker/test-agent/Dockerfile` directly in-process and cache the image tag only in memory (`OnceLock` in Rust, module-level variable in TypeScript). diff --git a/docker/runtime/Dockerfile b/docker/runtime/Dockerfile index 326b9bb..b0301a2 100644 --- a/docker/runtime/Dockerfile +++ b/docker/runtime/Dockerfile @@ -153,7 +153,15 @@ RUN apt-get update && apt-get install -y \ ca-certificates \ curl \ git \ - ffmpeg && \ + ffmpeg \ + gstreamer1.0-tools \ + gstreamer1.0-plugins-base \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + gstreamer1.0-plugins-ugly \ + gstreamer1.0-nice \ + gstreamer1.0-x \ + libgstreamer1.0-0 && \ rm -rf /var/lib/apt/lists/* # Copy the binary from builder diff --git a/docker/test-agent/Dockerfile b/docker/test-agent/Dockerfile index dab4391..0caf56d 100644 --- a/docker/test-agent/Dockerfile +++ b/docker/test-agent/Dockerfile @@ -1,6 +1,16 @@ FROM rust:1.88.0-bookworm AS builder WORKDIR /build +# Install GStreamer dev packages for the desktop-gstreamer feature. +RUN apt-get update -qq && \ + apt-get install -y -qq --no-install-recommends \ + libgstreamer1.0-dev \ + libgstreamer-plugins-base1.0-dev \ + libgstreamer-plugins-bad1.0-dev \ + libnice-dev \ + > /dev/null 2>&1 && \ + rm -rf /var/lib/apt/lists/* + COPY Cargo.toml Cargo.lock ./ COPY server/ ./server/ COPY gigacode/ ./gigacode/ @@ -12,7 +22,7 @@ ENV SANDBOX_AGENT_SKIP_INSPECTOR=1 RUN --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/usr/local/cargo/git \ --mount=type=cache,target=/build/target \ - cargo build -p sandbox-agent --release && \ + cargo build -p sandbox-agent --release --features desktop-gstreamer && \ cp target/release/sandbox-agent /sandbox-agent FROM node:22-bookworm-slim @@ -26,6 +36,15 @@ RUN apt-get update -qq && \ xdotool \ imagemagick \ ffmpeg \ + gstreamer1.0-tools \ + gstreamer1.0-plugins-base \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + gstreamer1.0-plugins-ugly \ + gstreamer1.0-nice \ + gstreamer1.0-x \ + gstreamer1.0-pulseaudio \ + libxcvt0 \ x11-xserver-utils \ dbus-x11 \ xauth \ diff --git a/docs/openapi.json b/docs/openapi.json index e8c2542..3435c25 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -20,9 +20,7 @@ "paths": { "/v1/acp": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_acp_servers", "responses": { "200": { @@ -40,9 +38,7 @@ }, "/v1/acp/{server_id}": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_acp", "parameters": [ { @@ -92,9 +88,7 @@ } }, "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "post_v1_acp", "parameters": [ { @@ -204,9 +198,7 @@ } }, "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "delete_v1_acp", "parameters": [ { @@ -228,9 +220,7 @@ }, "/v1/agents": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_agents", "parameters": [ { @@ -280,9 +270,7 @@ }, "/v1/agents/{agent}": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_agent", "parameters": [ { @@ -351,9 +339,7 @@ }, "/v1/agents/{agent}/install": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "post_v1_agent_install", "parameters": [ { @@ -412,9 +398,7 @@ }, "/v1/config/mcp": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_config_mcp", "parameters": [ { @@ -460,9 +444,7 @@ } }, "put": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "put_v1_config_mcp", "parameters": [ { @@ -501,9 +483,7 @@ } }, "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "delete_v1_config_mcp", "parameters": [ { @@ -534,9 +514,7 @@ }, "/v1/config/skills": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_config_skills", "parameters": [ { @@ -582,9 +560,7 @@ } }, "put": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "put_v1_config_skills", "parameters": [ { @@ -623,9 +599,7 @@ } }, "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "delete_v1_config_skills", "parameters": [ { @@ -656,9 +630,7 @@ }, "/v1/desktop/display/info": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get desktop display information.", "description": "Performs a health-gated display query against the managed desktop and\nreturns the current display identifier and resolution.", "operationId": "get_v1_desktop_display_info", @@ -698,9 +670,7 @@ }, "/v1/desktop/keyboard/down": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Press and hold a desktop keyboard key.", "description": "Performs a health-gated `xdotool keydown` operation against the managed\ndesktop.", "operationId": "post_v1_desktop_keyboard_down", @@ -760,9 +730,7 @@ }, "/v1/desktop/keyboard/press": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Press a desktop keyboard shortcut.", "description": "Performs a health-gated `xdotool key` operation against the managed\ndesktop.", "operationId": "post_v1_desktop_keyboard_press", @@ -822,9 +790,7 @@ }, "/v1/desktop/keyboard/type": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Type desktop keyboard text.", "description": "Performs a health-gated `xdotool type` operation against the managed\ndesktop.", "operationId": "post_v1_desktop_keyboard_type", @@ -884,9 +850,7 @@ }, "/v1/desktop/keyboard/up": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Release a desktop keyboard key.", "description": "Performs a health-gated `xdotool keyup` operation against the managed\ndesktop.", "operationId": "post_v1_desktop_keyboard_up", @@ -946,9 +910,7 @@ }, "/v1/desktop/mouse/click": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Click on the desktop.", "description": "Performs a health-gated pointer move and click against the managed desktop\nand returns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_click", @@ -1008,9 +970,7 @@ }, "/v1/desktop/mouse/down": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Press and hold a desktop mouse button.", "description": "Performs a health-gated optional pointer move followed by `xdotool mousedown`\nand returns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_down", @@ -1070,9 +1030,7 @@ }, "/v1/desktop/mouse/drag": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Drag the desktop mouse.", "description": "Performs a health-gated drag gesture against the managed desktop and\nreturns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_drag", @@ -1132,9 +1090,7 @@ }, "/v1/desktop/mouse/move": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Move the desktop mouse.", "description": "Performs a health-gated absolute pointer move on the managed desktop and\nreturns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_move", @@ -1194,9 +1150,7 @@ }, "/v1/desktop/mouse/position": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get the current desktop mouse position.", "description": "Performs a health-gated mouse position query against the managed desktop.", "operationId": "get_v1_desktop_mouse_position", @@ -1236,9 +1190,7 @@ }, "/v1/desktop/mouse/scroll": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Scroll the desktop mouse wheel.", "description": "Performs a health-gated scroll gesture at the requested coordinates and\nreturns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_scroll", @@ -1298,9 +1250,7 @@ }, "/v1/desktop/mouse/up": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Release a desktop mouse button.", "description": "Performs a health-gated optional pointer move followed by `xdotool mouseup`\nand returns the resulting mouse position.", "operationId": "post_v1_desktop_mouse_up", @@ -1360,9 +1310,7 @@ }, "/v1/desktop/recording/start": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Start desktop recording.", "description": "Starts an ffmpeg x11grab recording against the managed desktop and returns\nthe created recording metadata.", "operationId": "post_v1_desktop_recording_start", @@ -1412,9 +1360,7 @@ }, "/v1/desktop/recording/stop": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Stop desktop recording.", "description": "Stops the active desktop recording and returns the finalized recording\nmetadata.", "operationId": "post_v1_desktop_recording_stop", @@ -1454,9 +1400,7 @@ }, "/v1/desktop/recordings": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "List desktop recordings.", "description": "Returns the current desktop recording catalog.", "operationId": "get_v1_desktop_recordings", @@ -1486,9 +1430,7 @@ }, "/v1/desktop/recordings/{id}": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get desktop recording metadata.", "description": "Returns metadata for a single desktop recording.", "operationId": "get_v1_desktop_recording", @@ -1527,9 +1469,7 @@ } }, "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Delete a desktop recording.", "description": "Removes a completed desktop recording and its file from disk.", "operationId": "delete_v1_desktop_recording", @@ -1573,9 +1513,7 @@ }, "/v1/desktop/recordings/{id}/download": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Download a desktop recording.", "description": "Serves the recorded MP4 bytes for a completed desktop recording.", "operationId": "get_v1_desktop_recording_download", @@ -1609,9 +1547,7 @@ }, "/v1/desktop/screenshot": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Capture a full desktop screenshot.", "description": "Performs a health-gated full-frame screenshot of the managed desktop and\nreturns the requested image bytes.", "operationId": "get_v1_desktop_screenshot", @@ -1690,9 +1626,7 @@ }, "/v1/desktop/screenshot/region": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Capture a desktop screenshot region.", "description": "Performs a health-gated screenshot crop against the managed desktop and\nreturns the requested region image bytes.", "operationId": "get_v1_desktop_screenshot_region", @@ -1809,9 +1743,7 @@ }, "/v1/desktop/start": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Start the private desktop runtime.", "description": "Lazily launches the managed Xvfb/openbox stack, validates display health,\nand returns the resulting desktop status snapshot.", "operationId": "post_v1_desktop_start", @@ -1881,9 +1813,7 @@ }, "/v1/desktop/status": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get desktop runtime status.", "description": "Returns the current desktop runtime state, dependency status, active\ndisplay metadata, and supervised process information.", "operationId": "get_v1_desktop_status", @@ -1913,9 +1843,7 @@ }, "/v1/desktop/stop": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Stop the private desktop runtime.", "description": "Terminates the managed openbox/Xvfb/dbus processes owned by the desktop\nruntime and returns the resulting status snapshot.", "operationId": "post_v1_desktop_stop", @@ -1945,9 +1873,7 @@ }, "/v1/desktop/stream/start": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Start desktop streaming.", "description": "Enables desktop websocket streaming for the managed desktop.", "operationId": "post_v1_desktop_stream_start", @@ -1967,9 +1893,7 @@ }, "/v1/desktop/stream/stop": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Stop desktop streaming.", "description": "Disables desktop websocket streaming for the managed desktop.", "operationId": "post_v1_desktop_stream_stop", @@ -1989,11 +1913,9 @@ }, "/v1/desktop/stream/ws": { "get": { - "tags": [ - "v1" - ], - "summary": "Open a desktop websocket streaming session.", - "description": "Upgrades the connection to a websocket that streams JPEG desktop frames and\naccepts mouse and keyboard control frames.", + "tags": ["v1"], + "summary": "Open a desktop WebRTC signaling session.", + "description": "Upgrades the connection to a WebSocket used for WebRTC signaling between\nthe browser client and the desktop streaming process. Also accepts mouse\nand keyboard input frames as a fallback transport.", "operationId": "get_v1_desktop_stream_ws", "parameters": [ { @@ -2036,9 +1958,7 @@ }, "/v1/desktop/windows": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "List visible desktop windows.", "description": "Performs a health-gated visible-window enumeration against the managed\ndesktop and returns the current window metadata.", "operationId": "get_v1_desktop_windows", @@ -2078,9 +1998,7 @@ }, "/v1/fs/entries": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_fs_entries", "parameters": [ { @@ -2113,9 +2031,7 @@ }, "/v1/fs/entry": { "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "delete_v1_fs_entry", "parameters": [ { @@ -2154,9 +2070,7 @@ }, "/v1/fs/file": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_fs_file", "parameters": [ { @@ -2176,9 +2090,7 @@ } }, "put": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "put_v1_fs_file", "parameters": [ { @@ -2218,9 +2130,7 @@ }, "/v1/fs/mkdir": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "post_v1_fs_mkdir", "parameters": [ { @@ -2249,9 +2159,7 @@ }, "/v1/fs/move": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "post_v1_fs_move", "requestBody": { "content": { @@ -2279,9 +2187,7 @@ }, "/v1/fs/stat": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_fs_stat", "parameters": [ { @@ -2310,9 +2216,7 @@ }, "/v1/fs/upload-batch": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "post_v1_fs_upload_batch", "parameters": [ { @@ -2353,9 +2257,7 @@ }, "/v1/health": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "operationId": "get_v1_health", "responses": { "200": { @@ -2373,9 +2275,7 @@ }, "/v1/processes": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "List all managed processes.", "description": "Returns a list of all processes (running and exited) currently tracked\nby the runtime, sorted by process ID.", "operationId": "get_v1_processes", @@ -2418,9 +2318,7 @@ } }, "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Create a long-lived managed process.", "description": "Spawns a new process with the given command and arguments. Supports both\npipe-based and PTY (tty) modes. Returns the process descriptor on success.", "operationId": "post_v1_processes", @@ -2480,9 +2378,7 @@ }, "/v1/processes/config": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get process runtime configuration.", "description": "Returns the current runtime configuration for the process management API,\nincluding limits for concurrency, timeouts, and buffer sizes.", "operationId": "get_v1_processes_config", @@ -2510,9 +2406,7 @@ } }, "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Update process runtime configuration.", "description": "Replaces the runtime configuration for the process management API.\nValidates that all values are non-zero and clamps default timeout to max.", "operationId": "post_v1_processes_config", @@ -2562,9 +2456,7 @@ }, "/v1/processes/run": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Run a one-shot command.", "description": "Executes a command to completion and returns its stdout, stderr, exit code,\nand duration. Supports configurable timeout and output size limits.", "operationId": "post_v1_processes_run", @@ -2614,9 +2506,7 @@ }, "/v1/processes/{id}": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Get a single process by ID.", "description": "Returns the current state of a managed process including its status,\nPID, exit code, and creation/exit timestamps.", "operationId": "get_v1_process", @@ -2665,9 +2555,7 @@ } }, "delete": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Delete a process record.", "description": "Removes a stopped process from the runtime. Returns 409 if the process\nis still running; stop or kill it first.", "operationId": "delete_v1_process", @@ -2721,9 +2609,7 @@ }, "/v1/processes/{id}/input": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Write input to a process.", "description": "Sends data to a process's stdin (pipe mode) or PTY writer (tty mode).\nData can be encoded as base64, utf8, or text. Returns 413 if the decoded\npayload exceeds the configured `maxInputBytesPerRequest` limit.", "operationId": "post_v1_process_input", @@ -2804,9 +2690,7 @@ }, "/v1/processes/{id}/kill": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Send SIGKILL to a process.", "description": "Sends SIGKILL to the process and optionally waits up to `waitMs`\nmilliseconds for the process to exit before returning.", "operationId": "post_v1_process_kill", @@ -2869,9 +2753,7 @@ }, "/v1/processes/{id}/logs": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Fetch process logs.", "description": "Returns buffered log entries for a process. Supports filtering by stream\ntype, tail count, and sequence-based resumption. When `follow=true`,\nreturns an SSE stream that replays buffered entries then streams live output.", "operationId": "get_v1_process_logs", @@ -2969,9 +2851,7 @@ }, "/v1/processes/{id}/stop": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Send SIGTERM to a process.", "description": "Sends SIGTERM to the process and optionally waits up to `waitMs`\nmilliseconds for the process to exit before returning.", "operationId": "post_v1_process_stop", @@ -3034,9 +2914,7 @@ }, "/v1/processes/{id}/terminal/resize": { "post": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Resize a process terminal.", "description": "Sets the PTY window size (columns and rows) for a tty-mode process and\nsends SIGWINCH so the child process can adapt.", "operationId": "post_v1_process_terminal_resize", @@ -3117,9 +2995,7 @@ }, "/v1/processes/{id}/terminal/ws": { "get": { - "tags": [ - "v1" - ], + "tags": ["v1"], "summary": "Open an interactive WebSocket terminal session.", "description": "Upgrades the connection to a WebSocket for bidirectional PTY I/O. Accepts\n`access_token` query param for browser-based auth (WebSocket API cannot\nsend custom headers). Streams raw PTY output as binary frames and accepts\nJSON control frames for input, resize, and close.", "operationId": "get_v1_process_terminal_ws", @@ -3196,9 +3072,7 @@ "schemas": { "AcpEnvelope": { "type": "object", - "required": [ - "jsonrpc" - ], + "required": ["jsonrpc"], "properties": { "error": { "nullable": true @@ -3232,11 +3106,7 @@ }, "AcpServerInfo": { "type": "object", - "required": [ - "serverId", - "agent", - "createdAtMs" - ], + "required": ["serverId", "agent", "createdAtMs"], "properties": { "agent": { "type": "string" @@ -3252,9 +3122,7 @@ }, "AcpServerListResponse": { "type": "object", - "required": [ - "servers" - ], + "required": ["servers"], "properties": { "servers": { "type": "array", @@ -3345,12 +3213,7 @@ }, "AgentInfo": { "type": "object", - "required": [ - "id", - "installed", - "credentialsAvailable", - "capabilities" - ], + "required": ["id", "installed", "credentialsAvailable", "capabilities"], "properties": { "capabilities": { "$ref": "#/components/schemas/AgentCapabilities" @@ -3393,11 +3256,7 @@ }, "AgentInstallArtifact": { "type": "object", - "required": [ - "kind", - "path", - "source" - ], + "required": ["kind", "path", "source"], "properties": { "kind": { "type": "string" @@ -3433,10 +3292,7 @@ }, "AgentInstallResponse": { "type": "object", - "required": [ - "already_installed", - "artifacts" - ], + "required": ["already_installed", "artifacts"], "properties": { "already_installed": { "type": "boolean" @@ -3451,9 +3307,7 @@ }, "AgentListResponse": { "type": "object", - "required": [ - "agents" - ], + "required": ["agents"], "properties": { "agents": { "type": "array", @@ -3465,9 +3319,7 @@ }, "DesktopActionResponse": { "type": "object", - "required": [ - "ok" - ], + "required": ["ok"], "properties": { "ok": { "type": "boolean" @@ -3476,10 +3328,7 @@ }, "DesktopDisplayInfoResponse": { "type": "object", - "required": [ - "display", - "resolution" - ], + "required": ["display", "resolution"], "properties": { "display": { "type": "string" @@ -3491,10 +3340,7 @@ }, "DesktopErrorInfo": { "type": "object", - "required": [ - "code", - "message" - ], + "required": ["code", "message"], "properties": { "code": { "type": "string" @@ -3527,9 +3373,7 @@ }, "DesktopKeyboardDownRequest": { "type": "object", - "required": [ - "key" - ], + "required": ["key"], "properties": { "key": { "type": "string" @@ -3538,9 +3382,7 @@ }, "DesktopKeyboardPressRequest": { "type": "object", - "required": [ - "key" - ], + "required": ["key"], "properties": { "key": { "type": "string" @@ -3557,9 +3399,7 @@ }, "DesktopKeyboardTypeRequest": { "type": "object", - "required": [ - "text" - ], + "required": ["text"], "properties": { "delayMs": { "type": "integer", @@ -3574,9 +3414,7 @@ }, "DesktopKeyboardUpRequest": { "type": "object", - "required": [ - "key" - ], + "required": ["key"], "properties": { "key": { "type": "string" @@ -3585,18 +3423,11 @@ }, "DesktopMouseButton": { "type": "string", - "enum": [ - "left", - "middle", - "right" - ] + "enum": ["left", "middle", "right"] }, "DesktopMouseClickRequest": { "type": "object", - "required": [ - "x", - "y" - ], + "required": ["x", "y"], "properties": { "button": { "allOf": [ @@ -3647,12 +3478,7 @@ }, "DesktopMouseDragRequest": { "type": "object", - "required": [ - "startX", - "startY", - "endX", - "endY" - ], + "required": ["startX", "startY", "endX", "endY"], "properties": { "button": { "allOf": [ @@ -3682,10 +3508,7 @@ }, "DesktopMouseMoveRequest": { "type": "object", - "required": [ - "x", - "y" - ], + "required": ["x", "y"], "properties": { "x": { "type": "integer", @@ -3699,10 +3522,7 @@ }, "DesktopMousePositionResponse": { "type": "object", - "required": [ - "x", - "y" - ], + "required": ["x", "y"], "properties": { "screen": { "type": "integer", @@ -3725,10 +3545,7 @@ }, "DesktopMouseScrollRequest": { "type": "object", - "required": [ - "x", - "y" - ], + "required": ["x", "y"], "properties": { "deltaX": { "type": "integer", @@ -3775,10 +3592,7 @@ }, "DesktopProcessInfo": { "type": "object", - "required": [ - "name", - "running" - ], + "required": ["name", "running"], "properties": { "logPath": { "type": "string", @@ -3800,13 +3614,7 @@ }, "DesktopRecordingInfo": { "type": "object", - "required": [ - "id", - "status", - "fileName", - "bytes", - "startedAt" - ], + "required": ["id", "status", "fileName", "bytes", "startedAt"], "properties": { "bytes": { "type": "integer", @@ -3837,9 +3645,7 @@ }, "DesktopRecordingListResponse": { "type": "object", - "required": [ - "recordings" - ], + "required": ["recordings"], "properties": { "recordings": { "type": "array", @@ -3862,20 +3668,11 @@ }, "DesktopRecordingStatus": { "type": "string", - "enum": [ - "recording", - "completed", - "failed" - ] + "enum": ["recording", "completed", "failed"] }, "DesktopRegionScreenshotQuery": { "type": "object", - "required": [ - "x", - "y", - "width", - "height" - ], + "required": ["x", "y", "width", "height"], "properties": { "format": { "allOf": [ @@ -3918,10 +3715,7 @@ }, "DesktopResolution": { "type": "object", - "required": [ - "width", - "height" - ], + "required": ["width", "height"], "properties": { "dpi": { "type": "integer", @@ -3943,11 +3737,7 @@ }, "DesktopScreenshotFormat": { "type": "string", - "enum": [ - "png", - "jpeg", - "webp" - ] + "enum": ["png", "jpeg", "webp"] }, "DesktopScreenshotQuery": { "type": "object", @@ -3998,20 +3788,11 @@ }, "DesktopState": { "type": "string", - "enum": [ - "inactive", - "install_required", - "starting", - "active", - "stopping", - "failed" - ] + "enum": ["inactive", "install_required", "starting", "active", "stopping", "failed"] }, "DesktopStatusResponse": { "type": "object", - "required": [ - "state" - ], + "required": ["state"], "properties": { "display": { "type": "string", @@ -4064,9 +3845,7 @@ }, "DesktopStreamStatusResponse": { "type": "object", - "required": [ - "active" - ], + "required": ["active"], "properties": { "active": { "type": "boolean" @@ -4075,15 +3854,7 @@ }, "DesktopWindowInfo": { "type": "object", - "required": [ - "id", - "title", - "x", - "y", - "width", - "height", - "isActive" - ], + "required": ["id", "title", "x", "y", "width", "height", "isActive"], "properties": { "height": { "type": "integer", @@ -4116,9 +3887,7 @@ }, "DesktopWindowListResponse": { "type": "object", - "required": [ - "windows" - ], + "required": ["windows"], "properties": { "windows": { "type": "array", @@ -4151,9 +3920,7 @@ }, "FsActionResponse": { "type": "object", - "required": [ - "path" - ], + "required": ["path"], "properties": { "path": { "type": "string" @@ -4162,9 +3929,7 @@ }, "FsDeleteQuery": { "type": "object", - "required": [ - "path" - ], + "required": ["path"], "properties": { "path": { "type": "string" @@ -4186,12 +3951,7 @@ }, "FsEntry": { "type": "object", - "required": [ - "name", - "path", - "entryType", - "size" - ], + "required": ["name", "path", "entryType", "size"], "properties": { "entryType": { "$ref": "#/components/schemas/FsEntryType" @@ -4215,17 +3975,11 @@ }, "FsEntryType": { "type": "string", - "enum": [ - "file", - "directory" - ] + "enum": ["file", "directory"] }, "FsMoveRequest": { "type": "object", - "required": [ - "from", - "to" - ], + "required": ["from", "to"], "properties": { "from": { "type": "string" @@ -4241,10 +3995,7 @@ }, "FsMoveResponse": { "type": "object", - "required": [ - "from", - "to" - ], + "required": ["from", "to"], "properties": { "from": { "type": "string" @@ -4256,9 +4007,7 @@ }, "FsPathQuery": { "type": "object", - "required": [ - "path" - ], + "required": ["path"], "properties": { "path": { "type": "string" @@ -4267,11 +4016,7 @@ }, "FsStat": { "type": "object", - "required": [ - "path", - "entryType", - "size" - ], + "required": ["path", "entryType", "size"], "properties": { "entryType": { "$ref": "#/components/schemas/FsEntryType" @@ -4301,10 +4046,7 @@ }, "FsUploadBatchResponse": { "type": "object", - "required": [ - "paths", - "truncated" - ], + "required": ["paths", "truncated"], "properties": { "paths": { "type": "array", @@ -4319,10 +4061,7 @@ }, "FsWriteResponse": { "type": "object", - "required": [ - "path", - "bytesWritten" - ], + "required": ["path", "bytesWritten"], "properties": { "bytesWritten": { "type": "integer", @@ -4336,9 +4075,7 @@ }, "HealthResponse": { "type": "object", - "required": [ - "status" - ], + "required": ["status"], "properties": { "status": { "type": "string" @@ -4347,10 +4084,7 @@ }, "McpConfigQuery": { "type": "object", - "required": [ - "directory", - "mcpName" - ], + "required": ["directory", "mcpName"], "properties": { "directory": { "type": "string" @@ -4364,10 +4098,7 @@ "oneOf": [ { "type": "object", - "required": [ - "command", - "type" - ], + "required": ["command", "type"], "properties": { "args": { "type": "array", @@ -4401,18 +4132,13 @@ }, "type": { "type": "string", - "enum": [ - "local" - ] + "enum": ["local"] } } }, { "type": "object", - "required": [ - "url", - "type" - ], + "required": ["url", "type"], "properties": { "bearerTokenEnvVar": { "type": "string", @@ -4460,9 +4186,7 @@ }, "type": { "type": "string", - "enum": [ - "remote" - ] + "enum": ["remote"] }, "url": { "type": "string" @@ -4476,11 +4200,7 @@ }, "ProblemDetails": { "type": "object", - "required": [ - "type", - "title", - "status" - ], + "required": ["type", "title", "status"], "properties": { "detail": { "type": "string", @@ -4506,14 +4226,7 @@ }, "ProcessConfig": { "type": "object", - "required": [ - "maxConcurrentProcesses", - "defaultRunTimeoutMs", - "maxRunTimeoutMs", - "maxOutputBytes", - "maxLogBytesPerProcess", - "maxInputBytesPerRequest" - ], + "required": ["maxConcurrentProcesses", "defaultRunTimeoutMs", "maxRunTimeoutMs", "maxOutputBytes", "maxLogBytesPerProcess", "maxInputBytesPerRequest"], "properties": { "defaultRunTimeoutMs": { "type": "integer", @@ -4545,9 +4258,7 @@ }, "ProcessCreateRequest": { "type": "object", - "required": [ - "command" - ], + "required": ["command"], "properties": { "args": { "type": "array", @@ -4578,16 +4289,7 @@ }, "ProcessInfo": { "type": "object", - "required": [ - "id", - "command", - "args", - "tty", - "interactive", - "owner", - "status", - "createdAtMs" - ], + "required": ["id", "command", "args", "tty", "interactive", "owner", "status", "createdAtMs"], "properties": { "args": { "type": "array", @@ -4641,9 +4343,7 @@ }, "ProcessInputRequest": { "type": "object", - "required": [ - "data" - ], + "required": ["data"], "properties": { "data": { "type": "string" @@ -4656,9 +4356,7 @@ }, "ProcessInputResponse": { "type": "object", - "required": [ - "bytesWritten" - ], + "required": ["bytesWritten"], "properties": { "bytesWritten": { "type": "integer", @@ -4681,9 +4379,7 @@ }, "ProcessListResponse": { "type": "object", - "required": [ - "processes" - ], + "required": ["processes"], "properties": { "processes": { "type": "array", @@ -4695,13 +4391,7 @@ }, "ProcessLogEntry": { "type": "object", - "required": [ - "sequence", - "stream", - "timestampMs", - "data", - "encoding" - ], + "required": ["sequence", "stream", "timestampMs", "data", "encoding"], "properties": { "data": { "type": "string" @@ -4753,11 +4443,7 @@ }, "ProcessLogsResponse": { "type": "object", - "required": [ - "processId", - "stream", - "entries" - ], + "required": ["processId", "stream", "entries"], "properties": { "entries": { "type": "array", @@ -4775,26 +4461,15 @@ }, "ProcessLogsStream": { "type": "string", - "enum": [ - "stdout", - "stderr", - "combined", - "pty" - ] + "enum": ["stdout", "stderr", "combined", "pty"] }, "ProcessOwner": { "type": "string", - "enum": [ - "user", - "desktop", - "system" - ] + "enum": ["user", "desktop", "system"] }, "ProcessRunRequest": { "type": "object", - "required": [ - "command" - ], + "required": ["command"], "properties": { "args": { "type": "array", @@ -4830,14 +4505,7 @@ }, "ProcessRunResponse": { "type": "object", - "required": [ - "timedOut", - "stdout", - "stderr", - "stdoutTruncated", - "stderrTruncated", - "durationMs" - ], + "required": ["timedOut", "stdout", "stderr", "stdoutTruncated", "stderrTruncated", "durationMs"], "properties": { "durationMs": { "type": "integer", @@ -4879,17 +4547,11 @@ }, "ProcessState": { "type": "string", - "enum": [ - "running", - "exited" - ] + "enum": ["running", "exited"] }, "ProcessTerminalResizeRequest": { "type": "object", - "required": [ - "cols", - "rows" - ], + "required": ["cols", "rows"], "properties": { "cols": { "type": "integer", @@ -4905,10 +4567,7 @@ }, "ProcessTerminalResizeResponse": { "type": "object", - "required": [ - "cols", - "rows" - ], + "required": ["cols", "rows"], "properties": { "cols": { "type": "integer", @@ -4924,16 +4583,11 @@ }, "ServerStatus": { "type": "string", - "enum": [ - "running", - "stopped" - ] + "enum": ["running", "stopped"] }, "ServerStatusInfo": { "type": "object", - "required": [ - "status" - ], + "required": ["status"], "properties": { "status": { "$ref": "#/components/schemas/ServerStatus" @@ -4948,10 +4602,7 @@ }, "SkillSource": { "type": "object", - "required": [ - "type", - "source" - ], + "required": ["type", "source"], "properties": { "ref": { "type": "string", @@ -4978,9 +4629,7 @@ }, "SkillsConfig": { "type": "object", - "required": [ - "sources" - ], + "required": ["sources"], "properties": { "sources": { "type": "array", @@ -4992,10 +4641,7 @@ }, "SkillsConfigQuery": { "type": "object", - "required": [ - "directory", - "skillName" - ], + "required": ["directory", "skillName"], "properties": { "directory": { "type": "string" @@ -5013,4 +4659,4 @@ "description": "ACP proxy v1 API" } ] -} \ No newline at end of file +} diff --git a/frontend/packages/inspector/src/components/debug/DebugPanel.tsx b/frontend/packages/inspector/src/components/debug/DebugPanel.tsx index 5398fe6..947d766 100644 --- a/frontend/packages/inspector/src/components/debug/DebugPanel.tsx +++ b/frontend/packages/inspector/src/components/debug/DebugPanel.tsx @@ -80,10 +80,6 @@ const DebugPanel = ({ Desktop - + - + {isActive && !liveViewActive && ( + + )} - {error &&
{error}
} {screenshotError &&
{screenshotError}
} - + {/* ========== Runtime Section ========== */}
Desktop Runtime - + {status?.state ?? "unknown"}
-
Display
@@ -244,65 +315,35 @@ const DesktopTab = ({
{formatStartedAt(status?.startedAt)}
-
- setWidth(event.target.value)} - inputMode="numeric" - /> + setWidth(event.target.value)} inputMode="numeric" />
- setHeight(event.target.value)} - inputMode="numeric" - /> + setHeight(event.target.value)} inputMode="numeric" />
- setDpi(event.target.value)} - inputMode="numeric" - /> + setDpi(event.target.value)} inputMode="numeric" />
-
- - + {isActive ? ( + + ) : ( + + )}
- + {/* ========== Missing Dependencies ========== */} {status?.missingDependencies && status.missingDependencies.length > 0 && (
@@ -310,18 +351,188 @@ const DesktopTab = ({
{status.missingDependencies.map((dependency) => ( - {dependency} + + {dependency} + ))}
{status.installCommand && ( <> -
Install command
+
+ Install command +
{status.installCommand}
)}
)} - + {/* ========== Live View Section ========== */} +
+
+ + + {isActive && ( + + )} +
+ {liveViewError && ( +
+ {liveViewError} +
+ )} + {!isActive &&
Start the desktop runtime to enable live view.
} + {isActive && liveViewActive && } + {isActive && !liveViewActive && ( + <> + {screenshotUrl ? ( +
+ Desktop screenshot +
+ ) : ( +
Click "Start Stream" for live desktop view, or use the Screenshot button above.
+ )} + + )} +
+ {/* ========== Recording Section ========== */} +
+
+ + + Recording + + {activeRecording && Recording} +
+ {recordingError && ( +
+ {recordingError} +
+ )} + {!isActive &&
Start the desktop runtime to enable recording.
} + {isActive && ( + <> +
+
+ + setRecordingFps(e.target.value)} + inputMode="numeric" + style={{ maxWidth: 80 }} + disabled={!!activeRecording} + /> +
+
+
+ {!activeRecording ? ( + + ) : ( + + )} + +
+ {recordings.length > 0 && ( +
+ {recordings.map((rec) => ( +
+
+
+ + {rec.fileName} + + + {rec.status} + +
+ {rec.status === "completed" && ( +
+ + +
+ )} +
+
+ {formatBytes(rec.bytes)} + {" \u00b7 "} + {formatDuration(rec.startedAt, rec.endedAt)} + {" \u00b7 "} + {formatStartedAt(rec.startedAt)} +
+
+ ))} +
+ )} + {recordings.length === 0 && !recordingLoading && ( +
+ No recordings yet. Click "Start Recording" to begin. +
+ )} + + )} +
+ {/* ========== Diagnostics Section ========== */} {(status?.lastError || status?.runtimeLogPath || (status?.processes?.length ?? 0) > 0) && (
@@ -352,9 +563,7 @@ const DesktopTab = ({ {process.running ? "running" : "stopped"}
-
- {process.pid ? `pid ${process.pid}` : "no pid"} -
+
{process.pid ? `pid ${process.pid}` : "no pid"}
{process.logPath &&
{process.logPath}
}
))} @@ -363,31 +572,7 @@ const DesktopTab = ({ )} )} - -
-
- Latest Screenshot - {status?.state === "active" ? ( - Manual refresh only - ) : null} -
- - {loading ?
Loading...
: null} - {!loading && !screenshotUrl && ( -
- {status?.state === "active" - ? "No screenshot loaded yet." - : "Start the desktop runtime to capture a screenshot."} -
- )} - {screenshotUrl && ( -
- Desktop screenshot -
- )} -
); }; - export default DesktopTab; diff --git a/frontend/packages/inspector/vite.config.ts b/frontend/packages/inspector/vite.config.ts index 6496813..d398b20 100644 --- a/frontend/packages/inspector/vite.config.ts +++ b/frontend/packages/inspector/vite.config.ts @@ -8,7 +8,7 @@ export default defineConfig(({ command }) => ({ port: 5173, proxy: { "/v1": { - target: "http://localhost:2468", + target: process.env.SANDBOX_AGENT_URL || "http://localhost:2468", changeOrigin: true, ws: true, }, diff --git a/justfile b/justfile index 84b761f..c23c751 100644 --- a/justfile +++ b/justfile @@ -76,6 +76,26 @@ run-gigacode *ARGS: dev-docs: cd docs && pnpm dlx mintlify dev --host 0.0.0.0 +# Start the desktop dev stack (sandbox-agent backend in Docker + inspector frontend) +[group('server')] +server-dev: + docker compose -f server/compose.dev.yaml up --build --force-recreate -d + +# Stop the desktop dev stack +[group('server')] +server-dev-down: + docker compose -f server/compose.dev.yaml down + +# Tail desktop dev stack logs +[group('server')] +server-dev-logs *ARGS: + docker compose -f server/compose.dev.yaml logs -f --tail=200 {{ ARGS }} + +# Rebuild and restart only the backend container +[group('server')] +server-dev-restart-backend: + docker compose -f server/compose.dev.yaml up --build --force-recreate -d backend + install: pnpm install pnpm build --filter @sandbox-agent/inspector... diff --git a/sdks/react/src/DesktopViewer.tsx b/sdks/react/src/DesktopViewer.tsx index b807e42..4c0c440 100644 --- a/sdks/react/src/DesktopViewer.tsx +++ b/sdks/react/src/DesktopViewer.tsx @@ -2,26 +2,19 @@ import type { CSSProperties, MouseEvent, WheelEvent } from "react"; import { useEffect, useRef, useState } from "react"; -import type { - DesktopMouseButton, - DesktopStreamErrorStatus, - DesktopStreamReadyStatus, - SandboxAgent, -} from "sandbox-agent"; +import type { DesktopMouseButton, DesktopStreamErrorStatus, DesktopStreamReadyStatus, DesktopStreamSession, SandboxAgent } from "sandbox-agent"; type ConnectionState = "connecting" | "ready" | "closed" | "error"; -export type DesktopViewerClient = Pick< - SandboxAgent, - "startDesktopStream" | "stopDesktopStream" | "connectDesktopStream" ->; +export type DesktopViewerClient = Pick; export interface DesktopViewerProps { client: DesktopViewerClient; className?: string; style?: CSSProperties; - imageStyle?: CSSProperties; - height?: number | string; + autoStart?: boolean; + showStatusBar?: boolean; + tabIndex?: number; onConnect?: (status: DesktopStreamReadyStatus) => void; onDisconnect?: () => void; onError?: (error: DesktopStreamErrorStatus | Error) => void; @@ -31,11 +24,7 @@ const shellStyle: CSSProperties = { display: "flex", flexDirection: "column", overflow: "hidden", - border: "1px solid rgba(15, 23, 42, 0.14)", - borderRadius: 14, - background: - "linear-gradient(180deg, rgba(248, 250, 252, 0.96) 0%, rgba(226, 232, 240, 0.92) 100%)", - boxShadow: "0 20px 40px rgba(15, 23, 42, 0.08)", + width: "100%", }; const statusBarStyle: CSSProperties = { @@ -44,28 +33,22 @@ const statusBarStyle: CSSProperties = { justifyContent: "space-between", gap: 12, padding: "10px 14px", - borderBottom: "1px solid rgba(15, 23, 42, 0.08)", - background: "rgba(255, 255, 255, 0.78)", - color: "#0f172a", fontSize: 12, lineHeight: 1.4, }; const viewportStyle: CSSProperties = { position: "relative", - display: "flex", - alignItems: "center", - justifyContent: "center", + width: "100%", overflow: "hidden", - background: - "radial-gradient(circle at top, rgba(14, 165, 233, 0.18), transparent 45%), linear-gradient(180deg, #0f172a 0%, #111827 100%)", + background: "#000", + outline: "none", }; -const imageBaseStyle: CSSProperties = { +const videoBaseStyle: CSSProperties = { display: "block", width: "100%", - height: "100%", - objectFit: "contain", + height: "auto", userSelect: "none", }; @@ -90,90 +73,96 @@ export const DesktopViewer = ({ client, className, style, - imageStyle, - height = 480, + autoStart = true, + showStatusBar = true, + tabIndex = 0, onConnect, onDisconnect, onError, }: DesktopViewerProps) => { + const videoRef = useRef(null); const wrapperRef = useRef(null); - const sessionRef = useRef | null>(null); - const [connectionState, setConnectionState] = useState("connecting"); - const [statusMessage, setStatusMessage] = useState("Starting desktop stream..."); - const [frameUrl, setFrameUrl] = useState(null); + const sessionRef = useRef(null); + const [connectionState, setConnectionState] = useState(autoStart ? "connecting" : "closed"); + const [statusMessage, setStatusMessage] = useState(autoStart ? "Starting desktop stream..." : "Stream not started."); const [resolution, setResolution] = useState<{ width: number; height: number } | null>(null); - useEffect(() => { - let cancelled = false; - let lastObjectUrl: string | null = null; - let session: ReturnType | null = null; + // Store callbacks and client in refs to keep them out of the effect deps. + const onConnectRef = useRef(onConnect); + onConnectRef.current = onConnect; + const onDisconnectRef = useRef(onDisconnect); + onDisconnectRef.current = onDisconnect; + const onErrorRef = useRef(onError); + onErrorRef.current = onError; + const clientRef = useRef(client); + clientRef.current = client; + useEffect(() => { + if (!autoStart) { + setConnectionState("closed"); + setStatusMessage("Stream not started."); + return; + } + + let cancelled = false; setConnectionState("connecting"); setStatusMessage("Starting desktop stream..."); setResolution(null); + const cl = clientRef.current; + const connect = async () => { try { - await client.startDesktopStream(); - if (cancelled) { - return; - } + await cl.startDesktopStream(); + if (cancelled) return; - session = client.connectDesktopStream(); + const session = cl.connectDesktopStream(); sessionRef.current = session; + session.onReady((status) => { - if (cancelled) { - return; + if (cancelled) return; + setResolution({ width: status.width, height: status.height }); + setStatusMessage("Negotiating WebRTC..."); + onConnectRef.current?.(status); + }); + + session.onTrack((stream) => { + if (cancelled) return; + if (videoRef.current) { + videoRef.current.srcObject = stream; } setConnectionState("ready"); setStatusMessage("Desktop stream connected."); - setResolution({ width: status.width, height: status.height }); - onConnect?.(status); + // Grab keyboard focus when connected. + wrapperRef.current?.focus(); }); - session.onFrame((frame) => { - if (cancelled) { - return; - } - const nextUrl = URL.createObjectURL( - new Blob([frame.slice().buffer], { type: "image/jpeg" }), - ); - setFrameUrl((current) => { - if (current) { - URL.revokeObjectURL(current); - } - return nextUrl; - }); - if (lastObjectUrl) { - URL.revokeObjectURL(lastObjectUrl); - } - lastObjectUrl = nextUrl; + + session.onConnect(() => { + if (cancelled) return; + setConnectionState("ready"); + setStatusMessage("Desktop stream connected."); + wrapperRef.current?.focus(); }); + session.onError((error) => { - if (cancelled) { - return; - } + if (cancelled) return; setConnectionState("error"); setStatusMessage(error instanceof Error ? error.message : error.message); - onError?.(error); + onErrorRef.current?.(error); }); - session.onClose(() => { - if (cancelled) { - return; - } - setConnectionState((current) => (current === "error" ? current : "closed")); - setStatusMessage((current) => - current === "Desktop stream connected." ? "Desktop stream disconnected." : current, - ); - onDisconnect?.(); + + session.onDisconnect(() => { + if (cancelled) return; + setConnectionState((cur) => (cur === "error" ? cur : "closed")); + setStatusMessage((cur) => (cur === "Desktop stream connected." ? "Desktop stream disconnected." : cur)); + onDisconnectRef.current?.(); }); } catch (error) { - if (cancelled) { - return; - } - const nextError = error instanceof Error ? error : new Error("Failed to initialize desktop stream."); + if (cancelled) return; + const nextError = error instanceof Error ? error : new Error("Failed to start desktop stream."); setConnectionState("error"); setStatusMessage(nextError.message); - onError?.(nextError); + onErrorRef.current?.(nextError); } }; @@ -181,36 +170,28 @@ export const DesktopViewer = ({ return () => { cancelled = true; - session?.close(); - sessionRef.current = null; - void client.stopDesktopStream().catch(() => undefined); - setFrameUrl((current) => { - if (current) { - URL.revokeObjectURL(current); - } - return null; - }); - if (lastObjectUrl) { - URL.revokeObjectURL(lastObjectUrl); + const session = sessionRef.current; + if (session) { + session.close(); + sessionRef.current = null; } + if (videoRef.current) { + videoRef.current.srcObject = null; + } + // Note: we do NOT call stopDesktopStream() here. The parent component + // manages the stream lifecycle. Calling stop on unmount would kill the + // streaming process and race with subsequent mounts. }; - }, [client, onConnect, onDisconnect, onError]); + }, [autoStart]); const scalePoint = (clientX: number, clientY: number) => { - const wrapper = wrapperRef.current; - if (!wrapper || !resolution) { - return null; - } - const rect = wrapper.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) { - return null; - } + const video = videoRef.current; + if (!video || !resolution) return null; + const rect = video.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return null; const x = Math.max(0, Math.min(resolution.width, ((clientX - rect.left) / rect.width) * resolution.width)); const y = Math.max(0, Math.min(resolution.height, ((clientY - rect.top) / rect.height) * resolution.height)); - return { - x: Math.round(x), - y: Math.round(y), - }; + return { x: Math.round(x), y: Math.round(y) }; }; const buttonFromMouseEvent = (event: MouseEvent): DesktopMouseButton => { @@ -224,64 +205,60 @@ export const DesktopViewer = ({ } }; - const withSession = ( - callback: (session: NonNullable>) => void, - ) => { - const session = sessionRef.current; - if (session) { - callback(session); - } + const withSession = (fn: (s: DesktopStreamSession) => void) => { + const s = sessionRef.current; + if (s) fn(s); }; return (
-
- {statusMessage} - - {resolution ? `${resolution.width}×${resolution.height}` : "Awaiting frames"} - -
+ {showStatusBar && ( +
+ {statusMessage} + {resolution ? `${resolution.width}\u00d7${resolution.height}` : "Awaiting stream"} +
+ )}
{ const point = scalePoint(event.clientX, event.clientY); - if (!point) { - return; - } - withSession((session) => session.moveMouse(point.x, point.y)); + if (!point) return; + withSession((s) => s.moveMouse(point.x, point.y)); }} onMouseDown={(event) => { event.preventDefault(); + // Ensure keyboard focus stays on the viewport when clicking. + wrapperRef.current?.focus(); const point = scalePoint(event.clientX, event.clientY); - withSession((session) => - session.mouseDown(buttonFromMouseEvent(event), point?.x, point?.y), - ); + if (!point) return; + withSession((s) => s.mouseDown(buttonFromMouseEvent(event), point.x, point.y)); }} onMouseUp={(event) => { const point = scalePoint(event.clientX, event.clientY); - withSession((session) => session.mouseUp(buttonFromMouseEvent(event), point?.x, point?.y)); + if (!point) return; + withSession((s) => s.mouseUp(buttonFromMouseEvent(event), point.x, point.y)); }} onWheel={(event: WheelEvent) => { event.preventDefault(); const point = scalePoint(event.clientX, event.clientY); - if (!point) { - return; - } - withSession((session) => session.scroll(point.x, point.y, Math.round(event.deltaX), Math.round(event.deltaY))); + if (!point) return; + withSession((s) => s.scroll(point.x, point.y, Math.round(event.deltaX), Math.round(event.deltaY))); }} onKeyDown={(event) => { - withSession((session) => session.keyDown(event.key)); + event.preventDefault(); + event.stopPropagation(); + withSession((s) => s.keyDown(event.key)); }} onKeyUp={(event) => { - withSession((session) => session.keyUp(event.key)); + event.stopPropagation(); + withSession((s) => s.keyUp(event.key)); }} + onContextMenu={(event) => event.preventDefault()} > - {frameUrl ? ( - Desktop stream - ) : null} +
); diff --git a/sdks/typescript/src/client.ts b/sdks/typescript/src/client.ts index b2e3590..a635819 100644 --- a/sdks/typescript/src/client.ts +++ b/sdks/typescript/src/client.ts @@ -23,10 +23,7 @@ import { type SetSessionModeRequest, } from "acp-http-client"; import type { SandboxAgentSpawnHandle, SandboxAgentSpawnOptions } from "./spawn.ts"; -import { - DesktopStreamSession, - type DesktopStreamConnectOptions, -} from "./desktop-stream.ts"; +import { DesktopStreamSession, type DesktopStreamConnectOptions } from "./desktop-stream.ts"; import { type AcpServerListResponse, type AgentInfo, @@ -1530,9 +1527,7 @@ export class SandboxAgent { return this.requestJson("GET", `${API_PREFIX}/desktop/windows`); } - async startDesktopRecording( - request: DesktopRecordingStartRequest = {}, - ): Promise { + async startDesktopRecording(request: DesktopRecordingStartRequest = {}): Promise { return this.requestJson("POST", `${API_PREFIX}/desktop/recording/start`, { body: request, }); @@ -1551,13 +1546,9 @@ export class SandboxAgent { } async downloadDesktopRecording(id: string): Promise { - const response = await this.requestRaw( - "GET", - `${API_PREFIX}/desktop/recordings/${encodeURIComponent(id)}/download`, - { - accept: "video/mp4", - }, - ); + const response = await this.requestRaw("GET", `${API_PREFIX}/desktop/recordings/${encodeURIComponent(id)}/download`, { + accept: "video/mp4", + }); const buffer = await response.arrayBuffer(); return new Uint8Array(buffer); } @@ -1799,7 +1790,7 @@ export class SandboxAgent { buildDesktopStreamWebSocketUrl(options: ProcessTerminalWebSocketUrlOptions = {}): string { return toWebSocketUrl( - this.buildUrl(`${API_PREFIX}/desktop/stream/ws`, { + this.buildUrl(`${API_PREFIX}/desktop/stream/signaling`, { access_token: options.accessToken ?? this.token, }), ); @@ -1820,7 +1811,7 @@ export class SandboxAgent { } connectDesktopStream(options: DesktopStreamSessionOptions = {}): DesktopStreamSession { - return new DesktopStreamSession(this.connectDesktopStreamWebSocket(options)); + return new DesktopStreamSession(this.connectDesktopStreamWebSocket(options), options); } private async getLiveConnection(agent: string): Promise { diff --git a/sdks/typescript/src/desktop-stream.ts b/sdks/typescript/src/desktop-stream.ts index 1bbf76f..f8810de 100644 --- a/sdks/typescript/src/desktop-stream.ts +++ b/sdks/typescript/src/desktop-stream.ts @@ -1,6 +1,5 @@ import type { DesktopMouseButton } from "./types.ts"; -const WS_READY_STATE_CONNECTING = 0; const WS_READY_STATE_OPEN = 1; const WS_READY_STATE_CLOSED = 3; @@ -21,63 +20,140 @@ export interface DesktopStreamConnectOptions { accessToken?: string; WebSocket?: typeof WebSocket; protocols?: string | string[]; + RTCPeerConnection?: typeof RTCPeerConnection; + rtcConfig?: RTCConfiguration; } -type DesktopStreamClientFrame = - | { - type: "moveMouse"; - x: number; - y: number; - } - | { - type: "mouseDown" | "mouseUp"; - x?: number; - y?: number; - button?: DesktopMouseButton; - } - | { - type: "scroll"; - x: number; - y: number; - deltaX?: number; - deltaY?: number; - } - | { - type: "keyDown" | "keyUp"; - key: string; - } - | { - type: "close"; - }; +/** + * Data channel binary input protocol (Big Endian). + * + * Byte 0: opcode + * 0x01 = mouse_move (bytes 1-2: u16 BE x, bytes 3-4: u16 BE y) + * 0x02 = mouse_down (byte 1: u8 button) + * 0x03 = mouse_up (byte 1: u8 button) + * 0x04 = mouse_scroll (bytes 1-2: i16 BE dx, bytes 3-4: i16 BE dy) + * 0x05 = key_down (bytes 1-4: u32 BE keysym) + * 0x06 = key_up (bytes 1-4: u32 BE keysym) + */ +const OP_MOUSE_MOVE = 0x01; +const OP_MOUSE_DOWN = 0x02; +const OP_MOUSE_UP = 0x03; +const OP_MOUSE_SCROLL = 0x04; +const OP_KEY_DOWN = 0x05; +const OP_KEY_UP = 0x06; + +function mouseButtonToX11(button?: DesktopMouseButton): number { + switch (button) { + case "middle": + return 2; + case "right": + return 3; + default: + return 1; + } +} + +function keyToX11Keysym(key: string): number { + if (key.length === 1) { + const cp = key.charCodeAt(0); + if (cp >= 0x20 && cp <= 0x7e) return cp; + return 0x01000000 + cp; + } + + const map: Record = { + Backspace: 0xff08, + Tab: 0xff09, + Return: 0xff0d, + Enter: 0xff0d, + Escape: 0xff1b, + Delete: 0xffff, + Home: 0xff50, + Left: 0xff51, + ArrowLeft: 0xff51, + Up: 0xff52, + ArrowUp: 0xff52, + Right: 0xff53, + ArrowRight: 0xff53, + Down: 0xff54, + ArrowDown: 0xff54, + PageUp: 0xff55, + PageDown: 0xff56, + End: 0xff57, + Insert: 0xff63, + F1: 0xffbe, + F2: 0xffbf, + F3: 0xffc0, + F4: 0xffc1, + F5: 0xffc2, + F6: 0xffc3, + F7: 0xffc4, + F8: 0xffc5, + F9: 0xffc6, + F10: 0xffc7, + F11: 0xffc8, + F12: 0xffc9, + Shift: 0xffe1, + ShiftLeft: 0xffe1, + ShiftRight: 0xffe2, + Control: 0xffe3, + ControlLeft: 0xffe3, + ControlRight: 0xffe4, + Alt: 0xffe9, + AltLeft: 0xffe9, + AltRight: 0xffea, + Meta: 0xffeb, + MetaLeft: 0xffeb, + MetaRight: 0xffec, + CapsLock: 0xffe5, + NumLock: 0xff7f, + ScrollLock: 0xff14, + " ": 0x0020, + Space: 0x0020, + }; + + return map[key] ?? 0; +} export class DesktopStreamSession { readonly socket: WebSocket; readonly closed: Promise; + private pc: RTCPeerConnection | null = null; + private dataChannel: RTCDataChannel | null = null; + private mediaStream: MediaStream | null = null; + private connected = false; + private pendingCandidates: Record[] = []; + private cachedReadyStatus: DesktopStreamReadyStatus | null = null; + private readonly readyListeners = new Set<(status: DesktopStreamReadyStatus) => void>(); - private readonly frameListeners = new Set<(frame: Uint8Array) => void>(); + private readonly trackListeners = new Set<(stream: MediaStream) => void>(); + private readonly connectListeners = new Set<() => void>(); + private readonly disconnectListeners = new Set<() => void>(); private readonly errorListeners = new Set<(error: DesktopStreamErrorStatus | Error) => void>(); - private readonly closeListeners = new Set<() => void>(); - private closeSignalSent = false; private closedResolve!: () => void; + private readonly PeerConnection: typeof RTCPeerConnection; + private readonly rtcConfig: RTCConfiguration; - constructor(socket: WebSocket) { + constructor(socket: WebSocket, options: DesktopStreamConnectOptions = {}) { this.socket = socket; - this.socket.binaryType = "arraybuffer"; + this.PeerConnection = options.RTCPeerConnection ?? globalThis.RTCPeerConnection; + this.rtcConfig = options.rtcConfig ?? {}; + this.closed = new Promise((resolve) => { this.closedResolve = resolve; }); this.socket.addEventListener("message", (event) => { - void this.handleMessage(event.data); + this.handleMessage(event.data as string); }); this.socket.addEventListener("error", () => { - this.emitError(new Error("Desktop stream websocket connection failed.")); + this.emitError(new Error("Desktop stream signaling connection failed.")); }); this.socket.addEventListener("close", () => { + this.teardownPeerConnection(); this.closedResolve(); - for (const listener of this.closeListeners) { + for (const listener of this.disconnectListeners) { listener(); } }); @@ -85,15 +161,35 @@ export class DesktopStreamSession { onReady(listener: (status: DesktopStreamReadyStatus) => void): () => void { this.readyListeners.add(listener); + if (this.cachedReadyStatus) { + listener(this.cachedReadyStatus); + } return () => { this.readyListeners.delete(listener); }; } - onFrame(listener: (frame: Uint8Array) => void): () => void { - this.frameListeners.add(listener); + onTrack(listener: (stream: MediaStream) => void): () => void { + this.trackListeners.add(listener); + if (this.mediaStream) { + listener(this.mediaStream); + } return () => { - this.frameListeners.delete(listener); + this.trackListeners.delete(listener); + }; + } + + onConnect(listener: () => void): () => void { + this.connectListeners.add(listener); + return () => { + this.connectListeners.delete(listener); + }; + } + + onDisconnect(listener: () => void): () => void { + this.disconnectListeners.add(listener); + return () => { + this.disconnectListeners.delete(listener); }; } @@ -104,97 +200,313 @@ export class DesktopStreamSession { }; } + /** @deprecated Use onDisconnect instead. */ onClose(listener: () => void): () => void { - this.closeListeners.add(listener); - return () => { - this.closeListeners.delete(listener); - }; + return this.onDisconnect(listener); + } + + /** @deprecated No longer emits JPEG frames. Use onTrack for WebRTC media. */ + onFrame(_listener: (frame: Uint8Array) => void): () => void { + return () => {}; + } + + getMediaStream(): MediaStream | null { + return this.mediaStream; } moveMouse(x: number, y: number): void { - this.sendFrame({ type: "moveMouse", x, y }); + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(5); + const view = new DataView(buf); + view.setUint8(0, OP_MOUSE_MOVE); + view.setUint16(1, x, false); + view.setUint16(3, y, false); + this.dataChannel.send(buf); + } else { + this.sendSignaling("moveMouse", { x, y }); + } } mouseDown(button?: DesktopMouseButton, x?: number, y?: number): void { - this.sendFrame({ type: "mouseDown", button, x, y }); + if (x != null && y != null) { + this.moveMouse(x, y); + } + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(2); + const view = new DataView(buf); + view.setUint8(0, OP_MOUSE_DOWN); + view.setUint8(1, mouseButtonToX11(button)); + this.dataChannel.send(buf); + } else { + this.sendSignaling("mouseDown", { button, x, y }); + } } mouseUp(button?: DesktopMouseButton, x?: number, y?: number): void { - this.sendFrame({ type: "mouseUp", button, x, y }); + if (x != null && y != null) { + this.moveMouse(x, y); + } + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(2); + const view = new DataView(buf); + view.setUint8(0, OP_MOUSE_UP); + view.setUint8(1, mouseButtonToX11(button)); + this.dataChannel.send(buf); + } else { + this.sendSignaling("mouseUp", { button, x, y }); + } } scroll(x: number, y: number, deltaX?: number, deltaY?: number): void { - this.sendFrame({ type: "scroll", x, y, deltaX, deltaY }); + this.moveMouse(x, y); + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(5); + const view = new DataView(buf); + view.setUint8(0, OP_MOUSE_SCROLL); + view.setInt16(1, deltaX ?? 0, false); + view.setInt16(3, deltaY ?? 0, false); + this.dataChannel.send(buf); + } else { + this.sendSignaling("scroll", { x, y, deltaX, deltaY }); + } } keyDown(key: string): void { - this.sendFrame({ type: "keyDown", key }); + const keysym = keyToX11Keysym(key); + if (keysym === 0) return; + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(5); + const view = new DataView(buf); + view.setUint8(0, OP_KEY_DOWN); + view.setUint32(1, keysym, false); + this.dataChannel.send(buf); + } else { + this.sendSignaling("keyDown", { key }); + } } keyUp(key: string): void { - this.sendFrame({ type: "keyUp", key }); + const keysym = keyToX11Keysym(key); + if (keysym === 0) return; + if (this.dataChannel?.readyState === "open") { + const buf = new ArrayBuffer(5); + const view = new DataView(buf); + view.setUint8(0, OP_KEY_UP); + view.setUint32(1, keysym, false); + this.dataChannel.send(buf); + } else { + this.sendSignaling("keyUp", { key }); + } } close(): void { - if (this.socket.readyState === WS_READY_STATE_CONNECTING) { - this.socket.addEventListener( - "open", - () => { - this.close(); - }, - { once: true }, - ); - return; - } - - if (this.socket.readyState === WS_READY_STATE_OPEN) { - if (!this.closeSignalSent) { - this.closeSignalSent = true; - this.sendFrame({ type: "close" }); - } - this.socket.close(); - return; - } - + this.teardownPeerConnection(); if (this.socket.readyState !== WS_READY_STATE_CLOSED) { this.socket.close(); } } - private async handleMessage(data: unknown): Promise { + private handleMessage(data: string): void { + let msg: Record; try { - if (typeof data === "string") { - const frame = parseStatusFrame(data); - if (!frame) { - this.emitError(new Error("Received invalid desktop stream control frame.")); - return; - } + msg = JSON.parse(data) as Record; + } catch { + return; + } - if (frame.type === "ready") { - for (const listener of this.readyListeners) { - listener(frame); - } - return; - } + const type = (msg.type as string) ?? ""; - this.emitError(frame); - return; + switch (type) { + case "ready": { + const status: DesktopStreamReadyStatus = { + type: "ready", + width: Number(msg.width) || 0, + height: Number(msg.height) || 0, + }; + this.cachedReadyStatus = status; + for (const listener of this.readyListeners) { + listener(status); + } + break; } - const bytes = await decodeBinaryFrame(data); - for (const listener of this.frameListeners) { - listener(bytes); + case "offer": { + if (msg.sdp) { + void this.handleOffer(msg.sdp as string); + } + break; } + + case "candidate": { + void this.handleCandidate(msg as unknown as RTCIceCandidateInit); + break; + } + + case "error": { + const errorStatus: DesktopStreamErrorStatus = { + type: "error", + message: (msg.message as string) ?? "Unknown error", + }; + this.emitError(errorStatus); + break; + } + + default: + break; + } + } + + private async handleOffer(sdp: string): Promise { + try { + const config: RTCConfiguration = { + ...this.rtcConfig, + iceServers: this.rtcConfig.iceServers ?? [{ urls: "stun:stun.l.google.com:19302" }], + }; + const pc = new this.PeerConnection(config); + this.pc = pc; + + pc.ontrack = (event) => { + const stream = event.streams[0] ?? new MediaStream([event.track]); + this.mediaStream = stream; + for (const listener of this.trackListeners) { + listener(stream); + } + }; + + pc.onicecandidate = (event) => { + if (event.candidate) { + this.sendJson({ + type: "candidate", + candidate: event.candidate.candidate, + sdpMLineIndex: event.candidate.sdpMLineIndex, + sdpMid: event.candidate.sdpMid, + }); + } + }; + + pc.onconnectionstatechange = () => { + switch (pc.connectionState) { + case "connected": + if (!this.connected) { + this.connected = true; + for (const listener of this.connectListeners) { + listener(); + } + } + break; + case "closed": + case "failed": + this.emitError(new Error(`WebRTC connection ${pc.connectionState}.`)); + break; + } + }; + + pc.oniceconnectionstatechange = () => { + switch (pc.iceConnectionState) { + case "connected": + if (!this.connected) { + this.connected = true; + for (const listener of this.connectListeners) { + listener(); + } + } + break; + case "closed": + case "failed": + this.emitError(new Error(`WebRTC ICE ${pc.iceConnectionState}.`)); + break; + } + }; + + // Server creates the data channel; client receives it. + pc.ondatachannel = (event) => { + this.dataChannel = event.channel; + this.dataChannel.binaryType = "arraybuffer"; + this.dataChannel.onerror = () => { + this.emitError(new Error("WebRTC data channel error.")); + }; + this.dataChannel.onclose = () => { + this.dataChannel = null; + }; + }; + + await pc.setRemoteDescription({ type: "offer", sdp }); + + // Flush any ICE candidates that arrived before the PC was ready. + for (const pending of this.pendingCandidates) { + try { + await pc.addIceCandidate(pending as unknown as RTCIceCandidateInit); + } catch { + // ignore stale candidates + } + } + this.pendingCandidates = []; + + const answer = await pc.createAnswer(); + await pc.setLocalDescription(answer); + + this.sendJson({ type: "answer", sdp: answer.sdp }); } catch (error) { this.emitError(error instanceof Error ? error : new Error(String(error))); } } - private sendFrame(frame: DesktopStreamClientFrame): void { - if (this.socket.readyState !== WS_READY_STATE_OPEN) { + private async handleCandidate(candidate: RTCIceCandidateInit): Promise { + if (!this.pc) { + this.pendingCandidates.push(candidate as unknown as Record); return; } - this.socket.send(JSON.stringify(frame)); + try { + await this.pc.addIceCandidate(candidate); + } catch (error) { + this.emitError(error instanceof Error ? error : new Error(String(error))); + } + } + + /** Send a JSON message to the server. */ + private sendJson(msg: Record): void { + if (this.socket.readyState !== WS_READY_STATE_OPEN) return; + this.socket.send(JSON.stringify(msg)); + } + + /** Send a typed input message over the signaling WebSocket as fallback. */ + private sendSignaling(type: string, data: Record): void { + this.sendJson({ type, ...data }); + } + + /** Tear down the peer connection, nullifying handlers first to prevent stale + * callbacks. */ + private teardownPeerConnection(): void { + if (this.dataChannel) { + this.dataChannel.onerror = null; + this.dataChannel.onmessage = null; + this.dataChannel.onopen = null; + this.dataChannel.onclose = null; + try { + this.dataChannel.close(); + } catch { + /* ignore */ + } + this.dataChannel = null; + } + if (this.pc) { + this.pc.onicecandidate = null; + this.pc.onicecandidateerror = null; + this.pc.onconnectionstatechange = null; + this.pc.oniceconnectionstatechange = null; + this.pc.onsignalingstatechange = null; + this.pc.onnegotiationneeded = null; + this.pc.ontrack = null; + this.pc.ondatachannel = null; + try { + this.pc.close(); + } catch { + /* ignore */ + } + this.pc = null; + } + this.mediaStream = null; + this.connected = false; } private emitError(error: DesktopStreamErrorStatus | Error): void { @@ -203,34 +515,3 @@ export class DesktopStreamSession { } } } - -function parseStatusFrame(payload: string): DesktopStreamStatusMessage | null { - const value = JSON.parse(payload) as Record; - if (value.type === "ready" && typeof value.width === "number" && typeof value.height === "number") { - return { - type: "ready", - width: value.width, - height: value.height, - }; - } - if (value.type === "error" && typeof value.message === "string") { - return { - type: "error", - message: value.message, - }; - } - return null; -} - -async function decodeBinaryFrame(data: unknown): Promise { - if (data instanceof ArrayBuffer) { - return new Uint8Array(data); - } - if (ArrayBuffer.isView(data)) { - return new Uint8Array(data.buffer, data.byteOffset, data.byteLength); - } - if (typeof Blob !== "undefined" && data instanceof Blob) { - return new Uint8Array(await data.arrayBuffer()); - } - throw new Error("Unsupported desktop stream binary frame type."); -} diff --git a/sdks/typescript/src/generated/openapi.ts b/sdks/typescript/src/generated/openapi.ts index 6d1fb26..a987992 100644 --- a/sdks/typescript/src/generated/openapi.ts +++ b/sdks/typescript/src/generated/openapi.ts @@ -3,7 +3,6 @@ * Do not make direct changes to the file. */ - export interface paths { "/v1/acp": { get: operations["get_v1_acp_servers"]; @@ -225,9 +224,10 @@ export interface paths { }; "/v1/desktop/stream/ws": { /** - * Open a desktop websocket streaming session. - * @description Upgrades the connection to a websocket that streams JPEG desktop frames and - * accepts mouse and keyboard control frames. + * Open a desktop WebRTC signaling session. + * @description Upgrades the connection to a WebSocket used for WebRTC signaling between + * the browser client and the desktop streaming process. Also accepts mouse + * and keyboard input frames as a fallback transport. */ get: operations["get_v1_desktop_stream_ws"]; }; @@ -633,7 +633,23 @@ export interface components { windows: components["schemas"]["DesktopWindowInfo"][]; }; /** @enum {string} */ - ErrorType: "invalid_request" | "conflict" | "unsupported_agent" | "agent_not_installed" | "install_failed" | "agent_process_exited" | "token_invalid" | "permission_denied" | "not_acceptable" | "unsupported_media_type" | "not_found" | "session_not_found" | "session_already_exists" | "mode_not_supported" | "stream_error" | "timeout"; + ErrorType: + | "invalid_request" + | "conflict" + | "unsupported_agent" + | "agent_not_installed" + | "install_failed" + | "agent_process_exited" + | "token_invalid" + | "permission_denied" + | "not_acceptable" + | "unsupported_media_type" + | "not_found" + | "session_not_found" + | "session_already_exists" + | "mode_not_supported" + | "stream_error" + | "timeout"; FsActionResponse: { path: string; }; @@ -692,35 +708,37 @@ export interface components { directory: string; mcpName: string; }; - McpServerConfig: ({ - args?: string[]; - command: string; - cwd?: string | null; - enabled?: boolean | null; - env?: { - [key: string]: string; - } | null; - /** Format: int64 */ - timeoutMs?: number | null; - /** @enum {string} */ - type: "local"; - }) | ({ - bearerTokenEnvVar?: string | null; - enabled?: boolean | null; - envHeaders?: { - [key: string]: string; - } | null; - headers?: { - [key: string]: string; - } | null; - oauth?: Record | null | null; - /** Format: int64 */ - timeoutMs?: number | null; - transport?: string | null; - /** @enum {string} */ - type: "remote"; - url: string; - }); + McpServerConfig: + | { + args?: string[]; + command: string; + cwd?: string | null; + enabled?: boolean | null; + env?: { + [key: string]: string; + } | null; + /** Format: int64 */ + timeoutMs?: number | null; + /** @enum {string} */ + type: "local"; + } + | { + bearerTokenEnvVar?: string | null; + enabled?: boolean | null; + envHeaders?: { + [key: string]: string; + } | null; + headers?: { + [key: string]: string; + } | null; + oauth?: Record | null | null; + /** Format: int64 */ + timeoutMs?: number | null; + transport?: string | null; + /** @enum {string} */ + type: "remote"; + url: string; + }; ProblemDetails: { detail?: string | null; instance?: string | null; @@ -880,7 +898,6 @@ export type $defs = Record; export type external = Record; export interface operations { - get_v1_acp_servers: { responses: { /** @description Active ACP server instances */ @@ -2002,9 +2019,10 @@ export interface operations { }; }; /** - * Open a desktop websocket streaming session. - * @description Upgrades the connection to a websocket that streams JPEG desktop frames and - * accepts mouse and keyboard control frames. + * Open a desktop WebRTC signaling session. + * @description Upgrades the connection to a WebSocket used for WebRTC signaling between + * the browser client and the desktop streaming process. Also accepts mouse + * and keyboard input frames as a fallback transport. */ get_v1_desktop_stream_ws: { parameters: { diff --git a/server/compose.dev.yaml b/server/compose.dev.yaml new file mode 100644 index 0000000..68acff5 --- /dev/null +++ b/server/compose.dev.yaml @@ -0,0 +1,40 @@ +name: sandbox-agent-dev + +services: + backend: + build: + context: .. + dockerfile: docker/test-agent/Dockerfile + image: sandbox-agent-dev + command: ["server", "--host", "0.0.0.0", "--port", "3000", "--no-token"] + environment: + RUST_LOG: "${RUST_LOG:-info}" + ports: + - "2468:3000" + + frontend: + build: + context: .. + dockerfile: server/docker/frontend.dev.Dockerfile + working_dir: /app + depends_on: + - backend + environment: + SANDBOX_AGENT_URL: "http://backend:3000" + ports: + - "5173:5173" + volumes: + - "..:/app" + # Keep Linux-native node_modules inside the container. + - "sa_root_node_modules:/app/node_modules" + - "sa_inspector_node_modules:/app/frontend/packages/inspector/node_modules" + - "sa_react_node_modules:/app/sdks/react/node_modules" + - "sa_typescript_node_modules:/app/sdks/typescript/node_modules" + - "sa_pnpm_store:/root/.local/share/pnpm/store" + +volumes: + sa_root_node_modules: {} + sa_inspector_node_modules: {} + sa_react_node_modules: {} + sa_typescript_node_modules: {} + sa_pnpm_store: {} diff --git a/server/docker/frontend.dev.Dockerfile b/server/docker/frontend.dev.Dockerfile new file mode 100644 index 0000000..f24c80a --- /dev/null +++ b/server/docker/frontend.dev.Dockerfile @@ -0,0 +1,5 @@ +FROM node:22-bookworm-slim +RUN npm install -g pnpm +WORKDIR /app +EXPOSE 5173 +CMD ["sh", "-c", "pnpm install && cd frontend/packages/inspector && npx vite --host 0.0.0.0"] diff --git a/server/packages/sandbox-agent/Cargo.toml b/server/packages/sandbox-agent/Cargo.toml index a8ae1db..a30a0c7 100644 --- a/server/packages/sandbox-agent/Cargo.toml +++ b/server/packages/sandbox-agent/Cargo.toml @@ -42,6 +42,9 @@ toml_edit.workspace = true tar.workspace = true zip.workspace = true tempfile = { workspace = true, optional = true } +gstreamer = { version = "0.23", optional = true } +gstreamer-sdp = { version = "0.23", optional = true } +gstreamer-webrtc = { version = "0.23", optional = true } [target.'cfg(unix)'.dependencies] libc = "0.2" @@ -59,3 +62,4 @@ tokio-tungstenite = "0.24" [features] test-utils = ["tempfile"] +desktop-gstreamer = ["gstreamer", "gstreamer-sdp", "gstreamer-webrtc"] diff --git a/server/packages/sandbox-agent/src/desktop_gstreamer.rs b/server/packages/sandbox-agent/src/desktop_gstreamer.rs new file mode 100644 index 0000000..e6b0b7a --- /dev/null +++ b/server/packages/sandbox-agent/src/desktop_gstreamer.rs @@ -0,0 +1,246 @@ +/// GStreamer WebRTC pipeline for desktop streaming. +/// +/// Creates a pipeline that captures the X11 display via `ximagesrc`, encodes to +/// VP8, and streams over WebRTC using `webrtcbin`. Signaling (SDP offer/answer, +/// ICE candidate exchange) is handled via channels that the caller bridges to +/// the client WebSocket. +#[cfg(feature = "desktop-gstreamer")] +pub mod pipeline { + use gstreamer as gst; + use gstreamer::prelude::*; + use gstreamer_sdp as gst_sdp; + use gstreamer_webrtc as gst_webrtc; + use tokio::sync::mpsc; + + /// Messages sent from the GStreamer pipeline to the WebSocket handler. + #[derive(Debug)] + pub enum PipelineEvent { + /// SDP offer generated by webrtcbin. + Offer(String), + /// ICE candidate produced by webrtcbin. + IceCandidate { + candidate: String, + sdp_m_line_index: u32, + }, + } + + /// Messages sent from the WebSocket handler to the GStreamer pipeline. + #[derive(Debug)] + pub enum SignalingCommand { + /// SDP answer from the client. + Answer(String), + /// ICE candidate from the client. + IceCandidate { + candidate: String, + sdp_m_line_index: u32, + }, + } + + pub struct GStreamerPipeline { + pipeline: gst::Pipeline, + cmd_tx: mpsc::UnboundedSender, + } + + impl GStreamerPipeline { + /// Create and start a new GStreamer WebRTC pipeline for the given display. + /// + /// Returns the pipeline handle and a receiver for pipeline events (offers, + /// ICE candidates) that should be forwarded to the client. + pub fn new( + display: &str, + ) -> Result<(Self, mpsc::UnboundedReceiver), String> { + gst::init().map_err(|e| { + format!( + "Desktop streaming requires GStreamer. Install it with: \ + sandbox-agent desktop install\n\ + Error: {e}" + ) + })?; + + let pipeline_str = format!( + "ximagesrc display-name={display} use-damage=true show-pointer=true \ + ! video/x-raw,framerate=30/1 \ + ! videorate \ + ! videoconvert \ + ! queue max-size-buffers=1 leaky=downstream \ + ! vp8enc deadline=1 target-bitrate=3000000 cpu-used=16 threads=4 \ + keyframe-max-dist=60 end-usage=cbr buffer-size=500 buffer-initial-size=300 \ + error-resilient=partitions \ + ! rtpvp8pay picture-id-mode=15bit \ + ! queue max-size-buffers=1 leaky=downstream \ + ! application/x-rtp,media=video,encoding-name=VP8,payload=96 \ + ! webrtcbin name=wb bundle-policy=max-bundle" + ); + + let pipeline = gst::parse::launch(&pipeline_str) + .map_err(|e| format!("failed to create GStreamer pipeline: {e}"))? + .downcast::() + .map_err(|_| "pipeline is not a GstPipeline".to_string())?; + + let webrtcbin = pipeline + .by_name("wb") + .ok_or_else(|| "webrtcbin element not found in pipeline".to_string())?; + + // Configure STUN for ICE connectivity (used for server-reflexive + // candidates when behind NAT). + webrtcbin.set_property_from_str("stun-server", "stun://stun.l.google.com:19302"); + + // Restrict the UDP port range so Docker port forwarding works. + // The ice-agent is a GstWebRTCICE which wraps a NiceAgent. + let ice_agent: gst::glib::Object = webrtcbin.property("ice-agent"); + // GstWebRTCNice has a "min-rtp-port" and "max-rtp-port" property + // in newer versions, but on GStreamer 1.22 we need to access the + // underlying NiceAgent via the "agent" property. + if ice_agent.has_property("min-rtp-port", None) { + ice_agent.set_property("min-rtp-port", 30000u32); + ice_agent.set_property("max-rtp-port", 30100u32); + } else if ice_agent.has_property("agent", None) { + let nice_agent: gst::glib::Object = ice_agent.property("agent"); + nice_agent.set_property("max-port", 30100u32); + nice_agent.set_property("min-port", 30000u32); + } + + // Channel for pipeline -> WS handler events. + let (event_tx, event_rx) = mpsc::unbounded_channel::(); + + // Channel for WS handler -> pipeline commands. + let (cmd_tx, mut cmd_rx) = mpsc::unbounded_channel::(); + + // Note: Data channel for input will be created once we establish + // the WebRTC connection. Input falls back to the WS transport. + + // When webrtcbin needs to negotiate, create an offer. + let wb_clone = webrtcbin.clone(); + let event_tx_offer = event_tx.clone(); + webrtcbin.connect("on-negotiation-needed", false, move |_| { + let wb_offer = wb_clone.clone(); + let wb_create = wb_clone.clone(); + let tx = event_tx_offer.clone(); + let promise = gst::Promise::with_change_func(move |reply| { + let reply = match reply { + Ok(Some(reply)) => reply, + _ => return, + }; + let offer = match reply.value("offer") { + Ok(offer) => offer, + Err(_) => return, + }; + let offer = offer + .get::() + .expect("offer is WebRTCSessionDescription"); + wb_offer.emit_by_name::<()>( + "set-local-description", + &[&offer, &None::], + ); + if let Ok(sdp_text) = offer.sdp().as_text() { + let _ = tx.send(PipelineEvent::Offer(sdp_text.to_string())); + } + }); + wb_create.emit_by_name::<()>("create-offer", &[&None::, &promise]); + None + }); + + // When webrtcbin produces an ICE candidate, send it to client. + // We rewrite host candidates to use 127.0.0.1 so the browser can + // reach the server when running inside Docker. + let event_tx_ice = event_tx; + webrtcbin.connect("on-ice-candidate", false, move |values| { + let sdp_m_line_index = values[1].get::().expect("m-line index is u32"); + let candidate = values[2].get::().expect("candidate is String"); + + // Only forward UDP host candidates, rewritten to 127.0.0.1. + // Skip TCP candidates (browsers rarely use TCP for WebRTC media) + // and server-reflexive candidates (STUN responses with public IPs). + if candidate.contains("UDP") && candidate.contains("typ host") { + // Replace the Docker-internal IP with 127.0.0.1 so the + // browser on the host can connect. + let rewritten = rewrite_candidate_ip(&candidate, "127.0.0.1"); + let _ = event_tx_ice.send(PipelineEvent::IceCandidate { + candidate: rewritten, + sdp_m_line_index, + }); + } + + None + }); + + // Start the pipeline. + pipeline + .set_state(gst::State::Playing) + .map_err(|e| format!("failed to start GStreamer pipeline: {e}"))?; + + // Spawn a thread to process signaling commands from the WS handler. + let wb_cmd = webrtcbin.clone(); + std::thread::spawn(move || { + while let Some(cmd) = cmd_rx.blocking_recv() { + match cmd { + SignalingCommand::Answer(sdp_str) => { + let sdp = match gst_sdp::SDPMessage::parse_buffer(sdp_str.as_bytes()) { + Ok(sdp) => sdp, + Err(e) => { + tracing::warn!(error = ?e, "failed to parse SDP answer"); + continue; + } + }; + let answer = gst_webrtc::WebRTCSessionDescription::new( + gst_webrtc::WebRTCSDPType::Answer, + sdp, + ); + wb_cmd.emit_by_name::<()>( + "set-remote-description", + &[&answer, &None::], + ); + } + SignalingCommand::IceCandidate { + candidate, + sdp_m_line_index, + } => { + wb_cmd.emit_by_name::<()>( + "add-ice-candidate", + &[&sdp_m_line_index, &candidate], + ); + } + } + } + }); + + Ok((Self { pipeline, cmd_tx }, event_rx)) + } + + /// Send a signaling command to the pipeline. + pub fn send_command(&self, cmd: SignalingCommand) { + let _ = self.cmd_tx.send(cmd); + } + } + + impl Drop for GStreamerPipeline { + fn drop(&mut self) { + let _ = self.pipeline.set_state(gst::State::Null); + } + } + + /// Rewrite the IP address in an ICE candidate string. + /// + /// ICE candidate format: + /// candidate:1 1 UDP 2015363327 172.17.0.6 39395 typ host + /// + /// We replace the IP (field 5, 0-indexed) with the target IP. + fn rewrite_candidate_ip(candidate: &str, target_ip: &str) -> String { + let parts: Vec<&str> = candidate.splitn(6, ' ').collect(); + if parts.len() >= 6 { + // parts[4] is the IP address + let rest_after_ip = &candidate[parts[..5].join(" ").len()..]; + format!( + "{} {} {} {} {}{}", + parts[0], parts[1], parts[2], parts[3], target_ip, rest_after_ip + ) + } else { + candidate.to_string() + } + } +} + +/// Check if GStreamer support is compiled in. +pub fn is_available() -> bool { + cfg!(feature = "desktop-gstreamer") +} diff --git a/server/packages/sandbox-agent/src/desktop_install.rs b/server/packages/sandbox-agent/src/desktop_install.rs index 480da7d..ca0ceee 100644 --- a/server/packages/sandbox-agent/src/desktop_install.rs +++ b/server/packages/sandbox-agent/src/desktop_install.rs @@ -110,6 +110,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V "dbus-x11", "xauth", "fonts-dejavu-core", + "libgstreamer1.0-0", + "gstreamer1.0-plugins-base", + "gstreamer1.0-plugins-good", + "gstreamer1.0-plugins-bad", + "gstreamer1.0-plugins-ugly", + "gstreamer1.0-nice", + "gstreamer1.0-x", ], DesktopPackageManager::Dnf => vec![ "xorg-x11-server-Xvfb", @@ -121,6 +128,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V "dbus-x11", "xauth", "dejavu-sans-fonts", + "gstreamer1", + "gstreamer1-plugins-base", + "gstreamer1-plugins-good", + "gstreamer1-plugins-bad-free", + "gstreamer1-plugins-ugly-free", + "gstreamer1-plugin-libnice", + "gstreamer1-plugins-good-extras", ], DesktopPackageManager::Apk => vec![ "xvfb", @@ -132,6 +146,12 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V "dbus", "xauth", "ttf-dejavu", + "gstreamer", + "gst-plugins-base", + "gst-plugins-good", + "gst-plugins-bad", + "gst-plugins-ugly", + "libnice-gstreamer", ], } .into_iter() diff --git a/server/packages/sandbox-agent/src/desktop_runtime.rs b/server/packages/sandbox-agent/src/desktop_runtime.rs index 4af13ed..192ab11 100644 --- a/server/packages/sandbox-agent/src/desktop_runtime.rs +++ b/server/packages/sandbox-agent/src/desktop_runtime.rs @@ -10,20 +10,20 @@ use tokio::sync::Mutex; use sandbox_agent_error::SandboxError; -use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager}; use crate::desktop_errors::DesktopProblem; use crate::desktop_install::desktop_platform_support_message; +use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager}; use crate::desktop_streaming::DesktopStreamingManager; use crate::desktop_types::{ - DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo, - DesktopKeyModifiers, DesktopKeyboardDownRequest, DesktopKeyboardPressRequest, - DesktopKeyboardTypeRequest, DesktopKeyboardUpRequest, DesktopMouseButton, - DesktopMouseClickRequest, DesktopMouseDownRequest, DesktopMouseDragRequest, - DesktopMouseMoveRequest, DesktopMousePositionResponse, DesktopMouseScrollRequest, - DesktopMouseUpRequest, DesktopProcessInfo, DesktopRecordingInfo, - DesktopRecordingListResponse, DesktopRecordingStartRequest, DesktopRegionScreenshotQuery, - DesktopResolution, DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest, - DesktopState, DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo, + DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo, DesktopKeyModifiers, + DesktopKeyboardDownRequest, DesktopKeyboardPressRequest, DesktopKeyboardTypeRequest, + DesktopKeyboardUpRequest, DesktopMouseButton, DesktopMouseClickRequest, + DesktopMouseDownRequest, DesktopMouseDragRequest, DesktopMouseMoveRequest, + DesktopMousePositionResponse, DesktopMouseScrollRequest, DesktopMouseUpRequest, + DesktopProcessInfo, DesktopRecordingInfo, DesktopRecordingListResponse, + DesktopRecordingStartRequest, DesktopRegionScreenshotQuery, DesktopResolution, + DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest, DesktopState, + DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo, DesktopWindowListResponse, }; use crate::process_runtime::{ @@ -172,9 +172,9 @@ impl DesktopRuntime { let recording_manager = DesktopRecordingManager::new(process_runtime.clone(), config.state_dir.clone()); Self { + streaming_manager: DesktopStreamingManager::new(), process_runtime, recording_manager, - streaming_manager: DesktopStreamingManager::new(), inner: Arc::new(Mutex::new(DesktopRuntimeStateData { state: DesktopState::Inactive, display_num: config.display_num, @@ -197,7 +197,10 @@ impl DesktopRuntime { pub async fn status(&self) -> DesktopStatusResponse { let mut state = self.inner.lock().await; self.refresh_status_locked(&mut state).await; - self.snapshot_locked(&state) + let mut response = self.snapshot_locked(&state); + drop(state); + + response } pub async fn start( @@ -221,7 +224,10 @@ impl DesktopRuntime { self.refresh_status_locked(&mut state).await; if state.state == DesktopState::Active { - return Ok(self.snapshot_locked(&state)); + let mut response = self.snapshot_locked(&state); + drop(state); + + return Ok(response); } if !state.missing_dependencies.is_empty() { @@ -307,7 +313,10 @@ impl DesktopRuntime { ), ); - Ok(self.snapshot_locked(&state)) + let mut response = self.snapshot_locked(&state); + drop(state); + + Ok(response) } pub async fn stop(&self) -> Result { @@ -336,7 +345,10 @@ impl DesktopRuntime { state.install_command = self.install_command_for(&state.missing_dependencies); state.environment.clear(); - Ok(self.snapshot_locked(&state)) + let mut response = self.snapshot_locked(&state); + drop(state); + + Ok(response) } pub async fn shutdown(&self) { @@ -630,8 +642,23 @@ impl DesktopRuntime { self.recording_manager.delete(id).await } - pub async fn start_streaming(&self) -> DesktopStreamStatusResponse { - self.streaming_manager.start().await + pub async fn start_streaming(&self) -> Result { + let state = self.inner.lock().await; + let display = state + .display + .as_deref() + .ok_or_else(|| SandboxError::Conflict { + message: "desktop runtime is not active".to_string(), + })?; + let resolution = state + .resolution + .clone() + .ok_or_else(|| SandboxError::Conflict { + message: "desktop runtime is not active".to_string(), + })?; + let display = display.to_string(); + drop(state); + Ok(self.streaming_manager.start(&display, resolution).await) } pub async fn stop_streaming(&self) -> DesktopStreamStatusResponse { @@ -639,7 +666,17 @@ impl DesktopRuntime { } pub async fn ensure_streaming_active(&self) -> Result<(), SandboxError> { - self.streaming_manager.ensure_active().await + if self.streaming_manager.is_active().await { + Ok(()) + } else { + Err(SandboxError::Conflict { + message: "desktop streaming is not active".to_string(), + }) + } + } + + pub fn streaming_manager(&self) -> &DesktopStreamingManager { + &self.streaming_manager } async fn recording_context(&self) -> Result { @@ -831,8 +868,14 @@ impl DesktopRuntime { name: &str, ) -> Result<(), DesktopProblem> { let process_id = match name { - "Xvfb" => state.xvfb.as_ref().map(|process| process.process_id.clone()), - "openbox" => state.openbox.as_ref().map(|process| process.process_id.clone()), + "Xvfb" => state + .xvfb + .as_ref() + .map(|process| process.process_id.clone()), + "openbox" => state + .openbox + .as_ref() + .map(|process| process.process_id.clone()), _ => None, }; diff --git a/server/packages/sandbox-agent/src/desktop_streaming.rs b/server/packages/sandbox-agent/src/desktop_streaming.rs index 86fb611..d244d83 100644 --- a/server/packages/sandbox-agent/src/desktop_streaming.rs +++ b/server/packages/sandbox-agent/src/desktop_streaming.rs @@ -2,9 +2,7 @@ use std::sync::Arc; use tokio::sync::Mutex; -use sandbox_agent_error::SandboxError; - -use crate::desktop_types::DesktopStreamStatusResponse; +use crate::desktop_types::{DesktopResolution, DesktopStreamStatusResponse}; #[derive(Debug, Clone)] pub struct DesktopStreamingManager { @@ -14,6 +12,8 @@ pub struct DesktopStreamingManager { #[derive(Debug, Default)] struct DesktopStreamingState { active: bool, + display: Option, + resolution: Option, } impl DesktopStreamingManager { @@ -23,25 +23,46 @@ impl DesktopStreamingManager { } } - pub async fn start(&self) -> DesktopStreamStatusResponse { + /// Mark desktop streaming as active for the given display and resolution. + /// + /// The actual GStreamer pipeline is created per-WebSocket-session in the + /// signaling handler — this method just records that streaming is enabled. + pub async fn start( + &self, + display: &str, + resolution: DesktopResolution, + ) -> DesktopStreamStatusResponse { let mut state = self.inner.lock().await; + + if state.active { + return DesktopStreamStatusResponse { active: true }; + } + state.active = true; + state.display = Some(display.to_string()); + state.resolution = Some(resolution); + DesktopStreamStatusResponse { active: true } } + /// Stop streaming and clear state. pub async fn stop(&self) -> DesktopStreamStatusResponse { let mut state = self.inner.lock().await; state.active = false; + state.display = None; + state.resolution = None; DesktopStreamStatusResponse { active: false } } - pub async fn ensure_active(&self) -> Result<(), SandboxError> { - if self.inner.lock().await.active { - Ok(()) - } else { - Err(SandboxError::Conflict { - message: "desktop streaming is not active".to_string(), - }) - } + pub async fn is_active(&self) -> bool { + self.inner.lock().await.active + } + + pub async fn resolution(&self) -> Option { + self.inner.lock().await.resolution.clone() + } + + pub async fn display_name(&self) -> Option { + self.inner.lock().await.display.clone() } } diff --git a/server/packages/sandbox-agent/src/lib.rs b/server/packages/sandbox-agent/src/lib.rs index d7b92d6..31103a6 100644 --- a/server/packages/sandbox-agent/src/lib.rs +++ b/server/packages/sandbox-agent/src/lib.rs @@ -4,6 +4,7 @@ mod acp_proxy_runtime; pub mod cli; pub mod daemon; mod desktop_errors; +mod desktop_gstreamer; mod desktop_install; mod desktop_recording; mod desktop_runtime; diff --git a/server/packages/sandbox-agent/src/router.rs b/server/packages/sandbox-agent/src/router.rs index ab09ee6..37bf699 100644 --- a/server/packages/sandbox-agent/src/router.rs +++ b/server/packages/sandbox-agent/src/router.rs @@ -41,9 +41,9 @@ use crate::desktop_errors::DesktopProblem; use crate::desktop_runtime::DesktopRuntime; use crate::desktop_types::*; use crate::process_runtime::{ - decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream, ProcessOwner as RuntimeProcessOwner, - ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot, ProcessStartSpec, ProcessStatus, - ProcessStream, RunSpec, + decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream, + ProcessOwner as RuntimeProcessOwner, ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot, + ProcessStartSpec, ProcessStatus, ProcessStream, RunSpec, }; use crate::ui; @@ -235,7 +235,7 @@ pub fn build_router_with_state(shared: Arc) -> (Router, Arc) ) .route("/desktop/stream/start", post(post_v1_desktop_stream_start)) .route("/desktop/stream/stop", post(post_v1_desktop_stream_stop)) - .route("/desktop/stream/ws", get(get_v1_desktop_stream_ws)) + .route("/desktop/stream/signaling", get(get_v1_desktop_stream_ws)) .route("/agents", get(get_v1_agents)) .route("/agents/:agent", get(get_v1_agent)) .route("/agents/:agent/install", post(post_v1_agent_install)) @@ -1135,9 +1135,11 @@ async fn get_v1_desktop_recording_download( Path(id): Path, ) -> Result { let path = state.desktop_runtime().recording_download_path(&id).await?; - let bytes = tokio::fs::read(&path).await.map_err(|err| SandboxError::StreamError { - message: format!("failed to read desktop recording {}: {err}", path.display()), - })?; + let bytes = tokio::fs::read(&path) + .await + .map_err(|err| SandboxError::StreamError { + message: format!("failed to read desktop recording {}: {err}", path.display()), + })?; Ok(([(header::CONTENT_TYPE, "video/mp4")], Bytes::from(bytes)).into_response()) } @@ -1179,7 +1181,7 @@ async fn delete_v1_desktop_recording( async fn post_v1_desktop_stream_start( State(state): State>, ) -> Result, ApiError> { - Ok(Json(state.desktop_runtime().start_streaming().await)) + Ok(Json(state.desktop_runtime().start_streaming().await?)) } /// Stop desktop streaming. @@ -1199,13 +1201,14 @@ async fn post_v1_desktop_stream_stop( Ok(Json(state.desktop_runtime().stop_streaming().await)) } -/// Open a desktop websocket streaming session. +/// Open a desktop WebRTC signaling session. /// -/// Upgrades the connection to a websocket that streams JPEG desktop frames and -/// accepts mouse and keyboard control frames. +/// Upgrades the connection to a WebSocket used for WebRTC signaling between +/// the browser client and the desktop streaming process. Also accepts mouse +/// and keyboard input frames as a fallback transport. #[utoipa::path( get, - path = "/v1/desktop/stream/ws", + path = "/v1/desktop/stream/signaling", tag = "v1", params( ("access_token" = Option, Query, description = "Bearer token alternative for WS auth") @@ -2449,46 +2452,6 @@ enum TerminalClientFrame { Close, } -#[derive(Debug, Deserialize)] -#[serde(tag = "type", rename_all = "camelCase")] -enum DesktopStreamClientFrame { - MoveMouse { - x: i32, - y: i32, - }, - MouseDown { - #[serde(default)] - x: Option, - #[serde(default)] - y: Option, - #[serde(default)] - button: Option, - }, - MouseUp { - #[serde(default)] - x: Option, - #[serde(default)] - y: Option, - #[serde(default)] - button: Option, - }, - Scroll { - x: i32, - y: i32, - #[serde(default)] - delta_x: Option, - #[serde(default)] - delta_y: Option, - }, - KeyDown { - key: String, - }, - KeyUp { - key: String, - }, - Close, -} - async fn process_terminal_ws_session( mut socket: WebSocket, runtime: Arc, @@ -2601,22 +2564,38 @@ async fn process_terminal_ws_session( } } -async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc) { - let display_info = match desktop_runtime.display_info().await { - Ok(info) => info, - Err(err) => { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - let _ = socket.close().await; - return; - } - }; +/// WebRTC signaling and input session. +/// +/// Handles WebRTC signaling (SDP offer/answer, ICE candidate exchange) and +/// accepts mouse/keyboard input as a fallback transport when the WebRTC data +/// channel is not established. When compiled with the `desktop-gstreamer` +/// feature, creates a GStreamer pipeline for real video streaming. +async fn desktop_stream_ws_session(mut ws: WebSocket, desktop_runtime: Arc) { + let streaming = desktop_runtime.streaming_manager(); + // Get resolution for the ready message. + let resolution = + streaming + .resolution() + .await + .unwrap_or(crate::desktop_types::DesktopResolution { + width: 1440, + height: 900, + dpi: None, + }); + + let x_display = streaming + .display_name() + .await + .unwrap_or_else(|| ":99".to_string()); + + // Send stream metadata immediately. if send_ws_json( - &mut socket, + &mut ws, json!({ "type": "ready", - "width": display_info.resolution.width, - "height": display_info.resolution.height, + "width": resolution.width, + "height": resolution.height, }), ) .await @@ -2625,109 +2604,270 @@ async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc { + tracing::info!(display = %x_display, "GStreamer WebRTC pipeline started"); + // Run the session with the GStreamer pipeline active. + desktop_stream_ws_loop_gstreamer( + &mut ws, + &desktop_runtime, + &pipeline, + &mut event_rx, + ) + .await; + // Pipeline is dropped here, stopping GStreamer. + let _ = ws.close().await; + return; + } + Err(e) => { + tracing::warn!(error = %e, "GStreamer pipeline creation failed"); + let _ = send_ws_error(&mut ws, &e).await; + } + } + } + + // Fallback: run without GStreamer (input-only, no video). + desktop_stream_ws_loop_simple(&mut ws, &desktop_runtime).await; + let _ = ws.close().await; +} + +/// Inner WS message loop — input-only, no GStreamer pipeline. +async fn desktop_stream_ws_loop_simple(ws: &mut WebSocket, desktop_runtime: &Arc) { + loop { + let ws_msg = ws.recv().await; + if !handle_ws_message_simple(ws_msg, ws, desktop_runtime).await { + break; + } + } +} + +/// Inner WS message loop with GStreamer pipeline — polls both pipeline events +/// and client WS messages. +#[cfg(feature = "desktop-gstreamer")] +async fn desktop_stream_ws_loop_gstreamer( + ws: &mut WebSocket, + desktop_runtime: &Arc, + pipeline: &crate::desktop_gstreamer::pipeline::GStreamerPipeline, + event_rx: &mut tokio::sync::mpsc::UnboundedReceiver< + crate::desktop_gstreamer::pipeline::PipelineEvent, + >, +) { + use crate::desktop_gstreamer::pipeline::{PipelineEvent, SignalingCommand}; loop { tokio::select! { - ws_in = socket.recv() => { - match ws_in { - Some(Ok(Message::Text(text))) => { - match serde_json::from_str::(&text) { - Ok(DesktopStreamClientFrame::MoveMouse { x, y }) => { - if let Err(err) = desktop_runtime - .move_mouse(DesktopMouseMoveRequest { x, y }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::MouseDown { x, y, button }) => { - if let Err(err) = desktop_runtime - .mouse_down(DesktopMouseDownRequest { x, y, button }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::MouseUp { x, y, button }) => { - if let Err(err) = desktop_runtime - .mouse_up(DesktopMouseUpRequest { x, y, button }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::Scroll { x, y, delta_x, delta_y }) => { - if let Err(err) = desktop_runtime - .scroll_mouse(DesktopMouseScrollRequest { - x, - y, - delta_x, - delta_y, - }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::KeyDown { key }) => { - if let Err(err) = desktop_runtime - .key_down(DesktopKeyboardDownRequest { key }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::KeyUp { key }) => { - if let Err(err) = desktop_runtime - .key_up(DesktopKeyboardUpRequest { key }) - .await - { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - } - } - Ok(DesktopStreamClientFrame::Close) => { - let _ = socket.close().await; - break; - } - Err(err) => { - let _ = send_ws_error(&mut socket, &format!("invalid desktop stream frame: {err}")).await; - } - } - } - Some(Ok(Message::Ping(payload))) => { - let _ = socket.send(Message::Pong(payload)).await; - } - Some(Ok(Message::Close(_))) | None => break, - Some(Ok(Message::Binary(_))) | Some(Ok(Message::Pong(_))) => {} - Some(Err(_)) => break, - } - } - _ = frame_tick.tick() => { - let frame = desktop_runtime - .screenshot(DesktopScreenshotQuery { - format: Some(DesktopScreenshotFormat::Jpeg), - quality: Some(60), - scale: Some(1.0), - }) - .await; - match frame { - Ok(frame) => { - if socket.send(Message::Binary(frame.bytes.into())).await.is_err() { + pipeline_event = event_rx.recv() => { + match pipeline_event { + Some(PipelineEvent::Offer(sdp)) => { + if send_ws_json(ws, json!({"type": "offer", "sdp": sdp})).await.is_err() { break; } } - Err(err) => { - let _ = send_ws_error(&mut socket, &err.to_error_info().message).await; - let _ = socket.close().await; - break; + Some(PipelineEvent::IceCandidate { candidate, sdp_m_line_index }) => { + if send_ws_json(ws, json!({ + "type": "candidate", + "candidate": candidate, + "sdpMLineIndex": sdp_m_line_index, + })).await.is_err() { + break; + } } + None => break, + } + } + ws_msg = ws.recv() => { + match ws_msg { + Some(Ok(Message::Text(text))) => { + let parsed: Value = match serde_json::from_str(&text) { + Ok(v) => v, + Err(_) => continue, + }; + match parsed.get("type").and_then(|v| v.as_str()) { + Some("answer") => { + if let Some(sdp) = parsed.get("sdp").and_then(|v| v.as_str()) { + pipeline.send_command(SignalingCommand::Answer(sdp.to_string())); + } + } + Some("candidate") => { + if let Some(candidate) = parsed.get("candidate").and_then(|v| v.as_str()) { + let sdp_m_line_index = parsed + .get("sdpMLineIndex") + .and_then(|v| v.as_u64()) + .unwrap_or(0) as u32; + pipeline.send_command(SignalingCommand::IceCandidate { + candidate: candidate.to_string(), + sdp_m_line_index, + }); + } + } + // Input messages (fallback transport) + Some("moveMouse") => { + if let (Some(x), Some(y)) = ( + parsed.get("x").and_then(|v| v.as_i64()), + parsed.get("y").and_then(|v| v.as_i64()), + ) { + let _ = desktop_runtime + .move_mouse(DesktopMouseMoveRequest { x: x as i32, y: y as i32 }) + .await; + } + } + Some("mouseDown") => { + let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok()); + let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32); + let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32); + let _ = desktop_runtime.mouse_down(DesktopMouseDownRequest { x, y, button }).await; + } + Some("mouseUp") => { + let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok()); + let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32); + let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32); + let _ = desktop_runtime.mouse_up(DesktopMouseUpRequest { x, y, button }).await; + } + Some("scroll") => { + if let (Some(x), Some(y)) = ( + parsed.get("x").and_then(|v| v.as_i64()), + parsed.get("y").and_then(|v| v.as_i64()), + ) { + let dx = parsed.get("deltaX").and_then(|v| v.as_i64()).map(|v| v as i32); + let dy = parsed.get("deltaY").and_then(|v| v.as_i64()).map(|v| v as i32); + let _ = desktop_runtime.scroll_mouse(DesktopMouseScrollRequest { x: x as i32, y: y as i32, delta_x: dx, delta_y: dy }).await; + } + } + Some("keyDown") => { + if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) { + let _ = desktop_runtime.key_down(DesktopKeyboardDownRequest { key: key.to_string() }).await; + } + } + Some("keyUp") => { + if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) { + let _ = desktop_runtime.key_up(DesktopKeyboardUpRequest { key: key.to_string() }).await; + } + } + _ => {} + } + } + Some(Ok(Message::Ping(payload))) => { + let _ = ws.send(Message::Pong(payload)).await; + } + Some(Ok(Message::Close(_))) | None | Some(Err(_)) => break, + _ => {} } } } } } +/// Process a single WebSocket message (no pipeline). Returns false to close. +async fn handle_ws_message_simple( + msg: Option>, + ws: &mut WebSocket, + desktop_runtime: &Arc, +) -> bool { + match msg { + Some(Ok(Message::Text(text))) => { + let parsed: Value = match serde_json::from_str(&text) { + Ok(v) => v, + Err(_) => return true, + }; + + match parsed.get("type").and_then(|v| v.as_str()) { + // --- Input messages (fallback transport) --- + Some("moveMouse") => { + if let (Some(x), Some(y)) = ( + parsed.get("x").and_then(|v| v.as_i64()), + parsed.get("y").and_then(|v| v.as_i64()), + ) { + let _ = desktop_runtime + .move_mouse(DesktopMouseMoveRequest { + x: x as i32, + y: y as i32, + }) + .await; + } + } + Some("mouseDown") => { + let button = parsed + .get("button") + .and_then(|v| serde_json::from_value(v.clone()).ok()); + let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32); + let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32); + let _ = desktop_runtime + .mouse_down(DesktopMouseDownRequest { x, y, button }) + .await; + } + Some("mouseUp") => { + let button = parsed + .get("button") + .and_then(|v| serde_json::from_value(v.clone()).ok()); + let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32); + let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32); + let _ = desktop_runtime + .mouse_up(DesktopMouseUpRequest { x, y, button }) + .await; + } + Some("scroll") => { + if let (Some(x), Some(y)) = ( + parsed.get("x").and_then(|v| v.as_i64()), + parsed.get("y").and_then(|v| v.as_i64()), + ) { + let delta_x = parsed + .get("deltaX") + .and_then(|v| v.as_i64()) + .map(|v| v as i32); + let delta_y = parsed + .get("deltaY") + .and_then(|v| v.as_i64()) + .map(|v| v as i32); + let _ = desktop_runtime + .scroll_mouse(DesktopMouseScrollRequest { + x: x as i32, + y: y as i32, + delta_x, + delta_y, + }) + .await; + } + } + Some("keyDown") => { + if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) { + let _ = desktop_runtime + .key_down(DesktopKeyboardDownRequest { + key: key.to_string(), + }) + .await; + } + } + Some("keyUp") => { + if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) { + let _ = desktop_runtime + .key_up(DesktopKeyboardUpRequest { + key: key.to_string(), + }) + .await; + } + } + + // --- WebRTC signaling messages (accepted without error) --- + Some("answer") | Some("candidate") | Some("offer") => {} + + _ => {} + } + true + } + Some(Ok(Message::Ping(payload))) => { + let _ = ws.send(Message::Pong(payload)).await; + true + } + Some(Ok(Message::Close(_))) | None | Some(Err(_)) => false, + _ => true, + } +} + async fn send_ws_json(socket: &mut WebSocket, payload: Value) -> Result<(), ()> { socket .send(Message::Text( diff --git a/server/packages/sandbox-agent/tests/v1_api/desktop.rs b/server/packages/sandbox-agent/tests/v1_api/desktop.rs index 76d9389..7208e8a 100644 --- a/server/packages/sandbox-agent/tests/v1_api/desktop.rs +++ b/server/packages/sandbox-agent/tests/v1_api/desktop.rs @@ -432,7 +432,7 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() { assert_eq!(status, StatusCode::OK); assert_eq!(parse_json(&body)["active"], true); - let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/ws")) + let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/signaling")) .await .expect("connect desktop stream websocket"); @@ -447,12 +447,9 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() { other => panic!("expected text ready frame, got {other:?}"), } - let frame = recv_ws_message(&mut ws).await; - match frame { - Message::Binary(bytes) => assert!(bytes.starts_with(&[0xff, 0xd8, 0xff])), - other => panic!("expected binary jpeg frame, got {other:?}"), - } - + // The signaling WebSocket now accepts input frames as fallback transport + // (when the WebRTC data channel is not established). Send a mouse move to + // verify input dispatch still works over the signaling channel. ws.send(Message::Text( json!({ "type": "moveMouse", @@ -464,6 +461,20 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() { )) .await .expect("send desktop stream mouse move"); + + // Send a WebRTC signaling message (offer) to verify the signaling path + // accepts it without error. + ws.send(Message::Text( + json!({ + "type": "offer", + "sdp": "v=0\r\n" + }) + .to_string() + .into(), + )) + .await + .expect("send desktop stream offer"); + let _ = ws.close(None).await; let (status, _, body) = send_request(