feat: replace neko with native GStreamer WebRTC for desktop streaming

Replace the neko binary dependency with a native GStreamer pipeline
(ximagesrc -> vp8enc -> webrtcbin) for desktop video streaming. This
removes the external neko process and integrates screen capture directly
via gstreamer-rs crate bindings behind a `desktop-gstreamer` feature flag.

Key changes:
- Add desktop_gstreamer.rs with GStreamer WebRTC pipeline management
- Rewrite signaling protocol (ready/offer/answer/candidate over WS)
- Add leaky queues and videorate for low-latency streaming
- Rewrite ICE candidates to 127.0.0.1 for Docker connectivity
- Constrain UDP port range (30000-30100) via libnice agent
- Update TypeScript SDK desktop-stream.ts for new signaling
- Update inspector DesktopTab with WebRTC Live View
- Update Dockerfiles to install GStreamer dev packages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nathan Flurry 2026-03-16 17:54:39 -07:00
parent e638148345
commit 944ad1ba55
22 changed files with 1848 additions and 1170 deletions

View file

@ -33,6 +33,12 @@
- `docs/agent-capabilities.mdx` lists models/modes/thought levels per agent. Update it when adding a new agent or changing `fallback_config_options`. If its "Last updated" date is >2 weeks old, re-run `cd scripts/agent-configs && npx tsx dump.ts` and update the doc to match. Source data: `scripts/agent-configs/resources/*.json` and hardcoded entries in `server/packages/sandbox-agent/src/router/support.rs` (`fallback_config_options`).
- Some agent models are gated by subscription (e.g. Claude `opus`). The live report only shows models available to the current credentials. The static doc and JSON resource files should list all known models regardless of subscription tier.
## .context Directory
- The `.context/` directory is gitignored and used for temporary workspace files (plans, screenshots, build stamps, etc.).
- Never commit files from `.context/` to git.
- Clean up temporary files (screenshots, attachments, build artifacts) from `.context/` when they are no longer needed.
## Docker Test Image
- Docker-backed Rust and TypeScript tests build `docker/test-agent/Dockerfile` directly in-process and cache the image tag only in memory (`OnceLock` in Rust, module-level variable in TypeScript).

View file

@ -153,7 +153,15 @@ RUN apt-get update && apt-get install -y \
ca-certificates \
curl \
git \
ffmpeg && \
ffmpeg \
gstreamer1.0-tools \
gstreamer1.0-plugins-base \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
gstreamer1.0-plugins-ugly \
gstreamer1.0-nice \
gstreamer1.0-x \
libgstreamer1.0-0 && \
rm -rf /var/lib/apt/lists/*
# Copy the binary from builder

View file

@ -1,6 +1,16 @@
FROM rust:1.88.0-bookworm AS builder
WORKDIR /build
# Install GStreamer dev packages for the desktop-gstreamer feature.
RUN apt-get update -qq && \
apt-get install -y -qq --no-install-recommends \
libgstreamer1.0-dev \
libgstreamer-plugins-base1.0-dev \
libgstreamer-plugins-bad1.0-dev \
libnice-dev \
> /dev/null 2>&1 && \
rm -rf /var/lib/apt/lists/*
COPY Cargo.toml Cargo.lock ./
COPY server/ ./server/
COPY gigacode/ ./gigacode/
@ -12,7 +22,7 @@ ENV SANDBOX_AGENT_SKIP_INSPECTOR=1
RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/usr/local/cargo/git \
--mount=type=cache,target=/build/target \
cargo build -p sandbox-agent --release && \
cargo build -p sandbox-agent --release --features desktop-gstreamer && \
cp target/release/sandbox-agent /sandbox-agent
FROM node:22-bookworm-slim
@ -26,6 +36,15 @@ RUN apt-get update -qq && \
xdotool \
imagemagick \
ffmpeg \
gstreamer1.0-tools \
gstreamer1.0-plugins-base \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
gstreamer1.0-plugins-ugly \
gstreamer1.0-nice \
gstreamer1.0-x \
gstreamer1.0-pulseaudio \
libxcvt0 \
x11-xserver-utils \
dbus-x11 \
xauth \

File diff suppressed because it is too large Load diff

View file

@ -80,10 +80,6 @@ const DebugPanel = ({
<Monitor className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Desktop
</button>
<button className={`debug-tab ${debugTab === "mcp" ? "active" : ""}`} onClick={() => onDebugTabChange("mcp")}>
<Server className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
MCP
</button>
<button className={`debug-tab ${debugTab === "processes" ? "active" : ""}`} onClick={() => onDebugTabChange("processes")}>
<Terminal className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Processes
@ -92,6 +88,10 @@ const DebugPanel = ({
<Play className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Run Once
</button>
<button className={`debug-tab ${debugTab === "mcp" ? "active" : ""}`} onClick={() => onDebugTabChange("mcp")}>
<Server className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
MCP
</button>
<button className={`debug-tab ${debugTab === "skills" ? "active" : ""}`} onClick={() => onDebugTabChange("skills")}>
<Wrench className="button-icon" style={{ marginRight: 4, width: 12, height: 12 }} />
Skills
@ -117,13 +117,9 @@ const DebugPanel = ({
/>
)}
{debugTab === "desktop" && (
<DesktopTab getClient={getClient} />
)}
{debugTab === "desktop" && <DesktopTab getClient={getClient} />}
{debugTab === "mcp" && (
<McpTab getClient={getClient} />
)}
{debugTab === "mcp" && <McpTab getClient={getClient} />}
{debugTab === "processes" && <ProcessesTab getClient={getClient} />}

View file

@ -1,36 +1,43 @@
import { Loader2, Monitor, Play, RefreshCw, Square, Camera } from "lucide-react";
import { Camera, Circle, Download, Loader2, Monitor, Play, RefreshCw, Square, Trash2, Video } from "lucide-react";
import { useCallback, useEffect, useMemo, useState } from "react";
import { SandboxAgentError } from "sandbox-agent";
import type {
DesktopStatusResponse,
SandboxAgent,
} from "sandbox-agent";
import type { DesktopRecordingInfo, DesktopStatusResponse, SandboxAgent } from "sandbox-agent";
import { DesktopViewer } from "@sandbox-agent/react";
import type { DesktopViewerClient } from "@sandbox-agent/react";
const MIN_SPIN_MS = 350;
// Derive a user-facing message from an unknown thrown value: prefer the
// problem detail on SandboxAgentError, then Error.message, then the fallback.
const extractErrorMessage = (error: unknown, fallback: string): string => {
	if (error instanceof SandboxAgentError) {
		const detail = error.problem?.detail;
		if (detail) return detail;
	}
	return error instanceof Error ? error.message : fallback;
};
const formatStartedAt = (value: string | null | undefined): string => {
if (!value) {
return "Not started";
}
if (!value) return "Not started";
const parsed = new Date(value);
return Number.isNaN(parsed.getTime()) ? value : parsed.toLocaleString();
};
// Format a byte count as a short human-readable size ("500 B", "1.5 KB").
// Non-finite or non-positive inputs render as "0 B"; sizes of 1 TB and above
// are clamped to GB so we never index past the units table.
const formatBytes = (bytes: number): string => {
	// Guard: Math.log of 0/negative would yield -Infinity/NaN ("NaN undefined").
	if (!Number.isFinite(bytes) || bytes <= 0) return "0 B";
	const units = ["B", "KB", "MB", "GB"];
	// Clamp the magnitude index so >= 1024^4 bytes still map to a valid unit.
	const i = Math.min(Math.floor(Math.log(bytes) / Math.log(1024)), units.length - 1);
	return `${(bytes / 1024 ** i).toFixed(i > 0 ? 1 : 0)} ${units[i]}`;
};
// Human-readable elapsed time between two ISO timestamps ("45s" or "3m 20s").
// When `end` is absent/null the duration runs up to the current time;
// unparseable timestamps yield "Unknown".
const formatDuration = (start: string, end?: string | null): string => {
	const from = new Date(start).getTime();
	const to = end ? new Date(end).getTime() : Date.now();
	if (Number.isNaN(from) || Number.isNaN(to)) return "Unknown";
	const totalSeconds = Math.round((to - from) / 1000);
	if (totalSeconds < 60) return `${totalSeconds}s`;
	return `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
};
const createScreenshotUrl = async (bytes: Uint8Array): Promise<string> => {
const payload = new Uint8Array(bytes.byteLength);
payload.set(bytes);
const blob = new Blob([payload.buffer], { type: "image/png" });
if (typeof URL.createObjectURL === "function") {
return URL.createObjectURL(blob);
}
return await new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onerror = () => reject(reader.error ?? new Error("Unable to read screenshot blob."));
@ -44,26 +51,42 @@ const createScreenshotUrl = async (bytes: Uint8Array): Promise<string> => {
reader.readAsDataURL(blob);
});
};
const DesktopTab = ({
getClient,
}: {
getClient: () => SandboxAgent;
}) => {
const DesktopTab = ({ getClient }: { getClient: () => SandboxAgent }) => {
const [status, setStatus] = useState<DesktopStatusResponse | null>(null);
const [loading, setLoading] = useState(false);
const [refreshing, setRefreshing] = useState(false);
const [acting, setActing] = useState<"start" | "stop" | null>(null);
const [error, setError] = useState<string | null>(null);
const [width, setWidth] = useState("1440");
const [height, setHeight] = useState("900");
const [dpi, setDpi] = useState("96");
// Screenshot fallback
const [screenshotUrl, setScreenshotUrl] = useState<string | null>(null);
const [screenshotLoading, setScreenshotLoading] = useState(false);
const [screenshotError, setScreenshotError] = useState<string | null>(null);
// Live view
const [liveViewActive, setLiveViewActive] = useState(false);
const [liveViewError, setLiveViewError] = useState<string | null>(null);
// Memoize the client as a DesktopViewerClient so the reference is stable
// across renders and doesn't cause the DesktopViewer effect to re-fire.
const viewerClient = useMemo<DesktopViewerClient>(() => {
const c = getClient();
return {
startDesktopStream: () => c.startDesktopStream(),
stopDesktopStream: () => c.stopDesktopStream(),
connectDesktopStream: (opts?: Parameters<SandboxAgent["connectDesktopStream"]>[0]) => c.connectDesktopStream(opts),
};
}, [getClient]);
// Recording
const [recordings, setRecordings] = useState<DesktopRecordingInfo[]>([]);
const [recordingLoading, setRecordingLoading] = useState(false);
const [recordingActing, setRecordingActing] = useState<"start" | "stop" | null>(null);
const [recordingError, setRecordingError] = useState<string | null>(null);
const [recordingFps, setRecordingFps] = useState("30");
const [deletingRecordingId, setDeletingRecordingId] = useState<string | null>(null);
const [downloadingRecordingId, setDownloadingRecordingId] = useState<string | null>(null);
// Active recording tracking
const activeRecording = useMemo(() => recordings.find((r) => r.status === "recording"), [recordings]);
const revokeScreenshotUrl = useCallback(() => {
setScreenshotUrl((current) => {
if (current?.startsWith("blob:") && typeof URL.revokeObjectURL === "function") {
@ -72,27 +95,25 @@ const DesktopTab = ({
return null;
});
}, []);
const loadStatus = useCallback(async (mode: "initial" | "refresh" = "initial") => {
if (mode === "initial") {
setLoading(true);
} else {
setRefreshing(true);
}
setError(null);
try {
const next = await getClient().getDesktopStatus();
setStatus(next);
return next;
} catch (loadError) {
setError(extractErrorMessage(loadError, "Unable to load desktop status."));
return null;
} finally {
setLoading(false);
setRefreshing(false);
}
}, [getClient]);
// Fetch the desktop runtime status from the agent.
// "initial" drives the main loading spinner; "refresh" drives the refresh
// indicator. Returns the new status on success, or null when the request
// failed (the error banner is populated via setError in that case).
const loadStatus = useCallback(
async (mode: "initial" | "refresh" = "initial") => {
if (mode === "initial") setLoading(true);
else setRefreshing(true);
setError(null);
try {
const next = await getClient().getDesktopStatus();
setStatus(next);
return next;
} catch (loadError) {
setError(extractErrorMessage(loadError, "Unable to load desktop status."));
return null;
} finally {
// Clear both flags unconditionally — only one was set, clearing both is harmless.
setLoading(false);
setRefreshing(false);
}
},
[getClient],
);
const refreshScreenshot = useCallback(async () => {
setScreenshotLoading(true);
setScreenshotError(null);
@ -107,25 +128,38 @@ const DesktopTab = ({
setScreenshotLoading(false);
}
}, [getClient, revokeScreenshotUrl]);
// Fetch the list of desktop recordings from the agent, surfacing any
// failure in the recording error banner.
const loadRecordings = useCallback(async () => {
	setRecordingLoading(true);
	setRecordingError(null);
	try {
		const { recordings: next } = await getClient().listDesktopRecordings();
		setRecordings(next);
	} catch (listError) {
		setRecordingError(extractErrorMessage(listError, "Unable to load recordings."));
	} finally {
		setRecordingLoading(false);
	}
}, [getClient]);
useEffect(() => {
void loadStatus();
}, [loadStatus]);
useEffect(() => {
if (status?.state === "active") {
void refreshScreenshot();
void loadRecordings();
} else {
revokeScreenshotUrl();
setLiveViewActive(false);
}
}, [refreshScreenshot, revokeScreenshotUrl, status?.state]);
}, [status?.state, loadRecordings, revokeScreenshotUrl]);
useEffect(() => {
return () => {
revokeScreenshotUrl();
};
return () => revokeScreenshotUrl();
}, [revokeScreenshotUrl]);
// Poll recording list while a recording is active
useEffect(() => {
if (!activeRecording) return;
const interval = setInterval(() => void loadRecordings(), 3000);
return () => clearInterval(interval);
}, [activeRecording, loadRecordings]);
const handleStart = async () => {
const parsedWidth = Number.parseInt(width, 10);
const parsedHeight = Number.parseInt(height, 10);
@ -140,9 +174,6 @@ const DesktopTab = ({
dpi: Number.isFinite(parsedDpi) ? parsedDpi : undefined,
});
setStatus(next);
if (next.state === "active") {
await refreshScreenshot();
}
} catch (startError) {
setError(extractErrorMessage(startError, "Unable to start desktop runtime."));
await loadStatus("refresh");
@ -154,7 +185,6 @@ const DesktopTab = ({
setActing(null);
}
};
const handleStop = async () => {
setActing("stop");
setError(null);
@ -163,6 +193,7 @@ const DesktopTab = ({
const next = await getClient().stopDesktop();
setStatus(next);
revokeScreenshotUrl();
setLiveViewActive(false);
} catch (stopError) {
setError(extractErrorMessage(stopError, "Unable to stop desktop runtime."));
await loadStatus("refresh");
@ -174,62 +205,102 @@ const DesktopTab = ({
setActing(null);
}
};
// Begin a new desktop recording. The FPS input is forwarded only when it
// parses to a positive integer; otherwise the agent's default is used.
const handleStartRecording = async () => {
	const parsedFps = Number.parseInt(recordingFps, 10);
	const fpsOption = Number.isFinite(parsedFps) && parsedFps > 0 ? parsedFps : undefined;
	setRecordingActing("start");
	setRecordingError(null);
	try {
		await getClient().startDesktopRecording({ fps: fpsOption });
		await loadRecordings();
	} catch (startError) {
		setRecordingError(extractErrorMessage(startError, "Unable to start recording."));
	} finally {
		setRecordingActing(null);
	}
};
// Stop the in-progress recording, then refresh the list so the entry's
// final status/size are reflected.
const handleStopRecording = async () => {
	setRecordingActing("stop");
	setRecordingError(null);
	try {
		await getClient().stopDesktopRecording();
		await loadRecordings();
	} catch (stopError) {
		setRecordingError(extractErrorMessage(stopError, "Unable to stop recording."));
	} finally {
		setRecordingActing(null);
	}
};
// Delete a recording on the agent and, on success, drop it from local state
// without a full reload.
const handleDeleteRecording = async (id: string) => {
	setDeletingRecordingId(id);
	try {
		await getClient().deleteDesktopRecording(id);
		setRecordings((prev) => prev.filter((recording) => recording.id !== id));
	} catch (deleteError) {
		setRecordingError(extractErrorMessage(deleteError, "Unable to delete recording."));
	} finally {
		setDeletingRecordingId(null);
	}
};
// Download a completed recording as an MP4 by fetching its bytes and
// triggering a browser download via a temporary object URL + anchor element.
const handleDownloadRecording = async (id: string, fileName: string) => {
	setDownloadingRecordingId(id);
	try {
		const bytes = await getClient().downloadDesktopRecording(id);
		const blob = new Blob([bytes], { type: "video/mp4" });
		const url = URL.createObjectURL(blob);
		const a = document.createElement("a");
		a.href = url;
		a.download = fileName;
		document.body.appendChild(a);
		try {
			a.click();
		} finally {
			// Always detach the anchor and revoke the object URL, even if
			// click() throws — otherwise the blob URL (and its memory) leaks.
			document.body.removeChild(a);
			URL.revokeObjectURL(url);
		}
	} catch (err) {
		setRecordingError(extractErrorMessage(err, "Unable to download recording."));
	} finally {
		setDownloadingRecordingId(null);
	}
};
const canRefreshScreenshot = status?.state === "active";
const isActive = status?.state === "active";
const resolutionLabel = useMemo(() => {
const resolution = status?.resolution;
if (!resolution) return "Unknown";
const dpiLabel = resolution.dpi ? ` @ ${resolution.dpi} DPI` : "";
return `${resolution.width} x ${resolution.height}${dpiLabel}`;
}, [status?.resolution]);
return (
<div className="desktop-panel">
<div className="inline-row" style={{ marginBottom: 16 }}>
<button
className="button secondary small"
onClick={() => void loadStatus("refresh")}
disabled={loading || refreshing}
>
<button className="button secondary small" onClick={() => void loadStatus("refresh")} disabled={loading || refreshing}>
<RefreshCw className={`button-icon ${loading || refreshing ? "spinner-icon" : ""}`} />
Refresh Status
</button>
<button
className="button secondary small"
onClick={() => void refreshScreenshot()}
disabled={!canRefreshScreenshot || screenshotLoading}
>
{screenshotLoading ? (
<Loader2 className="button-icon spinner-icon" />
) : (
<Camera className="button-icon" />
)}
Refresh Screenshot
</button>
{isActive && !liveViewActive && (
<button className="button secondary small" onClick={() => void refreshScreenshot()} disabled={!canRefreshScreenshot || screenshotLoading}>
{screenshotLoading ? <Loader2 className="button-icon spinner-icon" /> : <Camera className="button-icon" />}
Screenshot
</button>
)}
</div>
{error && <div className="banner error">{error}</div>}
{screenshotError && <div className="banner error">{screenshotError}</div>}
{/* ========== Runtime Section ========== */}
<div className="card">
<div className="card-header">
<span className="card-title">
<Monitor size={14} style={{ marginRight: 6 }} />
Desktop Runtime
</span>
<span className={`pill ${
status?.state === "active"
? "success"
: status?.state === "install_required"
? "warning"
: status?.state === "failed"
? "danger"
: ""
}`}>
<span
className={`pill ${
status?.state === "active" ? "success" : status?.state === "install_required" ? "warning" : status?.state === "failed" ? "danger" : ""
}`}
>
{status?.state ?? "unknown"}
</span>
</div>
<div className="desktop-state-grid">
<div>
<div className="card-meta">Display</div>
@ -244,65 +315,35 @@ const DesktopTab = ({
<div>{formatStartedAt(status?.startedAt)}</div>
</div>
</div>
<div className="desktop-start-controls">
<div className="desktop-input-group">
<label className="label">Width</label>
<input
className="setup-input mono"
value={width}
onChange={(event) => setWidth(event.target.value)}
inputMode="numeric"
/>
<input className="setup-input mono" value={width} onChange={(event) => setWidth(event.target.value)} inputMode="numeric" />
</div>
<div className="desktop-input-group">
<label className="label">Height</label>
<input
className="setup-input mono"
value={height}
onChange={(event) => setHeight(event.target.value)}
inputMode="numeric"
/>
<input className="setup-input mono" value={height} onChange={(event) => setHeight(event.target.value)} inputMode="numeric" />
</div>
<div className="desktop-input-group">
<label className="label">DPI</label>
<input
className="setup-input mono"
value={dpi}
onChange={(event) => setDpi(event.target.value)}
inputMode="numeric"
/>
<input className="setup-input mono" value={dpi} onChange={(event) => setDpi(event.target.value)} inputMode="numeric" />
</div>
</div>
<div className="card-actions">
<button
className="button success small"
onClick={() => void handleStart()}
disabled={acting === "start"}
>
{acting === "start" ? (
<Loader2 className="button-icon spinner-icon" />
) : (
<Play className="button-icon" />
)}
Start Desktop
</button>
<button
className="button danger small"
onClick={() => void handleStop()}
disabled={acting === "stop"}
>
{acting === "stop" ? (
<Loader2 className="button-icon spinner-icon" />
) : (
<Square className="button-icon" />
)}
Stop Desktop
</button>
{isActive ? (
<button className="button danger small" onClick={() => void handleStop()} disabled={acting === "stop"}>
{acting === "stop" ? <Loader2 className="button-icon spinner-icon" /> : <Square className="button-icon" />}
Stop Desktop
</button>
) : (
<button className="button success small" onClick={() => void handleStart()} disabled={acting === "start"}>
{acting === "start" ? <Loader2 className="button-icon spinner-icon" /> : <Play className="button-icon" />}
Start Desktop
</button>
)}
</div>
</div>
{/* ========== Missing Dependencies ========== */}
{status?.missingDependencies && status.missingDependencies.length > 0 && (
<div className="card">
<div className="card-header">
@ -310,18 +351,188 @@ const DesktopTab = ({
</div>
<div className="desktop-chip-list">
{status.missingDependencies.map((dependency) => (
<span key={dependency} className="pill warning">{dependency}</span>
<span key={dependency} className="pill warning">
{dependency}
</span>
))}
</div>
{status.installCommand && (
<>
<div className="card-meta" style={{ marginTop: 12 }}>Install command</div>
<div className="card-meta" style={{ marginTop: 12 }}>
Install command
</div>
<div className="mono desktop-command">{status.installCommand}</div>
</>
)}
</div>
)}
{/* ========== Live View Section ========== */}
<div className="card">
<div className="card-header">
<span className="card-title">
<Video size={14} style={{ marginRight: 6 }} />
Live View
</span>
{isActive && (
<button
className={`button small ${liveViewActive ? "danger" : "success"}`}
onClick={(e) => {
e.stopPropagation();
if (liveViewActive) {
// Stop: close viewer then stop the stream process
setLiveViewActive(false);
void getClient()
.stopDesktopStream()
.catch(() => undefined);
} else {
setLiveViewActive(true);
}
}}
style={{ padding: "4px 10px", fontSize: 11 }}
>
{liveViewActive ? (
<>
<Square size={12} style={{ marginRight: 4 }} />
Stop Stream
</>
) : (
<>
<Play size={12} style={{ marginRight: 4 }} />
Start Stream
</>
)}
</button>
)}
</div>
{liveViewError && (
<div className="banner error" style={{ marginBottom: 8 }}>
{liveViewError}
</div>
)}
{!isActive && <div className="desktop-screenshot-empty">Start the desktop runtime to enable live view.</div>}
{isActive && liveViewActive && <DesktopViewer client={viewerClient} autoStart={true} showStatusBar={true} />}
{isActive && !liveViewActive && (
<>
{screenshotUrl ? (
<div className="desktop-screenshot-frame">
<img src={screenshotUrl} alt="Desktop screenshot" className="desktop-screenshot-image" />
</div>
) : (
<div className="desktop-screenshot-empty">Click "Start Stream" for live desktop view, or use the Screenshot button above.</div>
)}
</>
)}
</div>
{/* ========== Recording Section ========== */}
<div className="card">
<div className="card-header">
<span className="card-title">
<Circle size={14} style={{ marginRight: 6, fill: activeRecording ? "#ff3b30" : "none" }} />
Recording
</span>
{activeRecording && <span className="pill danger">Recording</span>}
</div>
{recordingError && (
<div className="banner error" style={{ marginBottom: 8 }}>
{recordingError}
</div>
)}
{!isActive && <div className="desktop-screenshot-empty">Start the desktop runtime to enable recording.</div>}
{isActive && (
<>
<div className="desktop-start-controls" style={{ gridTemplateColumns: "1fr" }}>
<div className="desktop-input-group">
<label className="label">FPS</label>
<input
className="setup-input mono"
value={recordingFps}
onChange={(e) => setRecordingFps(e.target.value)}
inputMode="numeric"
style={{ maxWidth: 80 }}
disabled={!!activeRecording}
/>
</div>
</div>
<div className="card-actions">
{!activeRecording ? (
<button className="button danger small" onClick={() => void handleStartRecording()} disabled={recordingActing === "start"}>
{recordingActing === "start" ? (
<Loader2 className="button-icon spinner-icon" />
) : (
<Circle size={14} className="button-icon" style={{ fill: "#ff3b30" }} />
)}
Start Recording
</button>
) : (
<button className="button secondary small" onClick={() => void handleStopRecording()} disabled={recordingActing === "stop"}>
{recordingActing === "stop" ? <Loader2 className="button-icon spinner-icon" /> : <Square className="button-icon" />}
Stop Recording
</button>
)}
<button className="button secondary small" onClick={() => void loadRecordings()} disabled={recordingLoading}>
<RefreshCw className={`button-icon ${recordingLoading ? "spinner-icon" : ""}`} />
Refresh
</button>
</div>
{recordings.length > 0 && (
<div className="desktop-process-list" style={{ marginTop: 12 }}>
{recordings.map((rec) => (
<div key={rec.id} className="desktop-process-item">
<div style={{ display: "flex", alignItems: "center", justifyContent: "space-between" }}>
<div>
<strong className="mono" style={{ fontSize: 12 }}>
{rec.fileName}
</strong>
<span
className={`pill ${rec.status === "recording" ? "danger" : rec.status === "completed" ? "success" : "warning"}`}
style={{ marginLeft: 8 }}
>
{rec.status}
</span>
</div>
{rec.status === "completed" && (
<div style={{ display: "flex", gap: 4 }}>
<button
className="button ghost small"
title="Download"
onClick={() => void handleDownloadRecording(rec.id, rec.fileName)}
disabled={downloadingRecordingId === rec.id}
style={{ padding: "4px 6px" }}
>
{downloadingRecordingId === rec.id ? <Loader2 size={14} className="spinner-icon" /> : <Download size={14} />}
</button>
<button
className="button ghost small"
title="Delete"
onClick={() => void handleDeleteRecording(rec.id)}
disabled={deletingRecordingId === rec.id}
style={{ padding: "4px 6px", color: "var(--danger)" }}
>
{deletingRecordingId === rec.id ? <Loader2 size={14} className="spinner-icon" /> : <Trash2 size={14} />}
</button>
</div>
)}
</div>
<div className="mono" style={{ fontSize: 11, color: "var(--muted)", marginTop: 4 }}>
{formatBytes(rec.bytes)}
{" \u00b7 "}
{formatDuration(rec.startedAt, rec.endedAt)}
{" \u00b7 "}
{formatStartedAt(rec.startedAt)}
</div>
</div>
))}
</div>
)}
{recordings.length === 0 && !recordingLoading && (
<div className="desktop-screenshot-empty" style={{ marginTop: 8 }}>
No recordings yet. Click "Start Recording" to begin.
</div>
)}
</>
)}
</div>
{/* ========== Diagnostics Section ========== */}
{(status?.lastError || status?.runtimeLogPath || (status?.processes?.length ?? 0) > 0) && (
<div className="card">
<div className="card-header">
@ -352,9 +563,7 @@ const DesktopTab = ({
{process.running ? "running" : "stopped"}
</span>
</div>
<div className="mono">
{process.pid ? `pid ${process.pid}` : "no pid"}
</div>
<div className="mono">{process.pid ? `pid ${process.pid}` : "no pid"}</div>
{process.logPath && <div className="mono">{process.logPath}</div>}
</div>
))}
@ -363,31 +572,7 @@ const DesktopTab = ({
)}
</div>
)}
<div className="card">
<div className="card-header">
<span className="card-title">Latest Screenshot</span>
{status?.state === "active" ? (
<span className="card-meta">Manual refresh only</span>
) : null}
</div>
{loading ? <div className="card-meta">Loading...</div> : null}
{!loading && !screenshotUrl && (
<div className="desktop-screenshot-empty">
{status?.state === "active"
? "No screenshot loaded yet."
: "Start the desktop runtime to capture a screenshot."}
</div>
)}
{screenshotUrl && (
<div className="desktop-screenshot-frame">
<img src={screenshotUrl} alt="Desktop screenshot" className="desktop-screenshot-image" />
</div>
)}
</div>
</div>
);
};
export default DesktopTab;

View file

@ -8,7 +8,7 @@ export default defineConfig(({ command }) => ({
port: 5173,
proxy: {
"/v1": {
target: "http://localhost:2468",
target: process.env.SANDBOX_AGENT_URL || "http://localhost:2468",
changeOrigin: true,
ws: true,
},

View file

@ -76,6 +76,26 @@ run-gigacode *ARGS:
dev-docs:
cd docs && pnpm dlx mintlify dev --host 0.0.0.0
# Start the desktop dev stack (sandbox-agent backend in Docker + inspector frontend).
# Runs detached (-d) and forces a rebuild/recreate of all services.
[group('server')]
server-dev:
docker compose -f server/compose.dev.yaml up --build --force-recreate -d
# Stop the desktop dev stack and remove its containers.
[group('server')]
server-dev-down:
docker compose -f server/compose.dev.yaml down
# Tail desktop dev stack logs (last 200 lines, then follow); extra args pass through to docker compose.
[group('server')]
server-dev-logs *ARGS:
docker compose -f server/compose.dev.yaml logs -f --tail=200 {{ ARGS }}
# Rebuild and restart only the backend container, leaving other services running.
[group('server')]
server-dev-restart-backend:
docker compose -f server/compose.dev.yaml up --build --force-recreate -d backend
install:
pnpm install
pnpm build --filter @sandbox-agent/inspector...

View file

@ -2,26 +2,19 @@
import type { CSSProperties, MouseEvent, WheelEvent } from "react";
import { useEffect, useRef, useState } from "react";
import type {
DesktopMouseButton,
DesktopStreamErrorStatus,
DesktopStreamReadyStatus,
SandboxAgent,
} from "sandbox-agent";
import type { DesktopMouseButton, DesktopStreamErrorStatus, DesktopStreamReadyStatus, DesktopStreamSession, SandboxAgent } from "sandbox-agent";
type ConnectionState = "connecting" | "ready" | "closed" | "error";
export type DesktopViewerClient = Pick<
SandboxAgent,
"startDesktopStream" | "stopDesktopStream" | "connectDesktopStream"
>;
export type DesktopViewerClient = Pick<SandboxAgent, "startDesktopStream" | "stopDesktopStream" | "connectDesktopStream">;
export interface DesktopViewerProps {
client: DesktopViewerClient;
className?: string;
style?: CSSProperties;
imageStyle?: CSSProperties;
height?: number | string;
autoStart?: boolean;
showStatusBar?: boolean;
tabIndex?: number;
onConnect?: (status: DesktopStreamReadyStatus) => void;
onDisconnect?: () => void;
onError?: (error: DesktopStreamErrorStatus | Error) => void;
@ -31,11 +24,7 @@ const shellStyle: CSSProperties = {
display: "flex",
flexDirection: "column",
overflow: "hidden",
border: "1px solid rgba(15, 23, 42, 0.14)",
borderRadius: 14,
background:
"linear-gradient(180deg, rgba(248, 250, 252, 0.96) 0%, rgba(226, 232, 240, 0.92) 100%)",
boxShadow: "0 20px 40px rgba(15, 23, 42, 0.08)",
width: "100%",
};
const statusBarStyle: CSSProperties = {
@ -44,28 +33,22 @@ const statusBarStyle: CSSProperties = {
justifyContent: "space-between",
gap: 12,
padding: "10px 14px",
borderBottom: "1px solid rgba(15, 23, 42, 0.08)",
background: "rgba(255, 255, 255, 0.78)",
color: "#0f172a",
fontSize: 12,
lineHeight: 1.4,
};
const viewportStyle: CSSProperties = {
position: "relative",
display: "flex",
alignItems: "center",
justifyContent: "center",
width: "100%",
overflow: "hidden",
background:
"radial-gradient(circle at top, rgba(14, 165, 233, 0.18), transparent 45%), linear-gradient(180deg, #0f172a 0%, #111827 100%)",
background: "#000",
outline: "none",
};
const imageBaseStyle: CSSProperties = {
const videoBaseStyle: CSSProperties = {
display: "block",
width: "100%",
height: "100%",
objectFit: "contain",
height: "auto",
userSelect: "none",
};
@ -90,90 +73,96 @@ export const DesktopViewer = ({
client,
className,
style,
imageStyle,
height = 480,
autoStart = true,
showStatusBar = true,
tabIndex = 0,
onConnect,
onDisconnect,
onError,
}: DesktopViewerProps) => {
const videoRef = useRef<HTMLVideoElement | null>(null);
const wrapperRef = useRef<HTMLDivElement | null>(null);
const sessionRef = useRef<ReturnType<DesktopViewerClient["connectDesktopStream"]> | null>(null);
const [connectionState, setConnectionState] = useState<ConnectionState>("connecting");
const [statusMessage, setStatusMessage] = useState("Starting desktop stream...");
const [frameUrl, setFrameUrl] = useState<string | null>(null);
const sessionRef = useRef<DesktopStreamSession | null>(null);
const [connectionState, setConnectionState] = useState<ConnectionState>(autoStart ? "connecting" : "closed");
const [statusMessage, setStatusMessage] = useState(autoStart ? "Starting desktop stream..." : "Stream not started.");
const [resolution, setResolution] = useState<{ width: number; height: number } | null>(null);
useEffect(() => {
let cancelled = false;
let lastObjectUrl: string | null = null;
let session: ReturnType<DesktopViewerClient["connectDesktopStream"]> | null = null;
// Store callbacks and client in refs to keep them out of the effect deps.
const onConnectRef = useRef(onConnect);
onConnectRef.current = onConnect;
const onDisconnectRef = useRef(onDisconnect);
onDisconnectRef.current = onDisconnect;
const onErrorRef = useRef(onError);
onErrorRef.current = onError;
const clientRef = useRef(client);
clientRef.current = client;
useEffect(() => {
if (!autoStart) {
setConnectionState("closed");
setStatusMessage("Stream not started.");
return;
}
let cancelled = false;
setConnectionState("connecting");
setStatusMessage("Starting desktop stream...");
setResolution(null);
const cl = clientRef.current;
const connect = async () => {
try {
await client.startDesktopStream();
if (cancelled) {
return;
}
await cl.startDesktopStream();
if (cancelled) return;
session = client.connectDesktopStream();
const session = cl.connectDesktopStream();
sessionRef.current = session;
session.onReady((status) => {
if (cancelled) {
return;
if (cancelled) return;
setResolution({ width: status.width, height: status.height });
setStatusMessage("Negotiating WebRTC...");
onConnectRef.current?.(status);
});
session.onTrack((stream) => {
if (cancelled) return;
if (videoRef.current) {
videoRef.current.srcObject = stream;
}
setConnectionState("ready");
setStatusMessage("Desktop stream connected.");
setResolution({ width: status.width, height: status.height });
onConnect?.(status);
// Grab keyboard focus when connected.
wrapperRef.current?.focus();
});
session.onFrame((frame) => {
if (cancelled) {
return;
}
const nextUrl = URL.createObjectURL(
new Blob([frame.slice().buffer], { type: "image/jpeg" }),
);
setFrameUrl((current) => {
if (current) {
URL.revokeObjectURL(current);
}
return nextUrl;
});
if (lastObjectUrl) {
URL.revokeObjectURL(lastObjectUrl);
}
lastObjectUrl = nextUrl;
session.onConnect(() => {
if (cancelled) return;
setConnectionState("ready");
setStatusMessage("Desktop stream connected.");
wrapperRef.current?.focus();
});
session.onError((error) => {
if (cancelled) {
return;
}
if (cancelled) return;
setConnectionState("error");
setStatusMessage(error instanceof Error ? error.message : error.message);
onError?.(error);
onErrorRef.current?.(error);
});
session.onClose(() => {
if (cancelled) {
return;
}
setConnectionState((current) => (current === "error" ? current : "closed"));
setStatusMessage((current) =>
current === "Desktop stream connected." ? "Desktop stream disconnected." : current,
);
onDisconnect?.();
session.onDisconnect(() => {
if (cancelled) return;
setConnectionState((cur) => (cur === "error" ? cur : "closed"));
setStatusMessage((cur) => (cur === "Desktop stream connected." ? "Desktop stream disconnected." : cur));
onDisconnectRef.current?.();
});
} catch (error) {
if (cancelled) {
return;
}
const nextError = error instanceof Error ? error : new Error("Failed to initialize desktop stream.");
if (cancelled) return;
const nextError = error instanceof Error ? error : new Error("Failed to start desktop stream.");
setConnectionState("error");
setStatusMessage(nextError.message);
onError?.(nextError);
onErrorRef.current?.(nextError);
}
};
@ -181,36 +170,28 @@ export const DesktopViewer = ({
return () => {
cancelled = true;
session?.close();
sessionRef.current = null;
void client.stopDesktopStream().catch(() => undefined);
setFrameUrl((current) => {
if (current) {
URL.revokeObjectURL(current);
}
return null;
});
if (lastObjectUrl) {
URL.revokeObjectURL(lastObjectUrl);
const session = sessionRef.current;
if (session) {
session.close();
sessionRef.current = null;
}
if (videoRef.current) {
videoRef.current.srcObject = null;
}
// Note: we do NOT call stopDesktopStream() here. The parent component
// manages the stream lifecycle. Calling stop on unmount would kill the
// streaming process and race with subsequent mounts.
};
}, [client, onConnect, onDisconnect, onError]);
}, [autoStart]);
// Map viewport-relative pointer coordinates onto the remote desktop's native
// resolution. Returns null while the video element or the stream resolution is
// unknown, or when the element has zero size (e.g. not laid out yet).
// Fix: removed the interleaved remnants of the old wrapper-based
// implementation that was replaced by the <video>-based one.
const scalePoint = (clientX: number, clientY: number) => {
  const video = videoRef.current;
  if (!video || !resolution) return null;
  const rect = video.getBoundingClientRect();
  if (rect.width === 0 || rect.height === 0) return null;
  // Clamp into [0, width]/[0, height] so drags that leave the element still
  // produce in-bounds desktop coordinates.
  const x = Math.max(0, Math.min(resolution.width, ((clientX - rect.left) / rect.width) * resolution.width));
  const y = Math.max(0, Math.min(resolution.height, ((clientY - rect.top) / rect.height) * resolution.height));
  return { x: Math.round(x), y: Math.round(y) };
};
const buttonFromMouseEvent = (event: MouseEvent<HTMLDivElement>): DesktopMouseButton => {
@ -224,64 +205,60 @@ export const DesktopViewer = ({
}
};
const withSession = (
callback: (session: NonNullable<ReturnType<DesktopViewerClient["connectDesktopStream"]>>) => void,
) => {
const session = sessionRef.current;
if (session) {
callback(session);
}
const withSession = (fn: (s: DesktopStreamSession) => void) => {
const s = sessionRef.current;
if (s) fn(s);
};
return (
<div className={className} style={{ ...shellStyle, ...style }}>
<div style={statusBarStyle}>
<span style={{ color: getStatusColor(connectionState) }}>{statusMessage}</span>
<span style={hintStyle}>
{resolution ? `${resolution.width}×${resolution.height}` : "Awaiting frames"}
</span>
</div>
{showStatusBar && (
<div style={statusBarStyle}>
<span style={{ color: getStatusColor(connectionState) }}>{statusMessage}</span>
<span style={hintStyle}>{resolution ? `${resolution.width}\u00d7${resolution.height}` : "Awaiting stream"}</span>
</div>
)}
<div
ref={wrapperRef}
role="button"
tabIndex={0}
style={{ ...viewportStyle, height }}
role="application"
tabIndex={tabIndex}
style={viewportStyle}
onMouseMove={(event) => {
const point = scalePoint(event.clientX, event.clientY);
if (!point) {
return;
}
withSession((session) => session.moveMouse(point.x, point.y));
if (!point) return;
withSession((s) => s.moveMouse(point.x, point.y));
}}
onMouseDown={(event) => {
event.preventDefault();
// Ensure keyboard focus stays on the viewport when clicking.
wrapperRef.current?.focus();
const point = scalePoint(event.clientX, event.clientY);
withSession((session) =>
session.mouseDown(buttonFromMouseEvent(event), point?.x, point?.y),
);
if (!point) return;
withSession((s) => s.mouseDown(buttonFromMouseEvent(event), point.x, point.y));
}}
onMouseUp={(event) => {
const point = scalePoint(event.clientX, event.clientY);
withSession((session) => session.mouseUp(buttonFromMouseEvent(event), point?.x, point?.y));
if (!point) return;
withSession((s) => s.mouseUp(buttonFromMouseEvent(event), point.x, point.y));
}}
onWheel={(event: WheelEvent<HTMLDivElement>) => {
event.preventDefault();
const point = scalePoint(event.clientX, event.clientY);
if (!point) {
return;
}
withSession((session) => session.scroll(point.x, point.y, Math.round(event.deltaX), Math.round(event.deltaY)));
if (!point) return;
withSession((s) => s.scroll(point.x, point.y, Math.round(event.deltaX), Math.round(event.deltaY)));
}}
onKeyDown={(event) => {
withSession((session) => session.keyDown(event.key));
event.preventDefault();
event.stopPropagation();
withSession((s) => s.keyDown(event.key));
}}
onKeyUp={(event) => {
withSession((session) => session.keyUp(event.key));
event.stopPropagation();
withSession((s) => s.keyUp(event.key));
}}
onContextMenu={(event) => event.preventDefault()}
>
{frameUrl ? (
<img alt="Desktop stream" draggable={false} src={frameUrl} style={{ ...imageBaseStyle, ...imageStyle }} />
) : null}
<video ref={videoRef} autoPlay playsInline muted style={videoBaseStyle} />
</div>
</div>
);

View file

@ -23,10 +23,7 @@ import {
type SetSessionModeRequest,
} from "acp-http-client";
import type { SandboxAgentSpawnHandle, SandboxAgentSpawnOptions } from "./spawn.ts";
import {
DesktopStreamSession,
type DesktopStreamConnectOptions,
} from "./desktop-stream.ts";
import { DesktopStreamSession, type DesktopStreamConnectOptions } from "./desktop-stream.ts";
import {
type AcpServerListResponse,
type AgentInfo,
@ -1530,9 +1527,7 @@ export class SandboxAgent {
return this.requestJson("GET", `${API_PREFIX}/desktop/windows`);
}
async startDesktopRecording(
request: DesktopRecordingStartRequest = {},
): Promise<DesktopRecordingInfo> {
async startDesktopRecording(request: DesktopRecordingStartRequest = {}): Promise<DesktopRecordingInfo> {
return this.requestJson("POST", `${API_PREFIX}/desktop/recording/start`, {
body: request,
});
@ -1551,13 +1546,9 @@ export class SandboxAgent {
}
async downloadDesktopRecording(id: string): Promise<Uint8Array> {
const response = await this.requestRaw(
"GET",
`${API_PREFIX}/desktop/recordings/${encodeURIComponent(id)}/download`,
{
accept: "video/mp4",
},
);
const response = await this.requestRaw("GET", `${API_PREFIX}/desktop/recordings/${encodeURIComponent(id)}/download`, {
accept: "video/mp4",
});
const buffer = await response.arrayBuffer();
return new Uint8Array(buffer);
}
@ -1799,7 +1790,7 @@ export class SandboxAgent {
buildDesktopStreamWebSocketUrl(options: ProcessTerminalWebSocketUrlOptions = {}): string {
return toWebSocketUrl(
this.buildUrl(`${API_PREFIX}/desktop/stream/ws`, {
this.buildUrl(`${API_PREFIX}/desktop/stream/signaling`, {
access_token: options.accessToken ?? this.token,
}),
);
@ -1820,7 +1811,7 @@ export class SandboxAgent {
}
connectDesktopStream(options: DesktopStreamSessionOptions = {}): DesktopStreamSession {
return new DesktopStreamSession(this.connectDesktopStreamWebSocket(options));
return new DesktopStreamSession(this.connectDesktopStreamWebSocket(options), options);
}
private async getLiveConnection(agent: string): Promise<LiveAcpConnection> {

View file

@ -1,6 +1,5 @@
import type { DesktopMouseButton } from "./types.ts";
const WS_READY_STATE_CONNECTING = 0;
const WS_READY_STATE_OPEN = 1;
const WS_READY_STATE_CLOSED = 3;
@ -21,63 +20,140 @@ export interface DesktopStreamConnectOptions {
accessToken?: string;
WebSocket?: typeof WebSocket;
protocols?: string | string[];
RTCPeerConnection?: typeof RTCPeerConnection;
rtcConfig?: RTCConfiguration;
}
type DesktopStreamClientFrame =
| {
type: "moveMouse";
x: number;
y: number;
}
| {
type: "mouseDown" | "mouseUp";
x?: number;
y?: number;
button?: DesktopMouseButton;
}
| {
type: "scroll";
x: number;
y: number;
deltaX?: number;
deltaY?: number;
}
| {
type: "keyDown" | "keyUp";
key: string;
}
| {
type: "close";
};
/**
* Data channel binary input protocol (Big Endian).
*
* Byte 0: opcode
* 0x01 = mouse_move (bytes 1-2: u16 BE x, bytes 3-4: u16 BE y)
* 0x02 = mouse_down (byte 1: u8 button)
* 0x03 = mouse_up (byte 1: u8 button)
* 0x04 = mouse_scroll (bytes 1-2: i16 BE dx, bytes 3-4: i16 BE dy)
* 0x05 = key_down (bytes 1-4: u32 BE keysym)
* 0x06 = key_up (bytes 1-4: u32 BE keysym)
*/
const OP_MOUSE_MOVE = 0x01;
const OP_MOUSE_DOWN = 0x02;
const OP_MOUSE_UP = 0x03;
const OP_MOUSE_SCROLL = 0x04;
const OP_KEY_DOWN = 0x05;
const OP_KEY_UP = 0x06;
/**
 * Map a logical mouse button name to its X11 button number.
 * Left — or an unspecified button — is 1, middle is 2, right is 3.
 */
function mouseButtonToX11(button?: DesktopMouseButton): number {
  if (button === "middle") return 2;
  if (button === "right") return 3;
  return 1;
}
/**
 * Translate a DOM `KeyboardEvent.key` value into an X11 keysym.
 *
 * Single printable ASCII characters map directly to their code point; other
 * single characters use the X11 Unicode keysym range (0x01000000 + codepoint).
 * Named keys ("Enter", "ArrowLeft", …) use a lookup table. Returns 0 for keys
 * with no known mapping, which callers use to skip sending the event.
 *
 * Fix: measure "single character" in Unicode code points (not UTF-16 units)
 * so astral-plane characters (e.g. emoji) map into the Unicode keysym range
 * instead of falling through to the named-key table and returning 0.
 */
function keyToX11Keysym(key: string): number {
  const codePoints = Array.from(key);
  if (codePoints.length === 1) {
    const cp = codePoints[0].codePointAt(0) ?? 0;
    if (cp >= 0x20 && cp <= 0x7e) return cp;
    // X11 Unicode keysym range: keysym = 0x01000000 | codepoint.
    return 0x01000000 + cp;
  }
  const map: Record<string, number> = {
    Backspace: 0xff08,
    Tab: 0xff09,
    Return: 0xff0d,
    Enter: 0xff0d,
    Escape: 0xff1b,
    Delete: 0xffff,
    Home: 0xff50,
    Left: 0xff51,
    ArrowLeft: 0xff51,
    Up: 0xff52,
    ArrowUp: 0xff52,
    Right: 0xff53,
    ArrowRight: 0xff53,
    Down: 0xff54,
    ArrowDown: 0xff54,
    PageUp: 0xff55,
    PageDown: 0xff56,
    End: 0xff57,
    Insert: 0xff63,
    F1: 0xffbe,
    F2: 0xffbf,
    F3: 0xffc0,
    F4: 0xffc1,
    F5: 0xffc2,
    F6: 0xffc3,
    F7: 0xffc4,
    F8: 0xffc5,
    F9: 0xffc6,
    F10: 0xffc7,
    F11: 0xffc8,
    F12: 0xffc9,
    Shift: 0xffe1,
    ShiftLeft: 0xffe1,
    ShiftRight: 0xffe2,
    Control: 0xffe3,
    ControlLeft: 0xffe3,
    ControlRight: 0xffe4,
    Alt: 0xffe9,
    AltLeft: 0xffe9,
    AltRight: 0xffea,
    Meta: 0xffeb,
    MetaLeft: 0xffeb,
    MetaRight: 0xffec,
    CapsLock: 0xffe5,
    NumLock: 0xff7f,
    ScrollLock: 0xff14,
    " ": 0x0020,
    Space: 0x0020,
  };
  return map[key] ?? 0;
}
export class DesktopStreamSession {
readonly socket: WebSocket;
readonly closed: Promise<void>;
private pc: RTCPeerConnection | null = null;
private dataChannel: RTCDataChannel | null = null;
private mediaStream: MediaStream | null = null;
private connected = false;
private pendingCandidates: Record<string, unknown>[] = [];
private cachedReadyStatus: DesktopStreamReadyStatus | null = null;
private readonly readyListeners = new Set<(status: DesktopStreamReadyStatus) => void>();
private readonly frameListeners = new Set<(frame: Uint8Array) => void>();
private readonly trackListeners = new Set<(stream: MediaStream) => void>();
private readonly connectListeners = new Set<() => void>();
private readonly disconnectListeners = new Set<() => void>();
private readonly errorListeners = new Set<(error: DesktopStreamErrorStatus | Error) => void>();
private readonly closeListeners = new Set<() => void>();
private closeSignalSent = false;
private closedResolve!: () => void;
private readonly PeerConnection: typeof RTCPeerConnection;
private readonly rtcConfig: RTCConfiguration;
constructor(socket: WebSocket) {
constructor(socket: WebSocket, options: DesktopStreamConnectOptions = {}) {
this.socket = socket;
this.socket.binaryType = "arraybuffer";
this.PeerConnection = options.RTCPeerConnection ?? globalThis.RTCPeerConnection;
this.rtcConfig = options.rtcConfig ?? {};
this.closed = new Promise<void>((resolve) => {
this.closedResolve = resolve;
});
this.socket.addEventListener("message", (event) => {
void this.handleMessage(event.data);
this.handleMessage(event.data as string);
});
this.socket.addEventListener("error", () => {
this.emitError(new Error("Desktop stream websocket connection failed."));
this.emitError(new Error("Desktop stream signaling connection failed."));
});
this.socket.addEventListener("close", () => {
this.teardownPeerConnection();
this.closedResolve();
for (const listener of this.closeListeners) {
for (const listener of this.disconnectListeners) {
listener();
}
});
@ -85,15 +161,35 @@ export class DesktopStreamSession {
onReady(listener: (status: DesktopStreamReadyStatus) => void): () => void {
this.readyListeners.add(listener);
if (this.cachedReadyStatus) {
listener(this.cachedReadyStatus);
}
return () => {
this.readyListeners.delete(listener);
};
}
onFrame(listener: (frame: Uint8Array) => void): () => void {
this.frameListeners.add(listener);
onTrack(listener: (stream: MediaStream) => void): () => void {
this.trackListeners.add(listener);
if (this.mediaStream) {
listener(this.mediaStream);
}
return () => {
this.frameListeners.delete(listener);
this.trackListeners.delete(listener);
};
}
/**
 * Subscribe to the WebRTC "connected" event. Returns an unsubscribe function.
 */
onConnect(listener: () => void): () => void {
  const listeners = this.connectListeners;
  listeners.add(listener);
  return () => void listeners.delete(listener);
}
/**
 * Subscribe to session disconnect (signaling socket closed).
 * Returns an unsubscribe function.
 */
onDisconnect(listener: () => void): () => void {
  const listeners = this.disconnectListeners;
  listeners.add(listener);
  return () => void listeners.delete(listener);
}
@ -104,97 +200,313 @@ export class DesktopStreamSession {
};
}
/** @deprecated Use onDisconnect instead. */
onClose(listener: () => void): () => void {
  // Fix: removed the old `closeListeners` body that was merged in ahead of
  // this delegation, which made the new return statement unreachable.
  return this.onDisconnect(listener);
}

/** @deprecated No longer emits JPEG frames. Use onTrack for WebRTC media. */
onFrame(_listener: (frame: Uint8Array) => void): () => void {
  // Kept only for API compatibility; returns a no-op unsubscriber.
  return () => {};
}
/**
 * The most recently received remote MediaStream, or null before the first
 * track arrives or after the peer connection is torn down.
 */
getMediaStream(): MediaStream | null {
  return this.mediaStream;
}
/**
 * Move the remote pointer to absolute desktop coordinates.
 * Uses the binary data-channel protocol when available; falls back to the
 * signaling WebSocket otherwise.
 * Fix: removed a stray call to the deleted legacy `sendFrame` method that
 * would have sent the input twice (and references a method that no longer
 * exists).
 */
moveMouse(x: number, y: number): void {
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, u16 BE x, u16 BE y (see protocol comment above).
    const buf = new ArrayBuffer(5);
    const view = new DataView(buf);
    view.setUint8(0, OP_MOUSE_MOVE);
    view.setUint16(1, x, false);
    view.setUint16(3, y, false);
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("moveMouse", { x, y });
  }
}
/**
 * Press a mouse button, optionally moving the pointer first when coordinates
 * are supplied. Uses the data channel when open, else the signaling socket.
 * Fix: removed the leftover `sendFrame` call from the removed JPEG-stream
 * protocol.
 */
mouseDown(button?: DesktopMouseButton, x?: number, y?: number): void {
  if (x != null && y != null) {
    this.moveMouse(x, y);
  }
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, u8 X11 button number.
    const buf = new ArrayBuffer(2);
    const view = new DataView(buf);
    view.setUint8(0, OP_MOUSE_DOWN);
    view.setUint8(1, mouseButtonToX11(button));
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("mouseDown", { button, x, y });
  }
}
/**
 * Release a mouse button, optionally moving the pointer first when
 * coordinates are supplied. Uses the data channel when open, else the
 * signaling socket.
 * Fix: removed the leftover `sendFrame` call from the removed JPEG-stream
 * protocol.
 */
mouseUp(button?: DesktopMouseButton, x?: number, y?: number): void {
  if (x != null && y != null) {
    this.moveMouse(x, y);
  }
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, u8 X11 button number.
    const buf = new ArrayBuffer(2);
    const view = new DataView(buf);
    view.setUint8(0, OP_MOUSE_UP);
    view.setUint8(1, mouseButtonToX11(button));
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("mouseUp", { button, x, y });
  }
}
/**
 * Scroll at the given desktop coordinates. The pointer is moved first so the
 * scroll lands on the intended element. Uses the data channel when open,
 * else the signaling socket.
 * Fix: removed the leftover `sendFrame` call from the removed JPEG-stream
 * protocol.
 */
scroll(x: number, y: number, deltaX?: number, deltaY?: number): void {
  this.moveMouse(x, y);
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, i16 BE dx, i16 BE dy.
    const buf = new ArrayBuffer(5);
    const view = new DataView(buf);
    view.setUint8(0, OP_MOUSE_SCROLL);
    view.setInt16(1, deltaX ?? 0, false);
    view.setInt16(3, deltaY ?? 0, false);
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("scroll", { x, y, deltaX, deltaY });
  }
}
/**
 * Press a key identified by its DOM `KeyboardEvent.key` value. Keys with no
 * known X11 keysym mapping are silently dropped. Uses the data channel when
 * open, else the signaling socket.
 * Fix: removed the leftover `sendFrame` call from the removed JPEG-stream
 * protocol.
 */
keyDown(key: string): void {
  const keysym = keyToX11Keysym(key);
  if (keysym === 0) return;
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, u32 BE keysym.
    const buf = new ArrayBuffer(5);
    const view = new DataView(buf);
    view.setUint8(0, OP_KEY_DOWN);
    view.setUint32(1, keysym, false);
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("keyDown", { key });
  }
}
/**
 * Release a key identified by its DOM `KeyboardEvent.key` value. Keys with no
 * known X11 keysym mapping are silently dropped. Uses the data channel when
 * open, else the signaling socket.
 * Fix: removed the leftover `sendFrame` call from the removed JPEG-stream
 * protocol.
 */
keyUp(key: string): void {
  const keysym = keyToX11Keysym(key);
  if (keysym === 0) return;
  if (this.dataChannel?.readyState === "open") {
    // Binary layout: opcode, u32 BE keysym.
    const buf = new ArrayBuffer(5);
    const view = new DataView(buf);
    view.setUint8(0, OP_KEY_UP);
    view.setUint32(1, keysym, false);
    this.dataChannel.send(buf);
  } else {
    this.sendSignaling("keyUp", { key });
  }
}
close(): void {
if (this.socket.readyState === WS_READY_STATE_CONNECTING) {
this.socket.addEventListener(
"open",
() => {
this.close();
},
{ once: true },
);
return;
}
if (this.socket.readyState === WS_READY_STATE_OPEN) {
if (!this.closeSignalSent) {
this.closeSignalSent = true;
this.sendFrame({ type: "close" });
}
this.socket.close();
return;
}
this.teardownPeerConnection();
if (this.socket.readyState !== WS_READY_STATE_CLOSED) {
this.socket.close();
}
}
private async handleMessage(data: unknown): Promise<void> {
private handleMessage(data: string): void {
let msg: Record<string, unknown>;
try {
if (typeof data === "string") {
const frame = parseStatusFrame(data);
if (!frame) {
this.emitError(new Error("Received invalid desktop stream control frame."));
return;
}
msg = JSON.parse(data) as Record<string, unknown>;
} catch {
return;
}
if (frame.type === "ready") {
for (const listener of this.readyListeners) {
listener(frame);
}
return;
}
const type = (msg.type as string) ?? "";
this.emitError(frame);
return;
switch (type) {
case "ready": {
const status: DesktopStreamReadyStatus = {
type: "ready",
width: Number(msg.width) || 0,
height: Number(msg.height) || 0,
};
this.cachedReadyStatus = status;
for (const listener of this.readyListeners) {
listener(status);
}
break;
}
const bytes = await decodeBinaryFrame(data);
for (const listener of this.frameListeners) {
listener(bytes);
case "offer": {
if (msg.sdp) {
void this.handleOffer(msg.sdp as string);
}
break;
}
case "candidate": {
void this.handleCandidate(msg as unknown as RTCIceCandidateInit);
break;
}
case "error": {
const errorStatus: DesktopStreamErrorStatus = {
type: "error",
message: (msg.message as string) ?? "Unknown error",
};
this.emitError(errorStatus);
break;
}
default:
break;
}
}
/**
 * Handle an SDP offer from the server: build the RTCPeerConnection, wire up
 * track/ICE/data-channel handlers, apply the remote offer, flush queued ICE
 * candidates, and send back an SDP answer over the signaling socket.
 * Any failure is surfaced through emitError rather than thrown.
 */
private async handleOffer(sdp: string): Promise<void> {
  try {
    // Default to a public STUN server unless the caller supplied iceServers.
    const config: RTCConfiguration = {
      ...this.rtcConfig,
      iceServers: this.rtcConfig.iceServers ?? [{ urls: "stun:stun.l.google.com:19302" }],
    };
    const pc = new this.PeerConnection(config);
    this.pc = pc;
    pc.ontrack = (event) => {
      // Prefer the event's associated stream; otherwise wrap the bare track.
      const stream = event.streams[0] ?? new MediaStream([event.track]);
      this.mediaStream = stream;
      for (const listener of this.trackListeners) {
        listener(stream);
      }
    };
    pc.onicecandidate = (event) => {
      // A null candidate marks end-of-candidates; nothing to forward then.
      if (event.candidate) {
        this.sendJson({
          type: "candidate",
          candidate: event.candidate.candidate,
          sdpMLineIndex: event.candidate.sdpMLineIndex,
          sdpMid: event.candidate.sdpMid,
        });
      }
    };
    pc.onconnectionstatechange = () => {
      switch (pc.connectionState) {
        case "connected":
          // Guard so connect listeners fire only once even if both this and
          // the ICE state handler observe "connected".
          if (!this.connected) {
            this.connected = true;
            for (const listener of this.connectListeners) {
              listener();
            }
          }
          break;
        case "closed":
        case "failed":
          this.emitError(new Error(`WebRTC connection ${pc.connectionState}.`));
          break;
      }
    };
    pc.oniceconnectionstatechange = () => {
      // Mirrors onconnectionstatechange; some engines surface "connected"
      // here first, so both paths can mark the session connected.
      switch (pc.iceConnectionState) {
        case "connected":
          if (!this.connected) {
            this.connected = true;
            for (const listener of this.connectListeners) {
              listener();
            }
          }
          break;
        case "closed":
        case "failed":
          this.emitError(new Error(`WebRTC ICE ${pc.iceConnectionState}.`));
          break;
      }
    };
    // Server creates the data channel; client receives it.
    pc.ondatachannel = (event) => {
      this.dataChannel = event.channel;
      this.dataChannel.binaryType = "arraybuffer";
      this.dataChannel.onerror = () => {
        this.emitError(new Error("WebRTC data channel error."));
      };
      this.dataChannel.onclose = () => {
        // Input methods fall back to the signaling socket once this is null.
        this.dataChannel = null;
      };
    };
    await pc.setRemoteDescription({ type: "offer", sdp });
    // Flush any ICE candidates that arrived before the PC was ready.
    for (const pending of this.pendingCandidates) {
      try {
        await pc.addIceCandidate(pending as unknown as RTCIceCandidateInit);
      } catch {
        // ignore stale candidates
      }
    }
    this.pendingCandidates = [];
    const answer = await pc.createAnswer();
    await pc.setLocalDescription(answer);
    this.sendJson({ type: "answer", sdp: answer.sdp });
  } catch (error) {
    this.emitError(error instanceof Error ? error : new Error(String(error)));
  }
}
/**
 * Apply a remote ICE candidate. Candidates that arrive before the peer
 * connection exists (i.e. before the offer) are queued and flushed by
 * handleOffer after setRemoteDescription.
 * Fix: removed the dangling header and body lines of the deleted legacy
 * `sendFrame` method that were interleaved here, which left unbalanced
 * braces.
 */
private async handleCandidate(candidate: RTCIceCandidateInit): Promise<void> {
  if (!this.pc) {
    this.pendingCandidates.push(candidate as unknown as Record<string, unknown>);
    return;
  }
  try {
    await this.pc.addIceCandidate(candidate);
  } catch (error) {
    this.emitError(error instanceof Error ? error : new Error(String(error)));
  }
}
/** Send a JSON message to the server. */
private sendJson(msg: Record<string, unknown>): void {
if (this.socket.readyState !== WS_READY_STATE_OPEN) return;
this.socket.send(JSON.stringify(msg));
}
/** Send a typed input message over the signaling WebSocket as fallback. */
private sendSignaling(type: string, data: Record<string, unknown>): void {
this.sendJson({ type, ...data });
}
/** Tear down the peer connection, nullifying handlers first to prevent stale
 * callbacks. Safe to call repeatedly; also resets the media stream and the
 * connected flag so a subsequent offer starts from a clean state. */
private teardownPeerConnection(): void {
  if (this.dataChannel) {
    // Detach handlers before close() so in-flight events don't fire into a
    // half-torn-down session.
    this.dataChannel.onerror = null;
    this.dataChannel.onmessage = null;
    this.dataChannel.onopen = null;
    this.dataChannel.onclose = null;
    try {
      this.dataChannel.close();
    } catch {
      /* ignore */
    }
    this.dataChannel = null;
  }
  if (this.pc) {
    // Same ordering for the peer connection: null every handler, then close.
    this.pc.onicecandidate = null;
    this.pc.onicecandidateerror = null;
    this.pc.onconnectionstatechange = null;
    this.pc.oniceconnectionstatechange = null;
    this.pc.onsignalingstatechange = null;
    this.pc.onnegotiationneeded = null;
    this.pc.ontrack = null;
    this.pc.ondatachannel = null;
    try {
      this.pc.close();
    } catch {
      /* ignore */
    }
    this.pc = null;
  }
  this.mediaStream = null;
  this.connected = false;
}
private emitError(error: DesktopStreamErrorStatus | Error): void {
@ -203,34 +515,3 @@ export class DesktopStreamSession {
}
}
}
/**
 * Parse a textual status frame from the stream socket.
 *
 * Returns a typed "ready" or "error" message, or null when the payload is not
 * valid JSON, is not a JSON object, or does not match either frame shape.
 *
 * Fix: the original let JSON.parse throw on malformed payloads and crashed on
 * non-object JSON (e.g. the payload "null"), even though its contract is to
 * return null for anything unrecognized.
 */
function parseStatusFrame(payload: string): DesktopStreamStatusMessage | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(payload);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) {
    return null;
  }
  const value = parsed as Record<string, unknown>;
  if (value.type === "ready" && typeof value.width === "number" && typeof value.height === "number") {
    return {
      type: "ready",
      width: value.width,
      height: value.height,
    };
  }
  if (value.type === "error" && typeof value.message === "string") {
    return {
      type: "error",
      message: value.message,
    };
  }
  return null;
}
/**
 * Normalize a websocket binary payload into a Uint8Array.
 * Typed-array views are wrapped without copying; ArrayBuffers are wrapped
 * directly; Blobs (browser environments) are read asynchronously. Any other
 * payload type is rejected with an error.
 */
async function decodeBinaryFrame(data: unknown): Promise<Uint8Array> {
  if (ArrayBuffer.isView(data)) {
    return new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
  }
  if (data instanceof ArrayBuffer) {
    return new Uint8Array(data);
  }
  if (typeof Blob !== "undefined" && data instanceof Blob) {
    const buffer = await data.arrayBuffer();
    return new Uint8Array(buffer);
  }
  throw new Error("Unsupported desktop stream binary frame type.");
}

View file

@ -3,7 +3,6 @@
* Do not make direct changes to the file.
*/
export interface paths {
"/v1/acp": {
get: operations["get_v1_acp_servers"];
@ -225,9 +224,10 @@ export interface paths {
};
"/v1/desktop/stream/ws": {
/**
* Open a desktop websocket streaming session.
* @description Upgrades the connection to a websocket that streams JPEG desktop frames and
* accepts mouse and keyboard control frames.
* Open a desktop WebRTC signaling session.
* @description Upgrades the connection to a WebSocket used for WebRTC signaling between
* the browser client and the desktop streaming process. Also accepts mouse
* and keyboard input frames as a fallback transport.
*/
get: operations["get_v1_desktop_stream_ws"];
};
@ -633,7 +633,23 @@ export interface components {
windows: components["schemas"]["DesktopWindowInfo"][];
};
/** @enum {string} */
ErrorType: "invalid_request" | "conflict" | "unsupported_agent" | "agent_not_installed" | "install_failed" | "agent_process_exited" | "token_invalid" | "permission_denied" | "not_acceptable" | "unsupported_media_type" | "not_found" | "session_not_found" | "session_already_exists" | "mode_not_supported" | "stream_error" | "timeout";
ErrorType:
| "invalid_request"
| "conflict"
| "unsupported_agent"
| "agent_not_installed"
| "install_failed"
| "agent_process_exited"
| "token_invalid"
| "permission_denied"
| "not_acceptable"
| "unsupported_media_type"
| "not_found"
| "session_not_found"
| "session_already_exists"
| "mode_not_supported"
| "stream_error"
| "timeout";
FsActionResponse: {
path: string;
};
@ -692,35 +708,37 @@ export interface components {
directory: string;
mcpName: string;
};
McpServerConfig: ({
args?: string[];
command: string;
cwd?: string | null;
enabled?: boolean | null;
env?: {
[key: string]: string;
} | null;
/** Format: int64 */
timeoutMs?: number | null;
/** @enum {string} */
type: "local";
}) | ({
bearerTokenEnvVar?: string | null;
enabled?: boolean | null;
envHeaders?: {
[key: string]: string;
} | null;
headers?: {
[key: string]: string;
} | null;
oauth?: Record<string, unknown> | null | null;
/** Format: int64 */
timeoutMs?: number | null;
transport?: string | null;
/** @enum {string} */
type: "remote";
url: string;
});
McpServerConfig:
| {
args?: string[];
command: string;
cwd?: string | null;
enabled?: boolean | null;
env?: {
[key: string]: string;
} | null;
/** Format: int64 */
timeoutMs?: number | null;
/** @enum {string} */
type: "local";
}
| {
bearerTokenEnvVar?: string | null;
enabled?: boolean | null;
envHeaders?: {
[key: string]: string;
} | null;
headers?: {
[key: string]: string;
} | null;
oauth?: Record<string, unknown> | null | null;
/** Format: int64 */
timeoutMs?: number | null;
transport?: string | null;
/** @enum {string} */
type: "remote";
url: string;
};
ProblemDetails: {
detail?: string | null;
instance?: string | null;
@ -880,7 +898,6 @@ export type $defs = Record<string, never>;
export type external = Record<string, never>;
export interface operations {
get_v1_acp_servers: {
responses: {
/** @description Active ACP server instances */
@ -2002,9 +2019,10 @@ export interface operations {
};
};
/**
* Open a desktop websocket streaming session.
* @description Upgrades the connection to a websocket that streams JPEG desktop frames and
* accepts mouse and keyboard control frames.
* Open a desktop WebRTC signaling session.
* @description Upgrades the connection to a WebSocket used for WebRTC signaling between
* the browser client and the desktop streaming process. Also accepts mouse
* and keyboard input frames as a fallback transport.
*/
get_v1_desktop_stream_ws: {
parameters: {

40
server/compose.dev.yaml Normal file
View file

@ -0,0 +1,40 @@
name: sandbox-agent-dev
services:
backend:
build:
context: ..
dockerfile: docker/test-agent/Dockerfile
image: sandbox-agent-dev
command: ["server", "--host", "0.0.0.0", "--port", "3000", "--no-token"]
environment:
RUST_LOG: "${RUST_LOG:-info}"
ports:
- "2468:3000"
frontend:
build:
context: ..
dockerfile: server/docker/frontend.dev.Dockerfile
working_dir: /app
depends_on:
- backend
environment:
SANDBOX_AGENT_URL: "http://backend:3000"
ports:
- "5173:5173"
volumes:
- "..:/app"
# Keep Linux-native node_modules inside the container.
- "sa_root_node_modules:/app/node_modules"
- "sa_inspector_node_modules:/app/frontend/packages/inspector/node_modules"
- "sa_react_node_modules:/app/sdks/react/node_modules"
- "sa_typescript_node_modules:/app/sdks/typescript/node_modules"
- "sa_pnpm_store:/root/.local/share/pnpm/store"
volumes:
sa_root_node_modules: {}
sa_inspector_node_modules: {}
sa_react_node_modules: {}
sa_typescript_node_modules: {}
sa_pnpm_store: {}

View file

@ -0,0 +1,5 @@
# Dev-only frontend image: slim Node 22 base with pnpm installed globally.
FROM node:22-bookworm-slim
RUN npm install -g pnpm
WORKDIR /app
# Vite dev server port.
EXPOSE 5173
# Install dependencies at container start (not build time) so the
# bind-mounted workspace always matches its lockfile, then run the
# inspector's Vite dev server bound to all interfaces.
CMD ["sh", "-c", "pnpm install && cd frontend/packages/inspector && npx vite --host 0.0.0.0"]

View file

@ -42,6 +42,9 @@ toml_edit.workspace = true
tar.workspace = true
zip.workspace = true
tempfile = { workspace = true, optional = true }
gstreamer = { version = "0.23", optional = true }
gstreamer-sdp = { version = "0.23", optional = true }
gstreamer-webrtc = { version = "0.23", optional = true }
[target.'cfg(unix)'.dependencies]
libc = "0.2"
@ -59,3 +62,4 @@ tokio-tungstenite = "0.24"
[features]
test-utils = ["tempfile"]
desktop-gstreamer = ["gstreamer", "gstreamer-sdp", "gstreamer-webrtc"]

View file

@ -0,0 +1,246 @@
/// GStreamer WebRTC pipeline for desktop streaming.
///
/// Creates a pipeline that captures the X11 display via `ximagesrc`, encodes to
/// VP8, and streams over WebRTC using `webrtcbin`. Signaling (SDP offer/answer,
/// ICE candidate exchange) is handled via channels that the caller bridges to
/// the client WebSocket.
#[cfg(feature = "desktop-gstreamer")]
pub mod pipeline {
use gstreamer as gst;
use gstreamer::prelude::*;
use gstreamer_sdp as gst_sdp;
use gstreamer_webrtc as gst_webrtc;
use tokio::sync::mpsc;
/// Messages sent from the GStreamer pipeline to the WebSocket handler,
/// which forwards them to the browser client as signaling messages.
#[derive(Debug)]
pub enum PipelineEvent {
    /// SDP offer generated by webrtcbin.
    Offer(String),
    /// ICE candidate produced by webrtcbin.
    IceCandidate {
        /// Candidate string as emitted by webrtcbin's `on-ice-candidate` signal.
        candidate: String,
        /// Index of the SDP media line the candidate belongs to.
        sdp_m_line_index: u32,
    },
}
/// Messages sent from the WebSocket handler to the GStreamer pipeline,
/// carrying the client's half of the WebRTC negotiation.
#[derive(Debug)]
pub enum SignalingCommand {
    /// SDP answer from the client.
    Answer(String),
    /// ICE candidate from the client.
    IceCandidate {
        /// Candidate string as received from the browser.
        candidate: String,
        /// Index of the SDP media line the candidate belongs to.
        sdp_m_line_index: u32,
    },
}
/// Handle to a running desktop-streaming pipeline.
pub struct GStreamerPipeline {
    // The GStreamer pipeline (ximagesrc -> vp8enc -> webrtcbin, see `new`).
    pipeline: gst::Pipeline,
    // Sends client signaling (answer / ICE candidates) into the pipeline.
    cmd_tx: mpsc::UnboundedSender<SignalingCommand>,
}
impl GStreamerPipeline {
/// Create and start a new GStreamer WebRTC pipeline for the given display.
///
/// Returns the pipeline handle and a receiver for pipeline events (offers,
/// ICE candidates) that should be forwarded to the client.
///
/// Builds a capture -> encode -> WebRTC pipeline: `ximagesrc` (screen capture
/// of `display`) -> low-latency VP8 encoding -> RTP payloading -> `webrtcbin`.
/// Signaling flows through two channels: pipeline-originated events (SDP
/// offer, local ICE candidates) arrive on the returned receiver, and remote
/// answers/candidates are fed back in via `send_command`.
///
/// # Errors
///
/// Returns a descriptive message when GStreamer fails to initialize, the
/// pipeline description does not parse, or the pipeline cannot reach the
/// `Playing` state.
pub fn new(
    display: &str,
) -> Result<(Self, mpsc::UnboundedReceiver<PipelineEvent>), String> {
    // gst::init() is idempotent, so calling it per-pipeline is safe.
    gst::init().map_err(|e| {
        format!(
            "Desktop streaming requires GStreamer. Install it with: \
            sandbox-agent desktop install\n\
            Error: {e}"
        )
    })?;
    // Low-latency chain: the single-buffer leaky queues drop stale frames
    // instead of accumulating latency, and the vp8enc settings (deadline=1,
    // cpu-used=16, cbr) trade quality for realtime encoding speed.
    let pipeline_str = format!(
        "ximagesrc display-name={display} use-damage=true show-pointer=true \
        ! video/x-raw,framerate=30/1 \
        ! videorate \
        ! videoconvert \
        ! queue max-size-buffers=1 leaky=downstream \
        ! vp8enc deadline=1 target-bitrate=3000000 cpu-used=16 threads=4 \
        keyframe-max-dist=60 end-usage=cbr buffer-size=500 buffer-initial-size=300 \
        error-resilient=partitions \
        ! rtpvp8pay picture-id-mode=15bit \
        ! queue max-size-buffers=1 leaky=downstream \
        ! application/x-rtp,media=video,encoding-name=VP8,payload=96 \
        ! webrtcbin name=wb bundle-policy=max-bundle"
    );
    let pipeline = gst::parse::launch(&pipeline_str)
        .map_err(|e| format!("failed to create GStreamer pipeline: {e}"))?
        .downcast::<gst::Pipeline>()
        .map_err(|_| "pipeline is not a GstPipeline".to_string())?;
    let webrtcbin = pipeline
        .by_name("wb")
        .ok_or_else(|| "webrtcbin element not found in pipeline".to_string())?;
    // Configure STUN for ICE connectivity (used for server-reflexive
    // candidates when behind NAT).
    webrtcbin.set_property_from_str("stun-server", "stun://stun.l.google.com:19302");
    // Restrict the UDP port range so Docker port forwarding works.
    // The ice-agent is a GstWebRTCICE which wraps a NiceAgent.
    let ice_agent: gst::glib::Object = webrtcbin.property("ice-agent");
    // GstWebRTCNice has a "min-rtp-port" and "max-rtp-port" property
    // in newer versions, but on GStreamer 1.22 we need to access the
    // underlying NiceAgent via the "agent" property.
    // NOTE(review): if neither property exists the port range is silently
    // left unconstrained — confirm that is acceptable for the Docker setup.
    if ice_agent.has_property("min-rtp-port", None) {
        ice_agent.set_property("min-rtp-port", 30000u32);
        ice_agent.set_property("max-rtp-port", 30100u32);
    } else if ice_agent.has_property("agent", None) {
        let nice_agent: gst::glib::Object = ice_agent.property("agent");
        nice_agent.set_property("max-port", 30100u32);
        nice_agent.set_property("min-port", 30000u32);
    }
    // Channel for pipeline -> WS handler events.
    let (event_tx, event_rx) = mpsc::unbounded_channel::<PipelineEvent>();
    // Channel for WS handler -> pipeline commands.
    let (cmd_tx, mut cmd_rx) = mpsc::unbounded_channel::<SignalingCommand>();
    // Note: Data channel for input will be created once we establish
    // the WebRTC connection. Input falls back to the WS transport.
    // When webrtcbin needs to negotiate, create an offer.
    // NOTE(review): these signal closures hold strong clones of webrtcbin,
    // so the element's refcount depends on glib handler teardown — verify
    // this cannot form a reference cycle that outlives the pipeline.
    let wb_clone = webrtcbin.clone();
    let event_tx_offer = event_tx.clone();
    webrtcbin.connect("on-negotiation-needed", false, move |_| {
        let wb_offer = wb_clone.clone();
        let wb_create = wb_clone.clone();
        let tx = event_tx_offer.clone();
        // The promise fires when "create-offer" completes; a None or error
        // reply (e.g. pipeline shutting down) is silently ignored.
        let promise = gst::Promise::with_change_func(move |reply| {
            let reply = match reply {
                Ok(Some(reply)) => reply,
                _ => return,
            };
            let offer = match reply.value("offer") {
                Ok(offer) => offer,
                Err(_) => return,
            };
            let offer = offer
                .get::<gst_webrtc::WebRTCSessionDescription>()
                .expect("offer is WebRTCSessionDescription");
            // Apply the offer locally before forwarding it to the client.
            wb_offer.emit_by_name::<()>(
                "set-local-description",
                &[&offer, &None::<gst::Promise>],
            );
            if let Ok(sdp_text) = offer.sdp().as_text() {
                let _ = tx.send(PipelineEvent::Offer(sdp_text.to_string()));
            }
        });
        wb_create.emit_by_name::<()>("create-offer", &[&None::<gst::Structure>, &promise]);
        None
    });
    // When webrtcbin produces an ICE candidate, send it to client.
    // We rewrite host candidates to use 127.0.0.1 so the browser can
    // reach the server when running inside Docker.
    let event_tx_ice = event_tx;
    webrtcbin.connect("on-ice-candidate", false, move |values| {
        // Signal args: (webrtcbin, mlineindex: u32, candidate: String).
        let sdp_m_line_index = values[1].get::<u32>().expect("m-line index is u32");
        let candidate = values[2].get::<String>().expect("candidate is String");
        // Only forward UDP host candidates, rewritten to 127.0.0.1.
        // Skip TCP candidates (browsers rarely use TCP for WebRTC media)
        // and server-reflexive candidates (STUN responses with public IPs).
        // NOTE(review): the match is case-sensitive; libnice emits "UDP"
        // uppercase — confirm against the deployed GStreamer version.
        if candidate.contains("UDP") && candidate.contains("typ host") {
            // Replace the Docker-internal IP with 127.0.0.1 so the
            // browser on the host can connect.
            let rewritten = rewrite_candidate_ip(&candidate, "127.0.0.1");
            let _ = event_tx_ice.send(PipelineEvent::IceCandidate {
                candidate: rewritten,
                sdp_m_line_index,
            });
        }
        None
    });
    // Start the pipeline.
    pipeline
        .set_state(gst::State::Playing)
        .map_err(|e| format!("failed to start GStreamer pipeline: {e}"))?;
    // Spawn a thread to process signaling commands from the WS handler.
    // A dedicated OS thread (rather than an async task) is used because the
    // receiver is drained with blocking_recv; the thread exits once the
    // command sender side is dropped.
    let wb_cmd = webrtcbin.clone();
    std::thread::spawn(move || {
        while let Some(cmd) = cmd_rx.blocking_recv() {
            match cmd {
                SignalingCommand::Answer(sdp_str) => {
                    let sdp = match gst_sdp::SDPMessage::parse_buffer(sdp_str.as_bytes()) {
                        Ok(sdp) => sdp,
                        Err(e) => {
                            tracing::warn!(error = ?e, "failed to parse SDP answer");
                            continue;
                        }
                    };
                    let answer = gst_webrtc::WebRTCSessionDescription::new(
                        gst_webrtc::WebRTCSDPType::Answer,
                        sdp,
                    );
                    wb_cmd.emit_by_name::<()>(
                        "set-remote-description",
                        &[&answer, &None::<gst::Promise>],
                    );
                }
                SignalingCommand::IceCandidate {
                    candidate,
                    sdp_m_line_index,
                } => {
                    wb_cmd.emit_by_name::<()>(
                        "add-ice-candidate",
                        &[&sdp_m_line_index, &candidate],
                    );
                }
            }
        }
    });
    Ok((Self { pipeline, cmd_tx }, event_rx))
}
/// Send a signaling command to the pipeline.
///
/// Delivery is best-effort: if the signaling thread has already shut down
/// (channel closed), the command is silently dropped.
pub fn send_command(&self, cmd: SignalingCommand) {
    self.cmd_tx.send(cmd).ok();
}
}
impl Drop for GStreamerPipeline {
    /// Tear the GStreamer pipeline down when the handle goes away.
    fn drop(&mut self) {
        // Best-effort shutdown: a failed state change cannot be recovered
        // from inside Drop, so the result is deliberately discarded.
        self.pipeline.set_state(gst::State::Null).ok();
    }
}
/// Rewrite the IP address in an ICE candidate string.
///
/// ICE candidate format:
///   candidate:1 1 UDP 2015363327 172.17.0.6 39395 typ host
///
/// The connection address is the fifth space-separated field (index 4);
/// everything after it (port, "typ …" attributes, extensions) is preserved
/// verbatim. Candidates with fewer than six fields are returned unchanged.
fn rewrite_candidate_ip(candidate: &str, target_ip: &str) -> String {
    // splitn(6, ' ') keeps the whole tail (port + attributes) intact in the
    // final element, so no byte-offset arithmetic or re-joining is needed.
    let parts: Vec<&str> = candidate.splitn(6, ' ').collect();
    if let [foundation, component, transport, priority, _ip, rest] = parts.as_slice() {
        format!("{foundation} {component} {transport} {priority} {target_ip} {rest}")
    } else {
        candidate.to_string()
    }
}
}
/// Check if GStreamer support is compiled in.
///
/// Reflects the `desktop-gstreamer` cargo feature; when it is disabled,
/// desktop streaming falls back to the input-only signaling path.
pub fn is_available() -> bool {
    const COMPILED_IN: bool = cfg!(feature = "desktop-gstreamer");
    COMPILED_IN
}

View file

@ -110,6 +110,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus-x11",
"xauth",
"fonts-dejavu-core",
"libgstreamer1.0-0",
"gstreamer1.0-plugins-base",
"gstreamer1.0-plugins-good",
"gstreamer1.0-plugins-bad",
"gstreamer1.0-plugins-ugly",
"gstreamer1.0-nice",
"gstreamer1.0-x",
],
DesktopPackageManager::Dnf => vec![
"xorg-x11-server-Xvfb",
@ -121,6 +128,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus-x11",
"xauth",
"dejavu-sans-fonts",
"gstreamer1",
"gstreamer1-plugins-base",
"gstreamer1-plugins-good",
"gstreamer1-plugins-bad-free",
"gstreamer1-plugins-ugly-free",
"gstreamer1-plugin-libnice",
"gstreamer1-plugins-good-extras",
],
DesktopPackageManager::Apk => vec![
"xvfb",
@ -132,6 +146,12 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus",
"xauth",
"ttf-dejavu",
"gstreamer",
"gst-plugins-base",
"gst-plugins-good",
"gst-plugins-bad",
"gst-plugins-ugly",
"libnice-gstreamer",
],
}
.into_iter()

View file

@ -10,20 +10,20 @@ use tokio::sync::Mutex;
use sandbox_agent_error::SandboxError;
use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager};
use crate::desktop_errors::DesktopProblem;
use crate::desktop_install::desktop_platform_support_message;
use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager};
use crate::desktop_streaming::DesktopStreamingManager;
use crate::desktop_types::{
DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo,
DesktopKeyModifiers, DesktopKeyboardDownRequest, DesktopKeyboardPressRequest,
DesktopKeyboardTypeRequest, DesktopKeyboardUpRequest, DesktopMouseButton,
DesktopMouseClickRequest, DesktopMouseDownRequest, DesktopMouseDragRequest,
DesktopMouseMoveRequest, DesktopMousePositionResponse, DesktopMouseScrollRequest,
DesktopMouseUpRequest, DesktopProcessInfo, DesktopRecordingInfo,
DesktopRecordingListResponse, DesktopRecordingStartRequest, DesktopRegionScreenshotQuery,
DesktopResolution, DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest,
DesktopState, DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo,
DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo, DesktopKeyModifiers,
DesktopKeyboardDownRequest, DesktopKeyboardPressRequest, DesktopKeyboardTypeRequest,
DesktopKeyboardUpRequest, DesktopMouseButton, DesktopMouseClickRequest,
DesktopMouseDownRequest, DesktopMouseDragRequest, DesktopMouseMoveRequest,
DesktopMousePositionResponse, DesktopMouseScrollRequest, DesktopMouseUpRequest,
DesktopProcessInfo, DesktopRecordingInfo, DesktopRecordingListResponse,
DesktopRecordingStartRequest, DesktopRegionScreenshotQuery, DesktopResolution,
DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest, DesktopState,
DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo,
DesktopWindowListResponse,
};
use crate::process_runtime::{
@ -172,9 +172,9 @@ impl DesktopRuntime {
let recording_manager =
DesktopRecordingManager::new(process_runtime.clone(), config.state_dir.clone());
Self {
streaming_manager: DesktopStreamingManager::new(),
process_runtime,
recording_manager,
streaming_manager: DesktopStreamingManager::new(),
inner: Arc::new(Mutex::new(DesktopRuntimeStateData {
state: DesktopState::Inactive,
display_num: config.display_num,
@ -197,7 +197,10 @@ impl DesktopRuntime {
pub async fn status(&self) -> DesktopStatusResponse {
let mut state = self.inner.lock().await;
self.refresh_status_locked(&mut state).await;
self.snapshot_locked(&state)
let mut response = self.snapshot_locked(&state);
drop(state);
response
}
pub async fn start(
@ -221,7 +224,10 @@ impl DesktopRuntime {
self.refresh_status_locked(&mut state).await;
if state.state == DesktopState::Active {
return Ok(self.snapshot_locked(&state));
let mut response = self.snapshot_locked(&state);
drop(state);
return Ok(response);
}
if !state.missing_dependencies.is_empty() {
@ -307,7 +313,10 @@ impl DesktopRuntime {
),
);
Ok(self.snapshot_locked(&state))
let mut response = self.snapshot_locked(&state);
drop(state);
Ok(response)
}
pub async fn stop(&self) -> Result<DesktopStatusResponse, DesktopProblem> {
@ -336,7 +345,10 @@ impl DesktopRuntime {
state.install_command = self.install_command_for(&state.missing_dependencies);
state.environment.clear();
Ok(self.snapshot_locked(&state))
let mut response = self.snapshot_locked(&state);
drop(state);
Ok(response)
}
pub async fn shutdown(&self) {
@ -630,8 +642,23 @@ impl DesktopRuntime {
self.recording_manager.delete(id).await
}
pub async fn start_streaming(&self) -> DesktopStreamStatusResponse {
self.streaming_manager.start().await
pub async fn start_streaming(&self) -> Result<DesktopStreamStatusResponse, SandboxError> {
let state = self.inner.lock().await;
let display = state
.display
.as_deref()
.ok_or_else(|| SandboxError::Conflict {
message: "desktop runtime is not active".to_string(),
})?;
let resolution = state
.resolution
.clone()
.ok_or_else(|| SandboxError::Conflict {
message: "desktop runtime is not active".to_string(),
})?;
let display = display.to_string();
drop(state);
Ok(self.streaming_manager.start(&display, resolution).await)
}
pub async fn stop_streaming(&self) -> DesktopStreamStatusResponse {
@ -639,7 +666,17 @@ impl DesktopRuntime {
}
pub async fn ensure_streaming_active(&self) -> Result<(), SandboxError> {
self.streaming_manager.ensure_active().await
if self.streaming_manager.is_active().await {
Ok(())
} else {
Err(SandboxError::Conflict {
message: "desktop streaming is not active".to_string(),
})
}
}
pub fn streaming_manager(&self) -> &DesktopStreamingManager {
&self.streaming_manager
}
async fn recording_context(&self) -> Result<DesktopRecordingContext, SandboxError> {
@ -831,8 +868,14 @@ impl DesktopRuntime {
name: &str,
) -> Result<(), DesktopProblem> {
let process_id = match name {
"Xvfb" => state.xvfb.as_ref().map(|process| process.process_id.clone()),
"openbox" => state.openbox.as_ref().map(|process| process.process_id.clone()),
"Xvfb" => state
.xvfb
.as_ref()
.map(|process| process.process_id.clone()),
"openbox" => state
.openbox
.as_ref()
.map(|process| process.process_id.clone()),
_ => None,
};

View file

@ -2,9 +2,7 @@ use std::sync::Arc;
use tokio::sync::Mutex;
use sandbox_agent_error::SandboxError;
use crate::desktop_types::DesktopStreamStatusResponse;
use crate::desktop_types::{DesktopResolution, DesktopStreamStatusResponse};
#[derive(Debug, Clone)]
pub struct DesktopStreamingManager {
@ -14,6 +12,8 @@ pub struct DesktopStreamingManager {
#[derive(Debug, Default)]
struct DesktopStreamingState {
active: bool,
display: Option<String>,
resolution: Option<DesktopResolution>,
}
impl DesktopStreamingManager {
@ -23,25 +23,46 @@ impl DesktopStreamingManager {
}
}
pub async fn start(&self) -> DesktopStreamStatusResponse {
/// Mark desktop streaming as active for the given display and resolution.
///
/// The actual GStreamer pipeline is created per-WebSocket-session in the
/// signaling handler — this method just records that streaming is enabled.
pub async fn start(
&self,
display: &str,
resolution: DesktopResolution,
) -> DesktopStreamStatusResponse {
let mut state = self.inner.lock().await;
if state.active {
return DesktopStreamStatusResponse { active: true };
}
state.active = true;
state.display = Some(display.to_string());
state.resolution = Some(resolution);
DesktopStreamStatusResponse { active: true }
}
/// Stop streaming and clear state.
pub async fn stop(&self) -> DesktopStreamStatusResponse {
let mut state = self.inner.lock().await;
state.active = false;
state.display = None;
state.resolution = None;
DesktopStreamStatusResponse { active: false }
}
pub async fn ensure_active(&self) -> Result<(), SandboxError> {
if self.inner.lock().await.active {
Ok(())
} else {
Err(SandboxError::Conflict {
message: "desktop streaming is not active".to_string(),
})
}
pub async fn is_active(&self) -> bool {
self.inner.lock().await.active
}
pub async fn resolution(&self) -> Option<DesktopResolution> {
self.inner.lock().await.resolution.clone()
}
pub async fn display_name(&self) -> Option<String> {
self.inner.lock().await.display.clone()
}
}

View file

@ -4,6 +4,7 @@ mod acp_proxy_runtime;
pub mod cli;
pub mod daemon;
mod desktop_errors;
mod desktop_gstreamer;
mod desktop_install;
mod desktop_recording;
mod desktop_runtime;

View file

@ -41,9 +41,9 @@ use crate::desktop_errors::DesktopProblem;
use crate::desktop_runtime::DesktopRuntime;
use crate::desktop_types::*;
use crate::process_runtime::{
decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream, ProcessOwner as RuntimeProcessOwner,
ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot, ProcessStartSpec, ProcessStatus,
ProcessStream, RunSpec,
decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream,
ProcessOwner as RuntimeProcessOwner, ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot,
ProcessStartSpec, ProcessStatus, ProcessStream, RunSpec,
};
use crate::ui;
@ -235,7 +235,7 @@ pub fn build_router_with_state(shared: Arc<AppState>) -> (Router, Arc<AppState>)
)
.route("/desktop/stream/start", post(post_v1_desktop_stream_start))
.route("/desktop/stream/stop", post(post_v1_desktop_stream_stop))
.route("/desktop/stream/ws", get(get_v1_desktop_stream_ws))
.route("/desktop/stream/signaling", get(get_v1_desktop_stream_ws))
.route("/agents", get(get_v1_agents))
.route("/agents/:agent", get(get_v1_agent))
.route("/agents/:agent/install", post(post_v1_agent_install))
@ -1135,9 +1135,11 @@ async fn get_v1_desktop_recording_download(
Path(id): Path<String>,
) -> Result<Response, ApiError> {
let path = state.desktop_runtime().recording_download_path(&id).await?;
let bytes = tokio::fs::read(&path).await.map_err(|err| SandboxError::StreamError {
message: format!("failed to read desktop recording {}: {err}", path.display()),
})?;
let bytes = tokio::fs::read(&path)
.await
.map_err(|err| SandboxError::StreamError {
message: format!("failed to read desktop recording {}: {err}", path.display()),
})?;
Ok(([(header::CONTENT_TYPE, "video/mp4")], Bytes::from(bytes)).into_response())
}
@ -1179,7 +1181,7 @@ async fn delete_v1_desktop_recording(
async fn post_v1_desktop_stream_start(
State(state): State<Arc<AppState>>,
) -> Result<Json<DesktopStreamStatusResponse>, ApiError> {
Ok(Json(state.desktop_runtime().start_streaming().await))
Ok(Json(state.desktop_runtime().start_streaming().await?))
}
/// Stop desktop streaming.
@ -1199,13 +1201,14 @@ async fn post_v1_desktop_stream_stop(
Ok(Json(state.desktop_runtime().stop_streaming().await))
}
/// Open a desktop websocket streaming session.
/// Open a desktop WebRTC signaling session.
///
/// Upgrades the connection to a websocket that streams JPEG desktop frames and
/// accepts mouse and keyboard control frames.
/// Upgrades the connection to a WebSocket used for WebRTC signaling between
/// the browser client and the desktop streaming process. Also accepts mouse
/// and keyboard input frames as a fallback transport.
#[utoipa::path(
get,
path = "/v1/desktop/stream/ws",
path = "/v1/desktop/stream/signaling",
tag = "v1",
params(
("access_token" = Option<String>, Query, description = "Bearer token alternative for WS auth")
@ -2449,46 +2452,6 @@ enum TerminalClientFrame {
Close,
}
#[derive(Debug, Deserialize)]
#[serde(tag = "type", rename_all = "camelCase")]
enum DesktopStreamClientFrame {
MoveMouse {
x: i32,
y: i32,
},
MouseDown {
#[serde(default)]
x: Option<i32>,
#[serde(default)]
y: Option<i32>,
#[serde(default)]
button: Option<DesktopMouseButton>,
},
MouseUp {
#[serde(default)]
x: Option<i32>,
#[serde(default)]
y: Option<i32>,
#[serde(default)]
button: Option<DesktopMouseButton>,
},
Scroll {
x: i32,
y: i32,
#[serde(default)]
delta_x: Option<i32>,
#[serde(default)]
delta_y: Option<i32>,
},
KeyDown {
key: String,
},
KeyUp {
key: String,
},
Close,
}
async fn process_terminal_ws_session(
mut socket: WebSocket,
runtime: Arc<ProcessRuntime>,
@ -2601,22 +2564,38 @@ async fn process_terminal_ws_session(
}
}
async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc<DesktopRuntime>) {
let display_info = match desktop_runtime.display_info().await {
Ok(info) => info,
Err(err) => {
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
let _ = socket.close().await;
return;
}
};
/// WebRTC signaling and input session.
///
/// Handles WebRTC signaling (SDP offer/answer, ICE candidate exchange) and
/// accepts mouse/keyboard input as a fallback transport when the WebRTC data
/// channel is not established. When compiled with the `desktop-gstreamer`
/// feature, creates a GStreamer pipeline for real video streaming.
async fn desktop_stream_ws_session(mut ws: WebSocket, desktop_runtime: Arc<DesktopRuntime>) {
let streaming = desktop_runtime.streaming_manager();
// Get resolution for the ready message.
let resolution =
streaming
.resolution()
.await
.unwrap_or(crate::desktop_types::DesktopResolution {
width: 1440,
height: 900,
dpi: None,
});
let x_display = streaming
.display_name()
.await
.unwrap_or_else(|| ":99".to_string());
// Send stream metadata immediately.
if send_ws_json(
&mut socket,
&mut ws,
json!({
"type": "ready",
"width": display_info.resolution.width,
"height": display_info.resolution.height,
"width": resolution.width,
"height": resolution.height,
}),
)
.await
@ -2625,109 +2604,270 @@ async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc<D
return;
}
let mut frame_tick = tokio::time::interval(Duration::from_millis(100));
// Try to create a GStreamer WebRTC pipeline for real video streaming.
#[cfg(feature = "desktop-gstreamer")]
{
use crate::desktop_gstreamer::pipeline::GStreamerPipeline;
match GStreamerPipeline::new(&x_display) {
Ok((pipeline, mut event_rx)) => {
tracing::info!(display = %x_display, "GStreamer WebRTC pipeline started");
// Run the session with the GStreamer pipeline active.
desktop_stream_ws_loop_gstreamer(
&mut ws,
&desktop_runtime,
&pipeline,
&mut event_rx,
)
.await;
// Pipeline is dropped here, stopping GStreamer.
let _ = ws.close().await;
return;
}
Err(e) => {
tracing::warn!(error = %e, "GStreamer pipeline creation failed");
let _ = send_ws_error(&mut ws, &e).await;
}
}
}
// Fallback: run without GStreamer (input-only, no video).
desktop_stream_ws_loop_simple(&mut ws, &desktop_runtime).await;
let _ = ws.close().await;
}
/// Inner WS message loop — input-only, no GStreamer pipeline.
///
/// Dispatches each incoming frame to `handle_ws_message_simple` until it
/// signals that the session should end (close frame, disconnect, or error).
async fn desktop_stream_ws_loop_simple(ws: &mut WebSocket, desktop_runtime: &Arc<DesktopRuntime>) {
    loop {
        let incoming = ws.recv().await;
        let keep_going = handle_ws_message_simple(incoming, ws, desktop_runtime).await;
        if !keep_going {
            return;
        }
    }
}
/// Inner WS message loop with GStreamer pipeline — polls both pipeline events
/// and client WS messages.
#[cfg(feature = "desktop-gstreamer")]
async fn desktop_stream_ws_loop_gstreamer(
ws: &mut WebSocket,
desktop_runtime: &Arc<DesktopRuntime>,
pipeline: &crate::desktop_gstreamer::pipeline::GStreamerPipeline,
event_rx: &mut tokio::sync::mpsc::UnboundedReceiver<
crate::desktop_gstreamer::pipeline::PipelineEvent,
>,
) {
use crate::desktop_gstreamer::pipeline::{PipelineEvent, SignalingCommand};
loop {
tokio::select! {
ws_in = socket.recv() => {
match ws_in {
Some(Ok(Message::Text(text))) => {
match serde_json::from_str::<DesktopStreamClientFrame>(&text) {
Ok(DesktopStreamClientFrame::MoveMouse { x, y }) => {
if let Err(err) = desktop_runtime
.move_mouse(DesktopMouseMoveRequest { x, y })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::MouseDown { x, y, button }) => {
if let Err(err) = desktop_runtime
.mouse_down(DesktopMouseDownRequest { x, y, button })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::MouseUp { x, y, button }) => {
if let Err(err) = desktop_runtime
.mouse_up(DesktopMouseUpRequest { x, y, button })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::Scroll { x, y, delta_x, delta_y }) => {
if let Err(err) = desktop_runtime
.scroll_mouse(DesktopMouseScrollRequest {
x,
y,
delta_x,
delta_y,
})
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::KeyDown { key }) => {
if let Err(err) = desktop_runtime
.key_down(DesktopKeyboardDownRequest { key })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::KeyUp { key }) => {
if let Err(err) = desktop_runtime
.key_up(DesktopKeyboardUpRequest { key })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::Close) => {
let _ = socket.close().await;
break;
}
Err(err) => {
let _ = send_ws_error(&mut socket, &format!("invalid desktop stream frame: {err}")).await;
}
}
}
Some(Ok(Message::Ping(payload))) => {
let _ = socket.send(Message::Pong(payload)).await;
}
Some(Ok(Message::Close(_))) | None => break,
Some(Ok(Message::Binary(_))) | Some(Ok(Message::Pong(_))) => {}
Some(Err(_)) => break,
}
}
_ = frame_tick.tick() => {
let frame = desktop_runtime
.screenshot(DesktopScreenshotQuery {
format: Some(DesktopScreenshotFormat::Jpeg),
quality: Some(60),
scale: Some(1.0),
})
.await;
match frame {
Ok(frame) => {
if socket.send(Message::Binary(frame.bytes.into())).await.is_err() {
pipeline_event = event_rx.recv() => {
match pipeline_event {
Some(PipelineEvent::Offer(sdp)) => {
if send_ws_json(ws, json!({"type": "offer", "sdp": sdp})).await.is_err() {
break;
}
}
Err(err) => {
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
let _ = socket.close().await;
break;
Some(PipelineEvent::IceCandidate { candidate, sdp_m_line_index }) => {
if send_ws_json(ws, json!({
"type": "candidate",
"candidate": candidate,
"sdpMLineIndex": sdp_m_line_index,
})).await.is_err() {
break;
}
}
None => break,
}
}
ws_msg = ws.recv() => {
match ws_msg {
Some(Ok(Message::Text(text))) => {
let parsed: Value = match serde_json::from_str(&text) {
Ok(v) => v,
Err(_) => continue,
};
match parsed.get("type").and_then(|v| v.as_str()) {
Some("answer") => {
if let Some(sdp) = parsed.get("sdp").and_then(|v| v.as_str()) {
pipeline.send_command(SignalingCommand::Answer(sdp.to_string()));
}
}
Some("candidate") => {
if let Some(candidate) = parsed.get("candidate").and_then(|v| v.as_str()) {
let sdp_m_line_index = parsed
.get("sdpMLineIndex")
.and_then(|v| v.as_u64())
.unwrap_or(0) as u32;
pipeline.send_command(SignalingCommand::IceCandidate {
candidate: candidate.to_string(),
sdp_m_line_index,
});
}
}
// Input messages (fallback transport)
Some("moveMouse") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let _ = desktop_runtime
.move_mouse(DesktopMouseMoveRequest { x: x as i32, y: y as i32 })
.await;
}
}
Some("mouseDown") => {
let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.mouse_down(DesktopMouseDownRequest { x, y, button }).await;
}
Some("mouseUp") => {
let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.mouse_up(DesktopMouseUpRequest { x, y, button }).await;
}
Some("scroll") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let dx = parsed.get("deltaX").and_then(|v| v.as_i64()).map(|v| v as i32);
let dy = parsed.get("deltaY").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.scroll_mouse(DesktopMouseScrollRequest { x: x as i32, y: y as i32, delta_x: dx, delta_y: dy }).await;
}
}
Some("keyDown") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime.key_down(DesktopKeyboardDownRequest { key: key.to_string() }).await;
}
}
Some("keyUp") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime.key_up(DesktopKeyboardUpRequest { key: key.to_string() }).await;
}
}
_ => {}
}
}
Some(Ok(Message::Ping(payload))) => {
let _ = ws.send(Message::Pong(payload)).await;
}
Some(Ok(Message::Close(_))) | None | Some(Err(_)) => break,
_ => {}
}
}
}
}
}
/// Process a single WebSocket message (no pipeline). Returns false to close.
async fn handle_ws_message_simple(
msg: Option<Result<Message, axum::Error>>,
ws: &mut WebSocket,
desktop_runtime: &Arc<DesktopRuntime>,
) -> bool {
match msg {
Some(Ok(Message::Text(text))) => {
let parsed: Value = match serde_json::from_str(&text) {
Ok(v) => v,
Err(_) => return true,
};
match parsed.get("type").and_then(|v| v.as_str()) {
// --- Input messages (fallback transport) ---
Some("moveMouse") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let _ = desktop_runtime
.move_mouse(DesktopMouseMoveRequest {
x: x as i32,
y: y as i32,
})
.await;
}
}
Some("mouseDown") => {
let button = parsed
.get("button")
.and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime
.mouse_down(DesktopMouseDownRequest { x, y, button })
.await;
}
Some("mouseUp") => {
let button = parsed
.get("button")
.and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime
.mouse_up(DesktopMouseUpRequest { x, y, button })
.await;
}
Some("scroll") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let delta_x = parsed
.get("deltaX")
.and_then(|v| v.as_i64())
.map(|v| v as i32);
let delta_y = parsed
.get("deltaY")
.and_then(|v| v.as_i64())
.map(|v| v as i32);
let _ = desktop_runtime
.scroll_mouse(DesktopMouseScrollRequest {
x: x as i32,
y: y as i32,
delta_x,
delta_y,
})
.await;
}
}
Some("keyDown") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime
.key_down(DesktopKeyboardDownRequest {
key: key.to_string(),
})
.await;
}
}
Some("keyUp") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime
.key_up(DesktopKeyboardUpRequest {
key: key.to_string(),
})
.await;
}
}
// --- WebRTC signaling messages (accepted without error) ---
Some("answer") | Some("candidate") | Some("offer") => {}
_ => {}
}
true
}
Some(Ok(Message::Ping(payload))) => {
let _ = ws.send(Message::Pong(payload)).await;
true
}
Some(Ok(Message::Close(_))) | None | Some(Err(_)) => false,
_ => true,
}
}
async fn send_ws_json(socket: &mut WebSocket, payload: Value) -> Result<(), ()> {
socket
.send(Message::Text(

View file

@ -432,7 +432,7 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
assert_eq!(status, StatusCode::OK);
assert_eq!(parse_json(&body)["active"], true);
let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/ws"))
let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/signaling"))
.await
.expect("connect desktop stream websocket");
@ -447,12 +447,9 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
other => panic!("expected text ready frame, got {other:?}"),
}
let frame = recv_ws_message(&mut ws).await;
match frame {
Message::Binary(bytes) => assert!(bytes.starts_with(&[0xff, 0xd8, 0xff])),
other => panic!("expected binary jpeg frame, got {other:?}"),
}
// The signaling WebSocket now accepts input frames as fallback transport
// (when the WebRTC data channel is not established). Send a mouse move to
// verify input dispatch still works over the signaling channel.
ws.send(Message::Text(
json!({
"type": "moveMouse",
@ -464,6 +461,20 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
))
.await
.expect("send desktop stream mouse move");
// Send a WebRTC signaling message (offer) to verify the signaling path
// accepts it without error.
ws.send(Message::Text(
json!({
"type": "offer",
"sdp": "v=0\r\n"
})
.to_string()
.into(),
))
.await
.expect("send desktop stream offer");
let _ = ws.close(None).await;
let (status, _, body) = send_request(