feat: replace neko with native GStreamer WebRTC for desktop streaming

Replace the neko binary dependency with a native GStreamer pipeline
(ximagesrc -> vp8enc -> webrtcbin) for desktop video streaming. This
removes the external neko process and integrates screen capture directly
via gstreamer-rs crate bindings behind a `desktop-gstreamer` feature flag.

Key changes:
- Add desktop_gstreamer.rs with GStreamer WebRTC pipeline management
- Rewrite signaling protocol (ready/offer/answer/candidate over WS)
- Add leaky queues and videorate for low-latency streaming
- Rewrite ICE candidates to 127.0.0.1 for Docker connectivity
- Constrain UDP port range (30000-30100) via libnice agent
- Update TypeScript SDK desktop-stream.ts for new signaling
- Update inspector DesktopTab with WebRTC Live View
- Update Dockerfiles to install GStreamer dev packages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nathan Flurry 2026-03-16 17:54:39 -07:00
parent e638148345
commit 944ad1ba55
22 changed files with 1848 additions and 1170 deletions

40
server/compose.dev.yaml Normal file
View file

@ -0,0 +1,40 @@
# Development compose stack: Rust agent backend + Vite inspector frontend.
name: sandbox-agent-dev
services:
  backend:
    build:
      # Build from the repo root so the Dockerfile can see the whole tree.
      context: ..
      dockerfile: docker/test-agent/Dockerfile
    image: sandbox-agent-dev
    # NOTE(review): --no-token presumably disables auth for local dev — confirm.
    command: ["server", "--host", "0.0.0.0", "--port", "3000", "--no-token"]
    environment:
      # Log level is overridable from the host shell; defaults to info.
      RUST_LOG: "${RUST_LOG:-info}"
    ports:
      # Host port 2468 -> container port 3000.
      - "2468:3000"
  frontend:
    build:
      context: ..
      dockerfile: server/docker/frontend.dev.Dockerfile
    working_dir: /app
    depends_on:
      - backend
    environment:
      # The inspector reaches the backend via the compose service name.
      SANDBOX_AGENT_URL: "http://backend:3000"
    ports:
      - "5173:5173"
    volumes:
      # Bind-mount the repo at /app so source edits are visible in-container.
      - "..:/app"
      # Keep Linux-native node_modules inside the container.
      - "sa_root_node_modules:/app/node_modules"
      - "sa_inspector_node_modules:/app/frontend/packages/inspector/node_modules"
      - "sa_react_node_modules:/app/sdks/react/node_modules"
      - "sa_typescript_node_modules:/app/sdks/typescript/node_modules"
      - "sa_pnpm_store:/root/.local/share/pnpm/store"
# Named volumes backing the node_modules/pnpm-store mounts above.
volumes:
  sa_root_node_modules: {}
  sa_inspector_node_modules: {}
  sa_react_node_modules: {}
  sa_typescript_node_modules: {}
  sa_pnpm_store: {}

View file

@ -0,0 +1,5 @@
# Dev-only frontend image: the source tree is expected to be bind-mounted at
# /app (see compose.dev.yaml), so dependencies are installed at container
# start rather than at image build time.
FROM node:22-bookworm-slim
RUN npm install -g pnpm
WORKDIR /app
# Vite dev server port (published by the compose file).
EXPOSE 5173
# Install workspace deps, then serve the inspector package on all interfaces
# so the host browser can reach it through the port mapping.
CMD ["sh", "-c", "pnpm install && cd frontend/packages/inspector && npx vite --host 0.0.0.0"]

View file

@ -42,6 +42,9 @@ toml_edit.workspace = true
tar.workspace = true
zip.workspace = true
tempfile = { workspace = true, optional = true }
gstreamer = { version = "0.23", optional = true }
gstreamer-sdp = { version = "0.23", optional = true }
gstreamer-webrtc = { version = "0.23", optional = true }
[target.'cfg(unix)'.dependencies]
libc = "0.2"
@ -59,3 +62,4 @@ tokio-tungstenite = "0.24"
[features]
test-utils = ["tempfile"]
desktop-gstreamer = ["gstreamer", "gstreamer-sdp", "gstreamer-webrtc"]

View file

@ -0,0 +1,246 @@
/// GStreamer WebRTC pipeline for desktop streaming.
///
/// Creates a pipeline that captures the X11 display via `ximagesrc`, encodes to
/// VP8, and streams over WebRTC using `webrtcbin`. Signaling (SDP offer/answer,
/// ICE candidate exchange) is handled via channels that the caller bridges to
/// the client WebSocket.
#[cfg(feature = "desktop-gstreamer")]
pub mod pipeline {
use gstreamer as gst;
use gstreamer::prelude::*;
use gstreamer_sdp as gst_sdp;
use gstreamer_webrtc as gst_webrtc;
use tokio::sync::mpsc;
/// Messages sent from the GStreamer pipeline to the WebSocket handler.
#[derive(Debug)]
pub enum PipelineEvent {
    /// SDP offer generated by webrtcbin.
    Offer(String),
    /// ICE candidate produced by webrtcbin.
    IceCandidate {
        // The `candidate:` attribute string (host candidates are rewritten
        // to 127.0.0.1 before being emitted — see the on-ice-candidate hook).
        candidate: String,
        // Index of the SDP media line this candidate belongs to.
        sdp_m_line_index: u32,
    },
}
/// Messages sent from the WebSocket handler to the GStreamer pipeline.
#[derive(Debug)]
pub enum SignalingCommand {
    /// SDP answer from the client.
    Answer(String),
    /// ICE candidate from the client.
    IceCandidate {
        // Raw `candidate:` attribute string, forwarded to webrtcbin as-is.
        candidate: String,
        // Index of the SDP media line the candidate applies to.
        sdp_m_line_index: u32,
    },
}
/// Handle to a running GStreamer WebRTC pipeline.
///
/// Dropping this handle sets the pipeline to `Null` (see the `Drop` impl),
/// which also closes the command channel and ends the signaling thread.
pub struct GStreamerPipeline {
    // Top-level GStreamer pipeline created in `new`.
    pipeline: gst::Pipeline,
    // Sender feeding the signaling thread spawned in `new`.
    cmd_tx: mpsc::UnboundedSender<SignalingCommand>,
}
impl GStreamerPipeline {
    /// Create and start a new GStreamer WebRTC pipeline for the given display.
    ///
    /// Returns the pipeline handle and a receiver for pipeline events (offers,
    /// ICE candidates) that should be forwarded to the client.
    pub fn new(
        display: &str,
    ) -> Result<(Self, mpsc::UnboundedReceiver<PipelineEvent>), String> {
        // Initialize GStreamer; the error message is user-actionable because
        // init fails when the native libraries are missing.
        gst::init().map_err(|e| {
            format!(
                "Desktop streaming requires GStreamer. Install it with: \
                sandbox-agent desktop install\n\
                Error: {e}"
            )
        })?;
        // Capture X11 -> cap framerate -> convert -> VP8 encode -> RTP pay ->
        // webrtcbin. The single-buffer leaky queues drop stale frames instead
        // of buffering them, which keeps end-to-end latency low.
        let pipeline_str = format!(
            "ximagesrc display-name={display} use-damage=true show-pointer=true \
            ! video/x-raw,framerate=30/1 \
            ! videorate \
            ! videoconvert \
            ! queue max-size-buffers=1 leaky=downstream \
            ! vp8enc deadline=1 target-bitrate=3000000 cpu-used=16 threads=4 \
            keyframe-max-dist=60 end-usage=cbr buffer-size=500 buffer-initial-size=300 \
            error-resilient=partitions \
            ! rtpvp8pay picture-id-mode=15bit \
            ! queue max-size-buffers=1 leaky=downstream \
            ! application/x-rtp,media=video,encoding-name=VP8,payload=96 \
            ! webrtcbin name=wb bundle-policy=max-bundle"
        );
        let pipeline = gst::parse::launch(&pipeline_str)
            .map_err(|e| format!("failed to create GStreamer pipeline: {e}"))?
            .downcast::<gst::Pipeline>()
            .map_err(|_| "pipeline is not a GstPipeline".to_string())?;
        let webrtcbin = pipeline
            .by_name("wb")
            .ok_or_else(|| "webrtcbin element not found in pipeline".to_string())?;
        // Configure STUN for ICE connectivity (used for server-reflexive
        // candidates when behind NAT).
        webrtcbin.set_property_from_str("stun-server", "stun://stun.l.google.com:19302");
        // Restrict the UDP port range so Docker port forwarding works.
        // The ice-agent is a GstWebRTCICE which wraps a NiceAgent.
        let ice_agent: gst::glib::Object = webrtcbin.property("ice-agent");
        // GstWebRTCNice has a "min-rtp-port" and "max-rtp-port" property
        // in newer versions, but on GStreamer 1.22 we need to access the
        // underlying NiceAgent via the "agent" property.
        if ice_agent.has_property("min-rtp-port", None) {
            ice_agent.set_property("min-rtp-port", 30000u32);
            ice_agent.set_property("max-rtp-port", 30100u32);
        } else if ice_agent.has_property("agent", None) {
            let nice_agent: gst::glib::Object = ice_agent.property("agent");
            // NOTE(review): max-port is assigned before min-port — presumably
            // so the (min, max) pair stays valid at every step; confirm
            // against libnice property validation before reordering.
            nice_agent.set_property("max-port", 30100u32);
            nice_agent.set_property("min-port", 30000u32);
        }
        // Channel for pipeline -> WS handler events.
        let (event_tx, event_rx) = mpsc::unbounded_channel::<PipelineEvent>();
        // Channel for WS handler -> pipeline commands.
        let (cmd_tx, mut cmd_rx) = mpsc::unbounded_channel::<SignalingCommand>();
        // Note: Data channel for input will be created once we establish
        // the WebRTC connection. Input falls back to the WS transport.
        // When webrtcbin needs to negotiate, create an offer.
        let wb_clone = webrtcbin.clone();
        let event_tx_offer = event_tx.clone();
        webrtcbin.connect("on-negotiation-needed", false, move |_| {
            let wb_offer = wb_clone.clone();
            let wb_create = wb_clone.clone();
            let tx = event_tx_offer.clone();
            // The promise resolves on a GStreamer-internal thread with the
            // created offer: set it as the local description, then forward
            // the SDP text to the WS handler for delivery to the client.
            let promise = gst::Promise::with_change_func(move |reply| {
                let reply = match reply {
                    // Interrupted/expired promises are silently dropped.
                    Ok(Some(reply)) => reply,
                    _ => return,
                };
                let offer = match reply.value("offer") {
                    Ok(offer) => offer,
                    Err(_) => return,
                };
                let offer = offer
                    .get::<gst_webrtc::WebRTCSessionDescription>()
                    .expect("offer is WebRTCSessionDescription");
                wb_offer.emit_by_name::<()>(
                    "set-local-description",
                    &[&offer, &None::<gst::Promise>],
                );
                if let Ok(sdp_text) = offer.sdp().as_text() {
                    // Send failure here means the WS session is gone; ignore.
                    let _ = tx.send(PipelineEvent::Offer(sdp_text.to_string()));
                }
            });
            wb_create.emit_by_name::<()>("create-offer", &[&None::<gst::Structure>, &promise]);
            None
        });
        // When webrtcbin produces an ICE candidate, send it to client.
        // We rewrite host candidates to use 127.0.0.1 so the browser can
        // reach the server when running inside Docker.
        let event_tx_ice = event_tx;
        webrtcbin.connect("on-ice-candidate", false, move |values| {
            // Signal payload: values[1] = m-line index, values[2] = candidate
            // string (values[0] is the emitting element).
            let sdp_m_line_index = values[1].get::<u32>().expect("m-line index is u32");
            let candidate = values[2].get::<String>().expect("candidate is String");
            // Only forward UDP host candidates, rewritten to 127.0.0.1.
            // Skip TCP candidates (browsers rarely use TCP for WebRTC media)
            // and server-reflexive candidates (STUN responses with public IPs).
            if candidate.contains("UDP") && candidate.contains("typ host") {
                // Replace the Docker-internal IP with 127.0.0.1 so the
                // browser on the host can connect.
                let rewritten = rewrite_candidate_ip(&candidate, "127.0.0.1");
                let _ = event_tx_ice.send(PipelineEvent::IceCandidate {
                    candidate: rewritten,
                    sdp_m_line_index,
                });
            }
            None
        });
        // Start the pipeline.
        pipeline
            .set_state(gst::State::Playing)
            .map_err(|e| format!("failed to start GStreamer pipeline: {e}"))?;
        // Spawn a thread to process signaling commands from the WS handler.
        // A dedicated OS thread (not a tokio task) lets us use blocking_recv
        // without occupying the async runtime.
        let wb_cmd = webrtcbin.clone();
        std::thread::spawn(move || {
            while let Some(cmd) = cmd_rx.blocking_recv() {
                match cmd {
                    SignalingCommand::Answer(sdp_str) => {
                        let sdp = match gst_sdp::SDPMessage::parse_buffer(sdp_str.as_bytes()) {
                            Ok(sdp) => sdp,
                            Err(e) => {
                                // A bad answer is logged and skipped; the
                                // session keeps accepting further commands.
                                tracing::warn!(error = ?e, "failed to parse SDP answer");
                                continue;
                            }
                        };
                        let answer = gst_webrtc::WebRTCSessionDescription::new(
                            gst_webrtc::WebRTCSDPType::Answer,
                            sdp,
                        );
                        wb_cmd.emit_by_name::<()>(
                            "set-remote-description",
                            &[&answer, &None::<gst::Promise>],
                        );
                    }
                    SignalingCommand::IceCandidate {
                        candidate,
                        sdp_m_line_index,
                    } => {
                        wb_cmd.emit_by_name::<()>(
                            "add-ice-candidate",
                            &[&sdp_m_line_index, &candidate],
                        );
                    }
                }
            }
            // Loop (and thread) ends when cmd_tx is dropped with the handle.
        });
        Ok((Self { pipeline, cmd_tx }, event_rx))
    }

    /// Send a signaling command to the pipeline.
    ///
    /// Errors are ignored: a closed channel just means the signaling thread
    /// has already exited along with the pipeline.
    pub fn send_command(&self, cmd: SignalingCommand) {
        let _ = self.cmd_tx.send(cmd);
    }
}
impl Drop for GStreamerPipeline {
    fn drop(&mut self) {
        // Tear the pipeline down when the handle goes away. A failed state
        // change during teardown is deliberately ignored — there is nothing
        // useful to do with the error in a destructor.
        let _ = self.pipeline.set_state(gst::State::Null);
    }
}
/// Rewrite the connection address in an ICE candidate string.
///
/// ICE candidate format:
///   candidate:1 1 UDP 2015363327 172.17.0.6 39395 typ host
///
/// The connection address is the fifth space-separated field (index 4,
/// 0-indexed); everything after it (port, "typ host", extensions) is kept
/// verbatim. Candidates with fewer than six fields are returned unchanged.
///
/// Note: the previous implementation reconstructed the tail by slicing at
/// the joined-prefix length, which could glue the IP to the next field on
/// malformed input; matching the split fields directly avoids that.
fn rewrite_candidate_ip(candidate: &str, target_ip: &str) -> String {
    // splitn(6, ' ') yields at most six pieces; the sixth keeps the rest of
    // the attribute (port onward) untouched, including any extra spacing.
    let fields: Vec<&str> = candidate.splitn(6, ' ').collect();
    match fields.as_slice() {
        [foundation, component, transport, priority, _address, rest] => {
            format!("{foundation} {component} {transport} {priority} {target_ip} {rest}")
        }
        // Too few fields to be a well-formed candidate: leave it alone.
        _ => candidate.to_string(),
    }
}
}
/// Check if GStreamer support is compiled in.
///
/// Evaluated at compile time from the `desktop-gstreamer` cargo feature;
/// calling this never touches the GStreamer libraries themselves.
pub fn is_available() -> bool {
    cfg!(feature = "desktop-gstreamer")
}

View file

@ -110,6 +110,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus-x11",
"xauth",
"fonts-dejavu-core",
"libgstreamer1.0-0",
"gstreamer1.0-plugins-base",
"gstreamer1.0-plugins-good",
"gstreamer1.0-plugins-bad",
"gstreamer1.0-plugins-ugly",
"gstreamer1.0-nice",
"gstreamer1.0-x",
],
DesktopPackageManager::Dnf => vec![
"xorg-x11-server-Xvfb",
@ -121,6 +128,13 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus-x11",
"xauth",
"dejavu-sans-fonts",
"gstreamer1",
"gstreamer1-plugins-base",
"gstreamer1-plugins-good",
"gstreamer1-plugins-bad-free",
"gstreamer1-plugins-ugly-free",
"gstreamer1-plugin-libnice",
"gstreamer1-plugins-good-extras",
],
DesktopPackageManager::Apk => vec![
"xvfb",
@ -132,6 +146,12 @@ fn desktop_packages(package_manager: DesktopPackageManager, no_fonts: bool) -> V
"dbus",
"xauth",
"ttf-dejavu",
"gstreamer",
"gst-plugins-base",
"gst-plugins-good",
"gst-plugins-bad",
"gst-plugins-ugly",
"libnice-gstreamer",
],
}
.into_iter()

View file

@ -10,20 +10,20 @@ use tokio::sync::Mutex;
use sandbox_agent_error::SandboxError;
use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager};
use crate::desktop_errors::DesktopProblem;
use crate::desktop_install::desktop_platform_support_message;
use crate::desktop_recording::{DesktopRecordingContext, DesktopRecordingManager};
use crate::desktop_streaming::DesktopStreamingManager;
use crate::desktop_types::{
DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo,
DesktopKeyModifiers, DesktopKeyboardDownRequest, DesktopKeyboardPressRequest,
DesktopKeyboardTypeRequest, DesktopKeyboardUpRequest, DesktopMouseButton,
DesktopMouseClickRequest, DesktopMouseDownRequest, DesktopMouseDragRequest,
DesktopMouseMoveRequest, DesktopMousePositionResponse, DesktopMouseScrollRequest,
DesktopMouseUpRequest, DesktopProcessInfo, DesktopRecordingInfo,
DesktopRecordingListResponse, DesktopRecordingStartRequest, DesktopRegionScreenshotQuery,
DesktopResolution, DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest,
DesktopState, DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo,
DesktopActionResponse, DesktopDisplayInfoResponse, DesktopErrorInfo, DesktopKeyModifiers,
DesktopKeyboardDownRequest, DesktopKeyboardPressRequest, DesktopKeyboardTypeRequest,
DesktopKeyboardUpRequest, DesktopMouseButton, DesktopMouseClickRequest,
DesktopMouseDownRequest, DesktopMouseDragRequest, DesktopMouseMoveRequest,
DesktopMousePositionResponse, DesktopMouseScrollRequest, DesktopMouseUpRequest,
DesktopProcessInfo, DesktopRecordingInfo, DesktopRecordingListResponse,
DesktopRecordingStartRequest, DesktopRegionScreenshotQuery, DesktopResolution,
DesktopScreenshotFormat, DesktopScreenshotQuery, DesktopStartRequest, DesktopState,
DesktopStatusResponse, DesktopStreamStatusResponse, DesktopWindowInfo,
DesktopWindowListResponse,
};
use crate::process_runtime::{
@ -172,9 +172,9 @@ impl DesktopRuntime {
let recording_manager =
DesktopRecordingManager::new(process_runtime.clone(), config.state_dir.clone());
Self {
streaming_manager: DesktopStreamingManager::new(),
process_runtime,
recording_manager,
streaming_manager: DesktopStreamingManager::new(),
inner: Arc::new(Mutex::new(DesktopRuntimeStateData {
state: DesktopState::Inactive,
display_num: config.display_num,
@ -197,7 +197,10 @@ impl DesktopRuntime {
pub async fn status(&self) -> DesktopStatusResponse {
let mut state = self.inner.lock().await;
self.refresh_status_locked(&mut state).await;
self.snapshot_locked(&state)
let mut response = self.snapshot_locked(&state);
drop(state);
response
}
pub async fn start(
@ -221,7 +224,10 @@ impl DesktopRuntime {
self.refresh_status_locked(&mut state).await;
if state.state == DesktopState::Active {
return Ok(self.snapshot_locked(&state));
let mut response = self.snapshot_locked(&state);
drop(state);
return Ok(response);
}
if !state.missing_dependencies.is_empty() {
@ -307,7 +313,10 @@ impl DesktopRuntime {
),
);
Ok(self.snapshot_locked(&state))
let mut response = self.snapshot_locked(&state);
drop(state);
Ok(response)
}
pub async fn stop(&self) -> Result<DesktopStatusResponse, DesktopProblem> {
@ -336,7 +345,10 @@ impl DesktopRuntime {
state.install_command = self.install_command_for(&state.missing_dependencies);
state.environment.clear();
Ok(self.snapshot_locked(&state))
let mut response = self.snapshot_locked(&state);
drop(state);
Ok(response)
}
pub async fn shutdown(&self) {
@ -630,8 +642,23 @@ impl DesktopRuntime {
self.recording_manager.delete(id).await
}
pub async fn start_streaming(&self) -> DesktopStreamStatusResponse {
self.streaming_manager.start().await
pub async fn start_streaming(&self) -> Result<DesktopStreamStatusResponse, SandboxError> {
let state = self.inner.lock().await;
let display = state
.display
.as_deref()
.ok_or_else(|| SandboxError::Conflict {
message: "desktop runtime is not active".to_string(),
})?;
let resolution = state
.resolution
.clone()
.ok_or_else(|| SandboxError::Conflict {
message: "desktop runtime is not active".to_string(),
})?;
let display = display.to_string();
drop(state);
Ok(self.streaming_manager.start(&display, resolution).await)
}
pub async fn stop_streaming(&self) -> DesktopStreamStatusResponse {
@ -639,7 +666,17 @@ impl DesktopRuntime {
}
pub async fn ensure_streaming_active(&self) -> Result<(), SandboxError> {
self.streaming_manager.ensure_active().await
if self.streaming_manager.is_active().await {
Ok(())
} else {
Err(SandboxError::Conflict {
message: "desktop streaming is not active".to_string(),
})
}
}
pub fn streaming_manager(&self) -> &DesktopStreamingManager {
&self.streaming_manager
}
async fn recording_context(&self) -> Result<DesktopRecordingContext, SandboxError> {
@ -831,8 +868,14 @@ impl DesktopRuntime {
name: &str,
) -> Result<(), DesktopProblem> {
let process_id = match name {
"Xvfb" => state.xvfb.as_ref().map(|process| process.process_id.clone()),
"openbox" => state.openbox.as_ref().map(|process| process.process_id.clone()),
"Xvfb" => state
.xvfb
.as_ref()
.map(|process| process.process_id.clone()),
"openbox" => state
.openbox
.as_ref()
.map(|process| process.process_id.clone()),
_ => None,
};

View file

@ -2,9 +2,7 @@ use std::sync::Arc;
use tokio::sync::Mutex;
use sandbox_agent_error::SandboxError;
use crate::desktop_types::DesktopStreamStatusResponse;
use crate::desktop_types::{DesktopResolution, DesktopStreamStatusResponse};
#[derive(Debug, Clone)]
pub struct DesktopStreamingManager {
@ -14,6 +12,8 @@ pub struct DesktopStreamingManager {
#[derive(Debug, Default)]
struct DesktopStreamingState {
active: bool,
display: Option<String>,
resolution: Option<DesktopResolution>,
}
impl DesktopStreamingManager {
@ -23,25 +23,46 @@ impl DesktopStreamingManager {
}
}
pub async fn start(&self) -> DesktopStreamStatusResponse {
/// Mark desktop streaming as active for the given display and resolution.
///
/// The actual GStreamer pipeline is created per-WebSocket-session in the
/// signaling handler — this method just records that streaming is enabled.
pub async fn start(
&self,
display: &str,
resolution: DesktopResolution,
) -> DesktopStreamStatusResponse {
let mut state = self.inner.lock().await;
if state.active {
return DesktopStreamStatusResponse { active: true };
}
state.active = true;
state.display = Some(display.to_string());
state.resolution = Some(resolution);
DesktopStreamStatusResponse { active: true }
}
/// Stop streaming and clear state.
pub async fn stop(&self) -> DesktopStreamStatusResponse {
let mut state = self.inner.lock().await;
state.active = false;
state.display = None;
state.resolution = None;
DesktopStreamStatusResponse { active: false }
}
pub async fn ensure_active(&self) -> Result<(), SandboxError> {
if self.inner.lock().await.active {
Ok(())
} else {
Err(SandboxError::Conflict {
message: "desktop streaming is not active".to_string(),
})
}
pub async fn is_active(&self) -> bool {
self.inner.lock().await.active
}
pub async fn resolution(&self) -> Option<DesktopResolution> {
self.inner.lock().await.resolution.clone()
}
pub async fn display_name(&self) -> Option<String> {
self.inner.lock().await.display.clone()
}
}

View file

@ -4,6 +4,7 @@ mod acp_proxy_runtime;
pub mod cli;
pub mod daemon;
mod desktop_errors;
mod desktop_gstreamer;
mod desktop_install;
mod desktop_recording;
mod desktop_runtime;

View file

@ -41,9 +41,9 @@ use crate::desktop_errors::DesktopProblem;
use crate::desktop_runtime::DesktopRuntime;
use crate::desktop_types::*;
use crate::process_runtime::{
decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream, ProcessOwner as RuntimeProcessOwner,
ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot, ProcessStartSpec, ProcessStatus,
ProcessStream, RunSpec,
decode_input_bytes, ProcessLogFilter, ProcessLogFilterStream,
ProcessOwner as RuntimeProcessOwner, ProcessRuntime, ProcessRuntimeConfig, ProcessSnapshot,
ProcessStartSpec, ProcessStatus, ProcessStream, RunSpec,
};
use crate::ui;
@ -235,7 +235,7 @@ pub fn build_router_with_state(shared: Arc<AppState>) -> (Router, Arc<AppState>)
)
.route("/desktop/stream/start", post(post_v1_desktop_stream_start))
.route("/desktop/stream/stop", post(post_v1_desktop_stream_stop))
.route("/desktop/stream/ws", get(get_v1_desktop_stream_ws))
.route("/desktop/stream/signaling", get(get_v1_desktop_stream_ws))
.route("/agents", get(get_v1_agents))
.route("/agents/:agent", get(get_v1_agent))
.route("/agents/:agent/install", post(post_v1_agent_install))
@ -1135,9 +1135,11 @@ async fn get_v1_desktop_recording_download(
Path(id): Path<String>,
) -> Result<Response, ApiError> {
let path = state.desktop_runtime().recording_download_path(&id).await?;
let bytes = tokio::fs::read(&path).await.map_err(|err| SandboxError::StreamError {
message: format!("failed to read desktop recording {}: {err}", path.display()),
})?;
let bytes = tokio::fs::read(&path)
.await
.map_err(|err| SandboxError::StreamError {
message: format!("failed to read desktop recording {}: {err}", path.display()),
})?;
Ok(([(header::CONTENT_TYPE, "video/mp4")], Bytes::from(bytes)).into_response())
}
@ -1179,7 +1181,7 @@ async fn delete_v1_desktop_recording(
async fn post_v1_desktop_stream_start(
State(state): State<Arc<AppState>>,
) -> Result<Json<DesktopStreamStatusResponse>, ApiError> {
Ok(Json(state.desktop_runtime().start_streaming().await))
Ok(Json(state.desktop_runtime().start_streaming().await?))
}
/// Stop desktop streaming.
@ -1199,13 +1201,14 @@ async fn post_v1_desktop_stream_stop(
Ok(Json(state.desktop_runtime().stop_streaming().await))
}
/// Open a desktop websocket streaming session.
/// Open a desktop WebRTC signaling session.
///
/// Upgrades the connection to a websocket that streams JPEG desktop frames and
/// accepts mouse and keyboard control frames.
/// Upgrades the connection to a WebSocket used for WebRTC signaling between
/// the browser client and the desktop streaming process. Also accepts mouse
/// and keyboard input frames as a fallback transport.
#[utoipa::path(
get,
path = "/v1/desktop/stream/ws",
path = "/v1/desktop/stream/signaling",
tag = "v1",
params(
("access_token" = Option<String>, Query, description = "Bearer token alternative for WS auth")
@ -2449,46 +2452,6 @@ enum TerminalClientFrame {
Close,
}
#[derive(Debug, Deserialize)]
#[serde(tag = "type", rename_all = "camelCase")]
enum DesktopStreamClientFrame {
MoveMouse {
x: i32,
y: i32,
},
MouseDown {
#[serde(default)]
x: Option<i32>,
#[serde(default)]
y: Option<i32>,
#[serde(default)]
button: Option<DesktopMouseButton>,
},
MouseUp {
#[serde(default)]
x: Option<i32>,
#[serde(default)]
y: Option<i32>,
#[serde(default)]
button: Option<DesktopMouseButton>,
},
Scroll {
x: i32,
y: i32,
#[serde(default)]
delta_x: Option<i32>,
#[serde(default)]
delta_y: Option<i32>,
},
KeyDown {
key: String,
},
KeyUp {
key: String,
},
Close,
}
async fn process_terminal_ws_session(
mut socket: WebSocket,
runtime: Arc<ProcessRuntime>,
@ -2601,22 +2564,38 @@ async fn process_terminal_ws_session(
}
}
async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc<DesktopRuntime>) {
let display_info = match desktop_runtime.display_info().await {
Ok(info) => info,
Err(err) => {
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
let _ = socket.close().await;
return;
}
};
/// WebRTC signaling and input session.
///
/// Handles WebRTC signaling (SDP offer/answer, ICE candidate exchange) and
/// accepts mouse/keyboard input as a fallback transport when the WebRTC data
/// channel is not established. When compiled with the `desktop-gstreamer`
/// feature, creates a GStreamer pipeline for real video streaming.
async fn desktop_stream_ws_session(mut ws: WebSocket, desktop_runtime: Arc<DesktopRuntime>) {
let streaming = desktop_runtime.streaming_manager();
// Get resolution for the ready message.
let resolution =
streaming
.resolution()
.await
.unwrap_or(crate::desktop_types::DesktopResolution {
width: 1440,
height: 900,
dpi: None,
});
let x_display = streaming
.display_name()
.await
.unwrap_or_else(|| ":99".to_string());
// Send stream metadata immediately.
if send_ws_json(
&mut socket,
&mut ws,
json!({
"type": "ready",
"width": display_info.resolution.width,
"height": display_info.resolution.height,
"width": resolution.width,
"height": resolution.height,
}),
)
.await
@ -2625,109 +2604,270 @@ async fn desktop_stream_ws_session(mut socket: WebSocket, desktop_runtime: Arc<D
return;
}
let mut frame_tick = tokio::time::interval(Duration::from_millis(100));
// Try to create a GStreamer WebRTC pipeline for real video streaming.
#[cfg(feature = "desktop-gstreamer")]
{
use crate::desktop_gstreamer::pipeline::GStreamerPipeline;
match GStreamerPipeline::new(&x_display) {
Ok((pipeline, mut event_rx)) => {
tracing::info!(display = %x_display, "GStreamer WebRTC pipeline started");
// Run the session with the GStreamer pipeline active.
desktop_stream_ws_loop_gstreamer(
&mut ws,
&desktop_runtime,
&pipeline,
&mut event_rx,
)
.await;
// Pipeline is dropped here, stopping GStreamer.
let _ = ws.close().await;
return;
}
Err(e) => {
tracing::warn!(error = %e, "GStreamer pipeline creation failed");
let _ = send_ws_error(&mut ws, &e).await;
}
}
}
// Fallback: run without GStreamer (input-only, no video).
desktop_stream_ws_loop_simple(&mut ws, &desktop_runtime).await;
let _ = ws.close().await;
}
/// Inner WS message loop — input-only, no GStreamer pipeline.
///
/// Pulls frames off the socket one at a time and hands each to
/// `handle_ws_message_simple`, returning once the handler reports that the
/// session should close.
async fn desktop_stream_ws_loop_simple(ws: &mut WebSocket, desktop_runtime: &Arc<DesktopRuntime>) {
    loop {
        let incoming = ws.recv().await;
        let keep_going = handle_ws_message_simple(incoming, ws, desktop_runtime).await;
        if !keep_going {
            return;
        }
    }
}
/// Inner WS message loop with GStreamer pipeline — polls both pipeline events
/// and client WS messages.
#[cfg(feature = "desktop-gstreamer")]
async fn desktop_stream_ws_loop_gstreamer(
ws: &mut WebSocket,
desktop_runtime: &Arc<DesktopRuntime>,
pipeline: &crate::desktop_gstreamer::pipeline::GStreamerPipeline,
event_rx: &mut tokio::sync::mpsc::UnboundedReceiver<
crate::desktop_gstreamer::pipeline::PipelineEvent,
>,
) {
use crate::desktop_gstreamer::pipeline::{PipelineEvent, SignalingCommand};
loop {
tokio::select! {
ws_in = socket.recv() => {
match ws_in {
Some(Ok(Message::Text(text))) => {
match serde_json::from_str::<DesktopStreamClientFrame>(&text) {
Ok(DesktopStreamClientFrame::MoveMouse { x, y }) => {
if let Err(err) = desktop_runtime
.move_mouse(DesktopMouseMoveRequest { x, y })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::MouseDown { x, y, button }) => {
if let Err(err) = desktop_runtime
.mouse_down(DesktopMouseDownRequest { x, y, button })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::MouseUp { x, y, button }) => {
if let Err(err) = desktop_runtime
.mouse_up(DesktopMouseUpRequest { x, y, button })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::Scroll { x, y, delta_x, delta_y }) => {
if let Err(err) = desktop_runtime
.scroll_mouse(DesktopMouseScrollRequest {
x,
y,
delta_x,
delta_y,
})
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::KeyDown { key }) => {
if let Err(err) = desktop_runtime
.key_down(DesktopKeyboardDownRequest { key })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::KeyUp { key }) => {
if let Err(err) = desktop_runtime
.key_up(DesktopKeyboardUpRequest { key })
.await
{
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
}
}
Ok(DesktopStreamClientFrame::Close) => {
let _ = socket.close().await;
break;
}
Err(err) => {
let _ = send_ws_error(&mut socket, &format!("invalid desktop stream frame: {err}")).await;
}
}
}
Some(Ok(Message::Ping(payload))) => {
let _ = socket.send(Message::Pong(payload)).await;
}
Some(Ok(Message::Close(_))) | None => break,
Some(Ok(Message::Binary(_))) | Some(Ok(Message::Pong(_))) => {}
Some(Err(_)) => break,
}
}
_ = frame_tick.tick() => {
let frame = desktop_runtime
.screenshot(DesktopScreenshotQuery {
format: Some(DesktopScreenshotFormat::Jpeg),
quality: Some(60),
scale: Some(1.0),
})
.await;
match frame {
Ok(frame) => {
if socket.send(Message::Binary(frame.bytes.into())).await.is_err() {
pipeline_event = event_rx.recv() => {
match pipeline_event {
Some(PipelineEvent::Offer(sdp)) => {
if send_ws_json(ws, json!({"type": "offer", "sdp": sdp})).await.is_err() {
break;
}
}
Err(err) => {
let _ = send_ws_error(&mut socket, &err.to_error_info().message).await;
let _ = socket.close().await;
break;
Some(PipelineEvent::IceCandidate { candidate, sdp_m_line_index }) => {
if send_ws_json(ws, json!({
"type": "candidate",
"candidate": candidate,
"sdpMLineIndex": sdp_m_line_index,
})).await.is_err() {
break;
}
}
None => break,
}
}
ws_msg = ws.recv() => {
match ws_msg {
Some(Ok(Message::Text(text))) => {
let parsed: Value = match serde_json::from_str(&text) {
Ok(v) => v,
Err(_) => continue,
};
match parsed.get("type").and_then(|v| v.as_str()) {
Some("answer") => {
if let Some(sdp) = parsed.get("sdp").and_then(|v| v.as_str()) {
pipeline.send_command(SignalingCommand::Answer(sdp.to_string()));
}
}
Some("candidate") => {
if let Some(candidate) = parsed.get("candidate").and_then(|v| v.as_str()) {
let sdp_m_line_index = parsed
.get("sdpMLineIndex")
.and_then(|v| v.as_u64())
.unwrap_or(0) as u32;
pipeline.send_command(SignalingCommand::IceCandidate {
candidate: candidate.to_string(),
sdp_m_line_index,
});
}
}
// Input messages (fallback transport)
Some("moveMouse") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let _ = desktop_runtime
.move_mouse(DesktopMouseMoveRequest { x: x as i32, y: y as i32 })
.await;
}
}
Some("mouseDown") => {
let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.mouse_down(DesktopMouseDownRequest { x, y, button }).await;
}
Some("mouseUp") => {
let button = parsed.get("button").and_then(|v| serde_json::from_value(v.clone()).ok());
let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.mouse_up(DesktopMouseUpRequest { x, y, button }).await;
}
Some("scroll") => {
if let (Some(x), Some(y)) = (
parsed.get("x").and_then(|v| v.as_i64()),
parsed.get("y").and_then(|v| v.as_i64()),
) {
let dx = parsed.get("deltaX").and_then(|v| v.as_i64()).map(|v| v as i32);
let dy = parsed.get("deltaY").and_then(|v| v.as_i64()).map(|v| v as i32);
let _ = desktop_runtime.scroll_mouse(DesktopMouseScrollRequest { x: x as i32, y: y as i32, delta_x: dx, delta_y: dy }).await;
}
}
Some("keyDown") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime.key_down(DesktopKeyboardDownRequest { key: key.to_string() }).await;
}
}
Some("keyUp") => {
if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
let _ = desktop_runtime.key_up(DesktopKeyboardUpRequest { key: key.to_string() }).await;
}
}
_ => {}
}
}
Some(Ok(Message::Ping(payload))) => {
let _ = ws.send(Message::Pong(payload)).await;
}
Some(Ok(Message::Close(_))) | None | Some(Err(_)) => break,
_ => {}
}
}
}
}
}
/// Process a single WebSocket message (no pipeline). Returns false to close.
///
/// Text frames carry JSON with a `type` discriminator. Input frames
/// (mouse/keyboard) are dispatched to the desktop runtime best-effort —
/// dispatch errors are deliberately ignored. WebRTC signaling frames are
/// accepted without error but do nothing here, since no pipeline exists in
/// this mode. Malformed JSON and unknown `type` values are skipped and the
/// session stays open.
async fn handle_ws_message_simple(
    msg: Option<Result<Message, axum::Error>>,
    ws: &mut WebSocket,
    desktop_runtime: &Arc<DesktopRuntime>,
) -> bool {
    match msg {
        Some(Ok(Message::Text(text))) => {
            let parsed: Value = match serde_json::from_str(&text) {
                Ok(v) => v,
                // Not JSON: ignore the frame, keep the session alive.
                Err(_) => return true,
            };
            match parsed.get("type").and_then(|v| v.as_str()) {
                // --- Input messages (fallback transport) ---
                Some("moveMouse") => {
                    // Both coordinates are required; otherwise drop the frame.
                    if let (Some(x), Some(y)) = (
                        parsed.get("x").and_then(|v| v.as_i64()),
                        parsed.get("y").and_then(|v| v.as_i64()),
                    ) {
                        let _ = desktop_runtime
                            .move_mouse(DesktopMouseMoveRequest {
                                x: x as i32,
                                y: y as i32,
                            })
                            .await;
                    }
                }
                Some("mouseDown") => {
                    // Button and coordinates are all optional here.
                    let button = parsed
                        .get("button")
                        .and_then(|v| serde_json::from_value(v.clone()).ok());
                    let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
                    let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
                    let _ = desktop_runtime
                        .mouse_down(DesktopMouseDownRequest { x, y, button })
                        .await;
                }
                Some("mouseUp") => {
                    let button = parsed
                        .get("button")
                        .and_then(|v| serde_json::from_value(v.clone()).ok());
                    let x = parsed.get("x").and_then(|v| v.as_i64()).map(|v| v as i32);
                    let y = parsed.get("y").and_then(|v| v.as_i64()).map(|v| v as i32);
                    let _ = desktop_runtime
                        .mouse_up(DesktopMouseUpRequest { x, y, button })
                        .await;
                }
                Some("scroll") => {
                    if let (Some(x), Some(y)) = (
                        parsed.get("x").and_then(|v| v.as_i64()),
                        parsed.get("y").and_then(|v| v.as_i64()),
                    ) {
                        // Deltas are optional; missing deltas pass as None.
                        let delta_x = parsed
                            .get("deltaX")
                            .and_then(|v| v.as_i64())
                            .map(|v| v as i32);
                        let delta_y = parsed
                            .get("deltaY")
                            .and_then(|v| v.as_i64())
                            .map(|v| v as i32);
                        let _ = desktop_runtime
                            .scroll_mouse(DesktopMouseScrollRequest {
                                x: x as i32,
                                y: y as i32,
                                delta_x,
                                delta_y,
                            })
                            .await;
                    }
                }
                Some("keyDown") => {
                    if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
                        let _ = desktop_runtime
                            .key_down(DesktopKeyboardDownRequest {
                                key: key.to_string(),
                            })
                            .await;
                    }
                }
                Some("keyUp") => {
                    if let Some(key) = parsed.get("key").and_then(|v| v.as_str()) {
                        let _ = desktop_runtime
                            .key_up(DesktopKeyboardUpRequest {
                                key: key.to_string(),
                            })
                            .await;
                    }
                }
                // --- WebRTC signaling messages (accepted without error) ---
                Some("answer") | Some("candidate") | Some("offer") => {}
                _ => {}
            }
            true
        }
        Some(Ok(Message::Ping(payload))) => {
            // Answer keepalive pings so intermediaries keep the WS open.
            let _ = ws.send(Message::Pong(payload)).await;
            true
        }
        // Explicit close, closed stream, or transport error ends the session.
        Some(Ok(Message::Close(_))) | None | Some(Err(_)) => false,
        // Binary / Pong frames: nothing to do.
        _ => true,
    }
}
async fn send_ws_json(socket: &mut WebSocket, payload: Value) -> Result<(), ()> {
socket
.send(Message::Text(

View file

@ -432,7 +432,7 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
assert_eq!(status, StatusCode::OK);
assert_eq!(parse_json(&body)["active"], true);
let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/ws"))
let (mut ws, _) = connect_async(test_app.app.ws_url("/v1/desktop/stream/signaling"))
.await
.expect("connect desktop stream websocket");
@ -447,12 +447,9 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
other => panic!("expected text ready frame, got {other:?}"),
}
let frame = recv_ws_message(&mut ws).await;
match frame {
Message::Binary(bytes) => assert!(bytes.starts_with(&[0xff, 0xd8, 0xff])),
other => panic!("expected binary jpeg frame, got {other:?}"),
}
// The signaling WebSocket now accepts input frames as fallback transport
// (when the WebRTC data channel is not established). Send a mouse move to
// verify input dispatch still works over the signaling channel.
ws.send(Message::Text(
json!({
"type": "moveMouse",
@ -464,6 +461,20 @@ async fn v1_desktop_lifecycle_and_actions_work_with_real_runtime() {
))
.await
.expect("send desktop stream mouse move");
// Send a WebRTC signaling message (offer) to verify the signaling path
// accepts it without error.
ws.send(Message::Text(
json!({
"type": "offer",
"sdp": "v=0\r\n"
})
.to_string()
.into(),
))
.await
.expect("send desktop stream offer");
let _ = ws.close(None).await;
let (status, _, body) = send_request(