From 03dfd6b6ea28ae5a587e7fee596536cf49a028b2 Mon Sep 17 00:00:00 2001 From: Harivansh Rathi Date: Tue, 24 Mar 2026 21:40:29 -0400 Subject: [PATCH] Phase 6: utility commands, SKILL.md, AGENTS.md, README.md - Implement screen_size via xcap Monitor, mouse_position via x11rb query_pointer, standalone screenshot with optional annotation, launch for spawning detached processes - Handler dispatchers for get-screen-size, get-mouse-position, screenshot, launch - SKILL.md agent discovery file with allowed-tools frontmatter - AGENTS.md contributor guidelines for AI agents - README.md with installation, quick start, architecture overview --- AGENTS.md | 40 +++++++++++++++ README.md | 53 +++++++++++++++++++ SKILL.md | 116 ++++++++++++++++++++++++++++++++++++++++++ src/backend/x11.rs | 65 ++++++++++++++++++++--- src/daemon/handler.rs | 73 ++++++++++++++++++++++++++ 5 files changed, 339 insertions(+), 8 deletions(-) create mode 100644 AGENTS.md create mode 100644 README.md create mode 100644 SKILL.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..ad4756a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,40 @@ +# Agent Guidelines + +## Build + +```bash +cargo build +cargo clippy +``` + +## Run + +Requires an X11 session with `DISPLAY` set. + +```bash +cargo run -- snapshot +cargo run -- --json snapshot --annotate +``` + +## Code Style + +- No emojis in code or comments +- Use `anyhow::Result` for all fallible functions +- All daemon handler functions are async +- Match field naming between CLI args, NDJSON protocol, and JSON output + +## Architecture + +- `src/cli/` - clap CLI parser and client-side socket connection +- `src/daemon/` - tokio async daemon, request handler, state management +- `src/backend/` - DesktopBackend trait and X11 implementation +- `src/core/` - shared types, protocol, ref map, session detection + +## Adding a New Command + +1. Add the variant to `Command` enum in `src/cli/mod.rs` +2. 
Add request building in `build_request()` in `src/cli/mod.rs` +3. Add the action handler in `src/daemon/handler.rs` +4. Add the backend method to `DesktopBackend` trait in `src/backend/mod.rs` +5. Implement in `src/backend/x11.rs` +6. Update `SKILL.md` diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d7ba1a --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# desktop-ctl + +Desktop control CLI for AI agents on Linux X11. A single installable binary that gives agents full desktop access: screenshots with window refs, mouse/keyboard input, and window management. + +Inspired by [agent-browser](https://github.com/vercel-labs/agent-browser) - but for the full desktop. + +## Install + +```bash +cargo install desktop-ctl +``` + +System dependencies (Debian/Ubuntu): +```bash +sudo apt install libxcb-dev libxrandr-dev libclang-dev +``` + +## Quick Start + +```bash +# See the desktop +desktop-ctl snapshot + +# Click a window +desktop-ctl click @w1 + +# Type text +desktop-ctl type "hello world" + +# Focus by name +desktop-ctl focus "firefox" +``` + +## Architecture + +Client-daemon architecture over Unix sockets (NDJSON wire protocol). The daemon starts automatically on first command and keeps the X11 connection alive for fast repeated calls. + +``` +Agent -> desktop-ctl CLI (thin client) -> Unix socket -> desktop-ctl daemon -> X11 +``` + +## Requirements + +- Linux with X11 session +- Rust 1.75+ (for building) + +## Wayland Support + +Coming in v0.2. The trait-based backend design means adding Hyprland/Wayland support is a single trait implementation with zero refactoring of the core. 
+ +## License + +MIT OR Apache-2.0 diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..d777fd4 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,116 @@ +--- +name: desktop-ctl +description: Desktop control CLI for AI agents - screenshot, click, type, window management on Linux X11 +allowed-tools: Bash(desktop-ctl:*) +--- + +# desktop-ctl + +Desktop control CLI for AI agents on Linux X11. Provides a unified interface for screenshots, mouse/keyboard input, and window management with compact `@wN` window references. + +## Core Workflow + +1. **Snapshot** to see the desktop and get window refs +2. **Act** using refs or coordinates (click, type, focus) +3. **Repeat** as needed + +## Quick Reference + +### See the Desktop + +```bash +desktop-ctl snapshot # Screenshot + window tree with @wN refs +desktop-ctl snapshot --annotate # Screenshot with bounding boxes and labels +desktop-ctl snapshot --json # Structured JSON output +desktop-ctl list-windows # Window tree without screenshot +desktop-ctl screenshot /tmp/s.png # Screenshot only (no window tree) +``` + +### Click and Type + +```bash +desktop-ctl click @w1 # Click center of window @w1 +desktop-ctl click 500,300 # Click absolute coordinates +desktop-ctl dblclick @w2 # Double-click window @w2 +desktop-ctl type "hello world" # Type text into focused window +desktop-ctl press enter # Press a key +desktop-ctl hotkey ctrl c # Send Ctrl+C +desktop-ctl hotkey ctrl shift t # Send Ctrl+Shift+T +``` + +### Mouse Control + +```bash +desktop-ctl mouse move 500 300 # Move cursor to coordinates +desktop-ctl mouse scroll 3 # Scroll down 3 units +desktop-ctl mouse scroll -3 # Scroll up 3 units +desktop-ctl mouse drag 100 100 500 500 # Drag from (100,100) to (500,500) +``` + +### Window Management + +```bash +desktop-ctl focus @w2 # Focus window by ref +desktop-ctl focus "firefox" # Focus window by name (substring match) +desktop-ctl close @w3 # Close window gracefully +desktop-ctl move-window @w1 100 200 # Move window to 
position +desktop-ctl resize-window @w1 800 600 # Resize window +``` + +### Utilities + +```bash +desktop-ctl get-screen-size # Screen resolution +desktop-ctl get-mouse-position # Current cursor position +desktop-ctl launch firefox # Launch an application +desktop-ctl launch code -- --new-window # Launch with arguments +``` + +### Daemon + +```bash +desktop-ctl daemon start # Start daemon manually +desktop-ctl daemon stop # Stop daemon +desktop-ctl daemon status # Check daemon status +``` + +## Global Options + +- `--json` : Output as structured JSON (all commands) +- `--session NAME` : Session name for multiple daemon instances (default: "default") +- `--socket PATH` : Custom Unix socket path + +## Window Refs + +After `snapshot` or `list-windows`, windows are assigned short refs: +- `@w1` is the topmost (usually focused) window +- `@w2`, `@w3`, etc. follow z-order (front to back) +- Refs reset on each `snapshot` call +- Use `--json` to see stable `xcb_id` for programmatic tracking + +## Example Agent Workflow + +```bash +# 1. See what's on screen +desktop-ctl snapshot --annotate + +# 2. Focus the browser +desktop-ctl focus "firefox" + +# 3. Navigate to a URL +desktop-ctl hotkey ctrl l +desktop-ctl type "https://example.com" +desktop-ctl press enter + +# 4. Take a new snapshot to see the result +desktop-ctl snapshot +``` + +## Key Names for press/hotkey + +- Modifiers: `ctrl`, `alt`, `shift`, `super` +- Navigation: `enter`, `tab`, `escape`, `backspace`, `delete`, `space` +- Arrows: `up`, `down`, `left`, `right` +- Page: `home`, `end`, `pageup`, `pagedown` +- Function: `f1` through `f12` +- Characters: any single character (e.g. 
`a`, `1`, `/`)
diff --git a/src/backend/x11.rs b/src/backend/x11.rs
index d4513fd..502a4d1 100644
--- a/src/backend/x11.rs
+++ b/src/backend/x11.rs
@@ -288,22 +288,89 @@ impl super::DesktopBackend for X11Backend {
         Ok(())
     }
 
-    // Phase 6: utility stubs
-
+    /// Returns `(width, height)` of the first monitor enumerated by `xcap`.
     fn screen_size(&self) -> Result<(u32, u32)> {
-        anyhow::bail!("Utility commands not yet implemented (Phase 6)")
+        let monitors = xcap::Monitor::all().context("Failed to enumerate monitors")?;
+        let monitor = monitors.into_iter().next().context("No monitor found")?;
+        let w = monitor.width().context("Failed to get monitor width")?;
+        let h = monitor.height().context("Failed to get monitor height")?;
+        Ok((w, h))
     }
 
+    /// Returns `(root_x, root_y)` from an X11 `query_pointer` reply for `self.root`.
     fn mouse_position(&self) -> Result<(i32, i32)> {
-        anyhow::bail!("Utility commands not yet implemented (Phase 6)")
+        let reply = self
+            .conn
+            .query_pointer(self.root)?
+            .reply()
+            .context("Failed to query pointer")?;
+        Ok((reply.root_x as i32, reply.root_y as i32))
     }
 
-    fn screenshot(&mut self, _path: &str, _annotate: bool) -> Result {
-        anyhow::bail!("Standalone screenshot not yet implemented (Phase 6)")
+    /// Captures the first monitor enumerated by `xcap`, saves it to `path`, and
+    /// returns `path`.
+    ///
+    /// With `annotate`, windows that have a title or app name are numbered `w1`,
+    /// `w2`, ... in `xcap` enumeration order and passed to `annotate_screenshot`.
+    /// A failure to list windows is ignored; annotation then runs over an empty list.
+    fn screenshot(&mut self, path: &str, annotate: bool) -> Result {
+        let monitors = xcap::Monitor::all().context("Failed to enumerate monitors")?;
+        let monitor = monitors.into_iter().next().context("No monitor found")?;
+
+        let mut image = monitor
+            .capture_image()
+            .context("Failed to capture screenshot")?;
+
+        if annotate {
+            let windows = xcap::Window::all().unwrap_or_default();
+            let mut window_infos = Vec::new();
+            let mut ref_counter = 1usize;
+            for win in &windows {
+                let title = win.title().unwrap_or_default();
+                let app_name = win.app_name().unwrap_or_default();
+                if title.is_empty() && app_name.is_empty() {
+                    continue;
+                }
+                window_infos.push(crate::core::types::WindowInfo {
+                    ref_id: format!("w{ref_counter}"),
+                    xcb_id: win.id().unwrap_or(0),
+                    title,
+                    app_name,
+                    x: win.x().unwrap_or(0),
+                    y: win.y().unwrap_or(0),
+                    width: win.width().unwrap_or(0),
+                    height: win.height().unwrap_or(0),
+                    focused: win.is_focused().unwrap_or(false),
+                    minimized: win.is_minimized().unwrap_or(false),
+                });
+                ref_counter += 1;
+            }
+            annotate_screenshot(&mut image, &window_infos);
+        }
+
+        image.save(path).context("Failed to save screenshot")?;
+        Ok(path.to_string())
     }
 
-    fn launch(&self, _command: &str, _args: &[String]) -> Result {
-        anyhow::bail!("Launch not yet implemented (Phase 6)")
+    /// Spawns `command` with `args`, stdin/stdout/stderr redirected to null, and
+    /// returns the child's PID.
+    ///
+    /// A background thread waits on the child so that it is reaped when it exits.
+    fn launch(&self, command: &str, args: &[String]) -> Result {
+        let mut child = std::process::Command::new(command)
+            .args(args)
+            .stdin(std::process::Stdio::null())
+            .stdout(std::process::Stdio::null())
+            .stderr(std::process::Stdio::null())
+            .spawn()
+            .with_context(|| format!("Failed to launch: {command}"))?;
+        let pid = child.id();
+        // Dropping a `Child` neither kills nor waits for it; without this wait an
+        // exited child would linger as a zombie for as long as the daemon runs.
+        std::thread::spawn(move || {
+            let _ = child.wait();
+        });
+        Ok(pid)
     }
 }
 
diff --git a/src/daemon/handler.rs b/src/daemon/handler.rs
index 80cded1..5efb886 100644
--- a/src/daemon/handler.rs
+++ b/src/daemon/handler.rs
@@ -25,6 +25,10 @@ pub async fn handle_request(
         "move-window" => handle_move_window(request, state).await,
         "resize-window" => handle_resize_window(request, state).await,
         "list-windows" => handle_list_windows(state).await,
+        "get-screen-size" => handle_get_screen_size(state).await,
+        "get-mouse-position" => handle_get_mouse_position(state).await,
+        "screenshot" => handle_screenshot(request, state).await,
+        "launch" => handle_launch(request, state).await,
         action => Response::err(format!("Unknown action: {action}")),
     }
 }
@@ -372,6 +376,91 @@ async fn handle_list_windows(
     }
 }
 
+/// Handles `get-screen-size`: replies with `{"width", "height"}` from the backend.
+async fn handle_get_screen_size(state: &Arc>) -> Response {
+    let state = state.lock().await;
+    match state.backend.screen_size() {
+        Ok((w, h)) => Response::ok(serde_json::json!({"width": w, "height": h})),
+        // `{e:#}` prints the whole error context chain, not just the outermost message.
+        Err(e) => Response::err(format!("Failed: {e:#}")),
+    }
+}
+
+/// Handles `get-mouse-position`: replies with `{"x", "y"}` from the backend.
+async fn handle_get_mouse_position(state: &Arc>) -> Response {
+    let state = state.lock().await;
+    match state.backend.mouse_position() {
+        Ok((x, y)) => Response::ok(serde_json::json!({"x": x, "y": y})),
+        Err(e) => Response::err(format!("Failed: {e:#}")),
+    }
+}
+
+/// Handles `screenshot`: saves a capture and replies with `{"screenshot": <path>}`.
+///
+/// Reads optional `annotate` (bool, default `false`) and `path` (string) from the
+/// request; without `path` the file is `/tmp/desktop-ctl-<unix-millis>.png`.
+async fn handle_screenshot(
+    request: &Request,
+    state: &Arc>,
+) -> Response {
+    let annotate = request
+        .extra
+        .get("annotate")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+    let path = request
+        .extra
+        .get("path")
+        .and_then(|v| v.as_str())
+        .map(|s| s.to_string())
+        .unwrap_or_else(|| {
+            let ts = std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_millis();
+            format!("/tmp/desktop-ctl-{ts}.png")
+        });
+    let mut state = state.lock().await;
+    match state.backend.screenshot(&path, annotate) {
+        Ok(saved) => Response::ok(serde_json::json!({"screenshot": saved})),
+        Err(e) => Response::err(format!("Screenshot failed: {e:#}")),
+    }
+}
+
+/// Handles `launch`: spawns `command` with optional `args` and replies with
+/// `{"pid", "command"}`.
+///
+/// `command` must be a string. `args`, when present and not null, must be an array
+/// of strings; anything else is rejected instead of being silently dropped, so the
+/// program never runs with a different argument list than the one requested.
+async fn handle_launch(
+    request: &Request,
+    state: &Arc>,
+) -> Response {
+    let command = match request.extra.get("command").and_then(|v| v.as_str()) {
+        Some(c) => c.to_string(),
+        None => return Response::err("Missing 'command' field"),
+    };
+    let mut args = Vec::new();
+    if let Some(value) = request.extra.get("args").filter(|v| !v.is_null()) {
+        let items = match value.as_array() {
+            Some(items) => items,
+            None => return Response::err("'args' must be an array of strings"),
+        };
+        for item in items {
+            match item.as_str() {
+                Some(s) => args.push(s.to_string()),
+                None => return Response::err("'args' must be an array of strings"),
+            }
+        }
+    }
+    let state = state.lock().await;
+    match state.backend.launch(&command, &args) {
+        Ok(pid) => Response::ok(serde_json::json!({"pid": pid, "command": command})),
+        Err(e) => Response::err(format!("Launch failed: {e:#}")),
+    }
+}
+
 fn parse_coords(s: &str) -> Option<(i32, i32)> {
     let parts: Vec<&str> = s.split(',').collect();
     if parts.len() == 2 {