From 3dbd9ce52d09759b0ffa96fd60061fab5535cf89 Mon Sep 17 00:00:00 2001 From: Harivansh Rathi Date: Thu, 26 Mar 2026 00:07:03 -0400 Subject: [PATCH] init with runtime contract --- CONTRIBUTING.md | 2 +- README.md | 14 +- ...{runtime-output.md => runtime-contract.md} | 0 skills/SKILL.md | 149 ------------------ skills/deskctl/SKILL.md | 132 ++++++++++++++++ skills/deskctl/references/commands.md | 75 +++++++++ skills/deskctl/references/install.md | 75 +++++++++ skills/deskctl/references/sandbox-agent.md | 61 +++++++ .../deskctl/templates/install-deskctl-npm.sh | 27 ++++ .../templates/sandbox-agent-desktop-loop.sh | 7 + 10 files changed, 390 insertions(+), 152 deletions(-) rename docs/{runtime-output.md => runtime-contract.md} (100%) delete mode 100644 skills/SKILL.md create mode 100644 skills/deskctl/SKILL.md create mode 100644 skills/deskctl/references/commands.md create mode 100644 skills/deskctl/references/install.md create mode 100644 skills/deskctl/references/sandbox-agent.md create mode 100644 skills/deskctl/templates/install-deskctl-npm.sh create mode 100644 skills/deskctl/templates/sandbox-agent-desktop-loop.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bdbce4e..926c58a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ pnpm --dir site install - `src/` holds production code and unit tests - `tests/` holds integration tests - `tests/support/` holds shared X11 and daemon helpers for integration coverage -- `docs/runtime-output.md` is the stable-vs-best-effort runtime output contract for agent-facing CLI work +- `docs/runtime-contract.md` is the stable-vs-best-effort runtime output contract for agent-facing CLI work Keep integration-only helpers out of `src/`. diff --git a/README.md b/README.md index 036396a..db7d92f 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,16 @@ npx deskctl-cli --help `deskctl-cli` currently supports `linux-x64` and installs the `deskctl` command by downloading the matching GitHub Release asset. 
+### Installable skill + +For `skills.sh` / agent skill ecosystems: + +```bash +npx skills add harivansh-afk/deskctl -s deskctl +``` + +The installable skill lives under [`skills/deskctl`](skills/deskctl) and is designed for X11 sandboxes, VMs, and sandbox-agent desktop sessions. It points agents to the npm install path first so they can get `deskctl` without Cargo. + ### Nix ```bash @@ -133,7 +143,7 @@ deskctl doctor - `@wN` refs are short-lived handles assigned by `snapshot` and `list-windows` - `--json` output includes a stable `window_id` for programmatic targeting within the current daemon session - `list-windows` is a cheap read-only operation and does not capture or write a screenshot -- the stable runtime JSON/error contract is documented in [docs/runtime-output.md](docs/runtime-output.md) +- the stable runtime JSON/error contract is documented in [docs/runtime-contract.md](docs/runtime-contract.md) ## Read and Wait Surface @@ -189,7 +199,7 @@ Text mode is compact and follow-up-oriented, but JSON is the parsing contract. - rely on `window_id`, selector-related fields, grouped read payloads, and structured error `kind` values for stable automation - treat monitor naming, incidental whitespace, and default screenshot file names as best-effort -See [docs/runtime-output.md](docs/runtime-output.md) for the exact stable-vs-best-effort breakdown. +See [docs/runtime-contract.md](docs/runtime-contract.md) for the exact stable-vs-best-effort breakdown. ## Distribution diff --git a/docs/runtime-output.md b/docs/runtime-contract.md similarity index 100% rename from docs/runtime-output.md rename to docs/runtime-contract.md diff --git a/skills/SKILL.md b/skills/SKILL.md deleted file mode 100644 index efbd188..0000000 --- a/skills/SKILL.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -name: deskctl -description: Desktop control CLI for AI agents -allowed-tools: Bash(deskctl:*) ---- - -# deskctl - -Desktop control CLI for AI agents on Linux X11. 
Provides a unified interface for screenshots, mouse/keyboard input, and window management with compact `@wN` window references. - -## Core Workflow - -1. **Snapshot** to see the desktop and get window refs -2. **Query / wait** using grouped `get` and `wait` commands -3. **Act** using refs, explicit selectors, or coordinates -4. **Repeat** as needed - -## Quick Reference - -### See the Desktop - -```bash -deskctl snapshot # Screenshot + window tree with @wN refs -deskctl snapshot --annotate # Screenshot with bounding boxes and labels -deskctl snapshot --json # Structured JSON output -deskctl list-windows # Window tree without screenshot -deskctl screenshot /tmp/s.png # Screenshot only (no window tree) -deskctl get active-window # Currently focused window -deskctl get monitors # Monitor geometry -deskctl get version # deskctl version + backend -deskctl get systeminfo # Runtime-scoped diagnostics -deskctl wait window --selector 'title=Firefox' --timeout 10 -deskctl wait focus --selector 'class=firefox' --timeout 5 -``` - -### Click and Type - -```bash -deskctl click @w1 # Click center of window @w1 -deskctl click 500,300 # Click absolute coordinates -deskctl dblclick @w2 # Double-click window @w2 -deskctl type "hello world" # Type text into focused window -deskctl press enter # Press a key -deskctl hotkey ctrl c # Send Ctrl+C -deskctl hotkey ctrl shift t # Send Ctrl+Shift+T -``` - -### Mouse Control - -```bash -deskctl mouse move 500 300 # Move cursor to coordinates -deskctl mouse scroll 3 # Scroll down 3 units -deskctl mouse scroll -3 # Scroll up 3 units -deskctl mouse drag 100 100 500 500 # Drag from (100,100) to (500,500) -``` - -### Window Management - -```bash -deskctl focus @w2 # Focus window by ref -deskctl focus 'title=Firefox' # Focus by explicit title selector -deskctl focus 'class=firefox' # Focus by explicit class selector -deskctl focus "firefox" # Fuzzy substring match (fails on ambiguity) -deskctl close @w3 # Close window gracefully -deskctl move-window 
@w1 100 200 # Move window to position -deskctl resize-window @w1 800 600 # Resize window -``` - -### Utilities - -```bash -deskctl doctor # Diagnose X11, screenshot, and daemon health -deskctl get-screen-size # Screen resolution -deskctl get-mouse-position # Current cursor position -deskctl launch firefox # Launch an application -deskctl launch code -- --new-window # Launch with arguments -``` - -### Daemon - -```bash -deskctl daemon start # Start daemon manually -deskctl daemon stop # Stop daemon -deskctl daemon status # Check daemon status -``` - -## Global Options - -- `--json` : Output as structured JSON (all commands) -- `--session NAME` : Session name for multiple daemon instances (default: "default") -- `--socket PATH` : Custom Unix socket path - -## Output Contract - -- Prefer `--json` when an agent needs strict parsing. -- Use `window_id` for stable targeting inside a live daemon session. -- Use `ref_id` / `@wN` for quick short-lived follow-up actions after `snapshot` or `list-windows`. -- Structured JSON failures expose machine-usable `kind` values for selector and wait failures. -- The exact text formatting is intentionally compact but not the parsing contract. See `docs/runtime-output.md` for the stable field policy. - -## Window Refs - -After `snapshot` or `list-windows`, windows are assigned short refs: -- `@w1` is the topmost (usually focused) window -- `@w2`, `@w3`, etc. follow z-order (front to back) -- Refs reset on each `snapshot` call -- Use `--json` to see stable `window_id` values for programmatic tracking within the current daemon session - -## Selector Contract - -Prefer explicit selectors when an agent needs deterministic targeting: - -```bash -ref=w1 -id=win1 -title=Firefox -class=firefox -focused -``` - -Bare selectors such as `firefox` still work as fuzzy substring matches, but they now fail with candidate windows if multiple matches exist. - -## Example Agent Workflow - -```bash -# 1. 
See what's on screen -deskctl snapshot --annotate - -# 2. Wait for the browser and focus it deterministically -deskctl wait window --selector 'class=firefox' --timeout 10 -deskctl focus 'class=firefox' - -# 3. Navigate to a URL -deskctl hotkey ctrl l -deskctl type "https://example.com" -deskctl press enter - -# 4. Take a new snapshot to see the result -deskctl snapshot -``` - -## Key Names for press/hotkey - -Modifiers: `ctrl`, `alt`, `shift`, `super` -Navigation: `enter`, `tab`, `escape`, `backspace`, `delete`, `space` -Arrows: `up`, `down`, `left`, `right` -Page: `home`, `end`, `pageup`, `pagedown` -Function: `f1` through `f12` -Characters: any single character (e.g. `a`, `1`, `/`) diff --git a/skills/deskctl/SKILL.md b/skills/deskctl/SKILL.md new file mode 100644 index 0000000..1522703 --- /dev/null +++ b/skills/deskctl/SKILL.md @@ -0,0 +1,132 @@ +--- +name: deskctl +description: Desktop control CLI for AI agents on Linux X11. Use when operating an X11 desktop in a sandbox, VM, or sandbox-agent session via screenshots, grouped get/wait commands, selectors, and mouse or keyboard input. Prefer this skill when the task is "control the desktop", "inspect windows", "wait for a window", "click/type in the sandbox desktop", or "use deskctl inside sandbox-agent". +allowed-tools: Bash(deskctl:*), Bash(npx deskctl-cli:*), Bash(npm:*), Bash(which:*), Bash(printenv:*), Bash(echo:*), Bash(sandbox-agent:*) +--- + +# deskctl + +`deskctl` is a non-interactive desktop control CLI for Linux X11 agents. It works well inside sandbox-agent desktop environments because it gives agents a tight `observe -> wait -> act -> verify` loop. 
+ +## Install skill (optional) + +### npx + +```bash +npx skills add harivansh-afk/deskctl -s deskctl +``` + +### bunx + +```bash +bunx skills add harivansh-afk/deskctl -s deskctl +``` + +## Install the CLI + +Preferred install path: + +```bash +npm install -g deskctl-cli +deskctl --help +``` + +If global npm installs are not writable, use a user prefix: + +```bash +mkdir -p "$HOME/.local/bin" +npm install -g --prefix "$HOME/.local" deskctl-cli +export PATH="$HOME/.local/bin:$PATH" +deskctl --help +``` + +One-shot usage also works: + +```bash +npx deskctl-cli --help +``` + +For install details and fallback paths, see [references/install.md](references/install.md). + +## Sandbox-Agent Notes + +Before using `deskctl` inside sandbox-agent: + +1. Make sure the sandbox has desktop runtime packages installed. +2. Make sure the session is actually running X11. +3. Run `deskctl doctor` before trying to click or type. + +Typical sandbox-agent prep: + +```bash +sandbox-agent install desktop --yes +deskctl doctor +``` + +If `doctor` fails, inspect `DISPLAY`, `XDG_SESSION_TYPE`, and whether the sandbox actually has a desktop session. See [references/sandbox-agent.md](references/sandbox-agent.md). + +## Core Workflow + +Every desktop task should follow this loop: + +1. **Observe** +2. **Target** +3. **Wait** +4. **Act** +5. **Verify** + +```bash +deskctl doctor +deskctl snapshot --annotate +deskctl get active-window +deskctl wait window --selector 'class=firefox' --timeout 10 +deskctl focus 'class=firefox' +deskctl hotkey ctrl l +deskctl type "https://example.com" +deskctl press enter +deskctl snapshot +``` + +## What To Reach For First + +- `deskctl doctor` +- `deskctl snapshot --annotate` +- `deskctl list-windows` +- `deskctl get active-window` +- `deskctl wait window --selector ...` +- `deskctl wait focus --selector ...` + +Use `--json` when you need strict parsing. Use explicit selectors when you need deterministic targeting. 
+ +## Selector Rules + +Prefer explicit selectors: + +```bash +ref=w1 +id=win1 +title=Firefox +class=firefox +focused +``` + +Legacy refs still work: + +```bash +@w1 +w1 +win1 +``` + +Bare strings such as `firefox` are fuzzy substring selectors. They fail on ambiguity instead of silently picking the wrong window. + +## References + +- [references/install.md](references/install.md) - install paths, npm-first bootstrap, runtime prerequisites +- [references/commands.md](references/commands.md) - grouped reads, waits, selectors, and core action commands +- [references/sandbox-agent.md](references/sandbox-agent.md) - using `deskctl` inside sandbox-agent desktop sessions + +## Templates + +- [templates/install-deskctl-npm.sh](templates/install-deskctl-npm.sh) - install `deskctl-cli` into a user prefix +- [templates/sandbox-agent-desktop-loop.sh](templates/sandbox-agent-desktop-loop.sh) - minimal observe/wait/act loop for desktop tasks diff --git a/skills/deskctl/references/commands.md b/skills/deskctl/references/commands.md new file mode 100644 index 0000000..2d2dc1f --- /dev/null +++ b/skills/deskctl/references/commands.md @@ -0,0 +1,75 @@ +# deskctl command guide + +## Observe + +```bash +deskctl doctor +deskctl snapshot +deskctl snapshot --annotate +deskctl list-windows +deskctl screenshot /tmp/current.png +deskctl get active-window +deskctl get monitors +deskctl get version +deskctl get systeminfo +``` + +Use `snapshot --annotate` when you need both the screenshot artifact and the short `@wN` labels. Use `list-windows` when you only need the window tree and do not want screenshot side effects. + +## Wait + +```bash +deskctl wait window --selector 'title=Firefox' --timeout 10 +deskctl wait focus --selector 'class=firefox' --timeout 5 +``` + +Wait commands return the matched window payload on success. In `--json` mode, failures include structured `kind` values so the caller can recover without string parsing. 
+ +## Selectors + +Prefer explicit selectors: + +```bash +ref=w1 +id=win1 +title=Firefox +class=firefox +focused +``` + +Legacy refs still work: + +```bash +@w1 +w1 +win1 +``` + +Bare fuzzy selectors such as `firefox` are supported, but they fail on ambiguity. + +## Act + +```bash +deskctl focus 'class=firefox' +deskctl click @w1 +deskctl dblclick @w2 +deskctl type "hello world" +deskctl press enter +deskctl hotkey ctrl shift t +deskctl mouse move 500 300 +deskctl mouse scroll 3 +deskctl mouse drag 100 100 500 500 +deskctl move-window @w1 100 120 +deskctl resize-window @w1 1280 720 +deskctl close @w3 +deskctl launch firefox +``` + +## Agent loop + +The safe pattern is: + +1. Observe with `snapshot`, `list-windows`, or `get ...` +2. Wait for the target window if needed +3. Act using explicit selectors or refs +4. Snapshot again to verify the result diff --git a/skills/deskctl/references/install.md b/skills/deskctl/references/install.md new file mode 100644 index 0000000..cb97a5c --- /dev/null +++ b/skills/deskctl/references/install.md @@ -0,0 +1,75 @@ +# Install `deskctl` + +`deskctl` is designed to be used non-interactively by agents. The easiest install path is the npm package because it installs the `deskctl` command directly from GitHub Release assets without needing Cargo on the target machine. + +## Preferred: npm global install + +```bash +npm install -g deskctl-cli +deskctl --help +``` + +This is the preferred path for sandboxes, VMs, and sandbox-agent sessions where Node/npm already exists. + +## User-prefix npm install + +If global npm installs are not writable: + +```bash +mkdir -p "$HOME/.local/bin" +npm install -g --prefix "$HOME/.local" deskctl-cli +export PATH="$HOME/.local/bin:$PATH" +deskctl --help +``` + +This avoids `sudo` and keeps the install inside the user home directory. + +## One-shot npm execution + +```bash +npx deskctl-cli --help +``` + +Use this for quick testing. 
For repeated desktop control, install the command once so the runtime is predictable. + +## Fallback: Cargo + +```bash +cargo install deskctl +``` + +Use this only when the machine already has a Rust toolchain or when you explicitly want a source build. + +## Fallback: local Docker build + +If you need a Linux binary from macOS or another non-Linux host: + +```bash +docker compose -f docker/docker-compose.yml run --rm build +``` + +Then copy `dist/deskctl-linux-x86_64` into the target machine. + +## Runtime prerequisites + +`deskctl` needs: + +- Linux +- X11 +- a valid `DISPLAY` +- a working desktop/window-manager session + +Quick verification: + +```bash +printenv DISPLAY +printenv XDG_SESSION_TYPE +deskctl doctor +``` + +Inside sandbox-agent, you may need to install desktop dependencies first: + +```bash +sandbox-agent install desktop --yes +deskctl doctor +``` diff --git a/skills/deskctl/references/sandbox-agent.md b/skills/deskctl/references/sandbox-agent.md new file mode 100644 index 0000000..d994062 --- /dev/null +++ b/skills/deskctl/references/sandbox-agent.md @@ -0,0 +1,61 @@ +# deskctl inside sandbox-agent + +Use `deskctl` when the sandbox-agent session includes a Linux desktop and you want a tight local desktop-control loop from the shell. + +## When it fits + +`deskctl` is a good fit when: + +- the sandbox already has an X11 desktop session +- you want fast local desktop control from inside the sandbox +- you want short-lived refs like `@w1` and grouped `get` or `wait` primitives + +It is not a replacement for sandbox-agent session orchestration itself. Use sandbox-agent to provision the sandbox and desktop runtime, then use `deskctl` inside that environment to control the GUI. 
+ +## Minimal bootstrap + +```bash +sandbox-agent install desktop --yes +npm install -g deskctl-cli +deskctl doctor +deskctl snapshot --annotate +``` + +If npm global installs are not writable: + +```bash +mkdir -p "$HOME/.local/bin" +npm install -g --prefix "$HOME/.local" deskctl-cli +export PATH="$HOME/.local/bin:$PATH" +deskctl doctor +``` + +## Expected environment + +Check: + +```bash +printenv DISPLAY +printenv XDG_SESSION_TYPE +deskctl --json get systeminfo +``` + +Healthy `deskctl` usage usually means: + +- `DISPLAY` is set +- `XDG_SESSION_TYPE=x11` +- `deskctl doctor` succeeds + +## Recommended workflow + +```bash +deskctl snapshot --annotate +deskctl wait window --selector 'class=firefox' --timeout 10 +deskctl focus 'class=firefox' +deskctl hotkey ctrl l +deskctl type "https://example.com" +deskctl press enter +deskctl snapshot +``` + +Prefer `--json` for strict machine parsing and explicit selectors for deterministic targeting. diff --git a/skills/deskctl/templates/install-deskctl-npm.sh b/skills/deskctl/templates/install-deskctl-npm.sh new file mode 100644 index 0000000..a0ab596 --- /dev/null +++ b/skills/deskctl/templates/install-deskctl-npm.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +if command -v deskctl >/dev/null 2>&1; then + echo "deskctl already installed: $(command -v deskctl)" + exit 0 +fi + +if ! command -v npm >/dev/null 2>&1; then + echo "npm is required for the preferred deskctl install path" + exit 1 +fi + +prefix="${DESKCTL_NPM_PREFIX:-$HOME/.local}" +bin_dir="$prefix/bin" + +mkdir -p "$bin_dir" +npm install -g --prefix "$prefix" deskctl-cli + +if ! 
command -v deskctl >/dev/null 2>&1; then + echo "deskctl installed to $bin_dir" + echo "add this to PATH if needed:" + echo "export PATH=\"$bin_dir:\$PATH\"" +fi + +"$bin_dir/deskctl" --help >/dev/null 2>&1 || true +echo "deskctl bootstrap complete" diff --git a/skills/deskctl/templates/sandbox-agent-desktop-loop.sh b/skills/deskctl/templates/sandbox-agent-desktop-loop.sh new file mode 100644 index 0000000..f47dbb8 --- /dev/null +++ b/skills/deskctl/templates/sandbox-agent-desktop-loop.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +deskctl doctor +deskctl snapshot --annotate +deskctl get active-window +deskctl wait window --selector "${1:-focused}" --timeout "${2:-5}"