From c37589ccf403106ebba3414ceeb9263c19c96e4f Mon Sep 17 00:00:00 2001
From: Harivansh Rathi <rathiharivansh@gmail.com>
Date: Thu, 26 Mar 2026 00:30:05 -0400
Subject: [PATCH] skill validated with workflows

---
 skills/deskctl/SKILL.md                       | 128 ++++--------------
 skills/deskctl/references/commands.md         |  64 ++++-----
 skills/deskctl/references/install.md          |  75 ----------
 skills/deskctl/references/runtime-contract.md |   1 +
 skills/deskctl/references/sandbox-agent.md    |  61 ---------
 .../deskctl/templates/install-deskctl-npm.sh  |  27 ----
 .../templates/sandbox-agent-desktop-loop.sh   |   7 -
 skills/deskctl/workflows/observe-act.sh       |  37 +++++
 skills/deskctl/workflows/poll-condition.sh    |  42 ++++++
 9 files changed, 134 insertions(+), 308 deletions(-)
 delete mode 100644 skills/deskctl/references/install.md
 create mode 120000 skills/deskctl/references/runtime-contract.md
 delete mode 100644 skills/deskctl/references/sandbox-agent.md
 delete mode 100644 skills/deskctl/templates/install-deskctl-npm.sh
 delete mode 100644 skills/deskctl/templates/sandbox-agent-desktop-loop.sh
 create mode 100755 skills/deskctl/workflows/observe-act.sh
 create mode 100755 skills/deskctl/workflows/poll-condition.sh

diff --git a/skills/deskctl/SKILL.md b/skills/deskctl/SKILL.md
index 1522703..81dea19 100644
--- a/skills/deskctl/SKILL.md
+++ b/skills/deskctl/SKILL.md
@@ -1,132 +1,54 @@
 ---
 name: deskctl
-description: Desktop control CLI for AI agents on Linux X11. Use when operating an X11 desktop in a sandbox, VM, or sandbox-agent session via screenshots, grouped get/wait commands, selectors, and mouse or keyboard input. Prefer this skill when the task is "control the desktop", "inspect windows", "wait for a window", "click/type in the sandbox desktop", or "use deskctl inside sandbox-agent".
-allowed-tools: Bash(deskctl:*), Bash(npx deskctl-cli:*), Bash(npm:*), Bash(which:*), Bash(printenv:*), Bash(echo:*), Bash(sandbox-agent:*)
+description: Non-interactive X11 desktop control for AI agents. Use when the task involves controlling a Linux desktop - clicking, typing, reading windows, waiting for UI state, or taking screenshots inside a sandbox or VM.
+allowed-tools: Bash(deskctl:*), Bash(npx deskctl-cli:*), Bash(npm:*), Bash(which:*), Bash(printenv:*), Bash(echo:*)
 ---
 
 # deskctl
 
-`deskctl` is a non-interactive desktop control CLI for Linux X11 agents. It works well inside sandbox-agent desktop environments because it gives agents a tight `observe -> wait -> act -> verify` loop.
+Non-interactive desktop control CLI for Linux X11 agents.
 
-## Install skill (optional)
+All output follows the runtime contract defined in [references/runtime-contract.md](references/runtime-contract.md). Every command returns a stable JSON envelope when called with `--json`. Use `--json` whenever you need to parse output programmatically.
 
-### npx
-
-```bash
-npx skills add harivansh-afk/deskctl -s deskctl
-```
-
-### bunx
-
-```bash
-bunx skills add harivansh-afk/deskctl -s deskctl
-```
-
-## Install the CLI
-
-Preferred install path:
+## Quick start
 
 ```bash
 npm install -g deskctl-cli
-deskctl --help
-```
-
-If global npm installs are not writable, use a user prefix:
-
-```bash
-mkdir -p "$HOME/.local/bin"
-npm install -g --prefix "$HOME/.local" deskctl-cli
-export PATH="$HOME/.local/bin:$PATH"
-deskctl --help
-```
-
-One-shot usage also works:
-
-```bash
-npx deskctl-cli --help
-```
-
-For install details and fallback paths, see [references/install.md](references/install.md).
-
-## Sandbox-Agent Notes
-
-Before using `deskctl` inside sandbox-agent:
-
-1. Make sure the sandbox has desktop runtime packages installed.
-2. Make sure the session is actually running X11.
-3. Run `deskctl doctor` before trying to click or type.
-
-Typical sandbox-agent prep:
-
-```bash
-sandbox-agent install desktop --yes
-deskctl doctor
-```
-
-If `doctor` fails, inspect `DISPLAY`, `XDG_SESSION_TYPE`, and whether the sandbox actually has a desktop session. See [references/sandbox-agent.md](references/sandbox-agent.md).
-
-## Core Workflow
-
-Every desktop task should follow this loop:
-
-1. **Observe**
-2. **Target**
-3. **Wait**
-4. **Act**
-5. **Verify**
-
-```bash
 deskctl doctor
 deskctl snapshot --annotate
-deskctl get active-window
-deskctl wait window --selector 'class=firefox' --timeout 10
-deskctl focus 'class=firefox'
-deskctl hotkey ctrl l
-deskctl type "https://example.com"
-deskctl press enter
-deskctl snapshot
 ```
 
-## What To Reach For First
+## Agent loop
 
-- `deskctl doctor`
-- `deskctl snapshot --annotate`
-- `deskctl list-windows`
-- `deskctl get active-window`
-- `deskctl wait window --selector ...`
-- `deskctl wait focus --selector ...`
-
-Use `--json` when you need strict parsing. Use explicit selectors when you need deterministic targeting.
-
-## Selector Rules
-
-Prefer explicit selectors:
+Every desktop interaction follows: **observe -> wait -> act -> verify**.
 
 ```bash
-ref=w1
-id=win1
-title=Firefox
-class=firefox
-focused
+deskctl snapshot --annotate        # observe
+deskctl wait window --selector 'title=Firefox' --timeout 10  # wait
+deskctl click 'title=Firefox'      # act
+deskctl snapshot                   # verify
 ```
 
-Legacy refs still work:
+See [workflows/observe-act.sh](workflows/observe-act.sh) for a reusable script. See [workflows/poll-condition.sh](workflows/poll-condition.sh) for polling loops.
+
+## Selectors
 
 ```bash
-@w1
-w1
-win1
+ref=w1          # snapshot ref (short-lived)
+id=win1         # stable window ID (session-scoped)
+title=Firefox   # match by title
+class=firefox   # match by WM class
+focused         # currently focused window
 ```
 
-Bare strings such as `firefox` are fuzzy substring selectors. They fail on ambiguity instead of silently picking the wrong window.
+Bare strings like `firefox` are fuzzy substring selectors; they fail on ambiguity instead of silently picking the wrong window. Prefer explicit selectors.
 
 ## References
 
-- [references/install.md](references/install.md) - install paths, npm-first bootstrap, runtime prerequisites
-- [references/commands.md](references/commands.md) - grouped reads, waits, selectors, and core action commands
-- [references/sandbox-agent.md](references/sandbox-agent.md) - using `deskctl` inside sandbox-agent desktop sessions
+- [references/runtime-contract.md](references/runtime-contract.md) - output contract, stable fields, error kinds
+- [references/commands.md](references/commands.md) - all available commands
 
-## Templates
+## Workflows
 
-- [templates/install-deskctl-npm.sh](templates/install-deskctl-npm.sh) - install `deskctl-cli` into a user prefix
-- [templates/sandbox-agent-desktop-loop.sh](templates/sandbox-agent-desktop-loop.sh) - minimal observe/wait/act loop for desktop tasks
+- [workflows/observe-act.sh](workflows/observe-act.sh) - main observe-act loop
+- [workflows/poll-condition.sh](workflows/poll-condition.sh) - poll for a condition on screen
diff --git a/skills/deskctl/references/commands.md b/skills/deskctl/references/commands.md
index 2d2dc1f..d0e7c9f 100644
--- a/skills/deskctl/references/commands.md
+++ b/skills/deskctl/references/commands.md
@@ -1,21 +1,23 @@
-# deskctl command guide
+# deskctl commands
+
+All commands support `--json` for machine-parseable output following the runtime contract.
 
 ## Observe
 
 ```bash
-deskctl doctor
-deskctl snapshot
-deskctl snapshot --annotate
-deskctl list-windows
-deskctl screenshot /tmp/current.png
-deskctl get active-window
-deskctl get monitors
-deskctl get version
-deskctl get systeminfo
+deskctl doctor                          # check X11 runtime and daemon health
+deskctl snapshot                        # screenshot + window list
+deskctl snapshot --annotate             # screenshot with @wN labels overlaid
+deskctl list-windows                    # window list only (no screenshot)
+deskctl screenshot /tmp/screen.png      # screenshot to explicit path
+deskctl get active-window               # focused window info
+deskctl get monitors                    # monitor geometry
+deskctl get version                     # version and backend
+deskctl get systeminfo                  # full runtime diagnostics
+deskctl get-screen-size                 # screen resolution
+deskctl get-mouse-position              # cursor coordinates
 ```
 
-Use `snapshot --annotate` when you need both the screenshot artifact and the short `@wN` labels. Use `list-windows` when you only need the window tree and do not want screenshot side effects.
-
 ## Wait
 
 ```bash
@@ -23,29 +25,19 @@ deskctl wait window --selector 'title=Firefox' --timeout 10
 deskctl wait focus --selector 'class=firefox' --timeout 5
 ```
 
-Wait commands return the matched window payload on success. In `--json` mode, failures include structured `kind` values so the caller can recover without string parsing.
+Wait commands return the matched window payload on success. In `--json` mode, failures include a structured `kind` value so the caller can recover without string parsing.
 
 ## Selectors
 
-Prefer explicit selectors:
-
 ```bash
-ref=w1
-id=win1
-title=Firefox
-class=firefox
-focused
+ref=w1          # snapshot ref (short-lived, from last snapshot)
+id=win1         # stable window ID (session-scoped)
+title=Firefox   # match by window title
+class=firefox   # match by WM class
+focused         # currently focused window
 ```
 
-Legacy refs still work:
-
-```bash
-@w1
-w1
-win1
-```
-
-Bare fuzzy selectors such as `firefox` are supported, but they fail on ambiguity.
+Legacy shorthand still works: `@w1`, `w1`, `win1`. Bare strings such as `firefox` are fuzzy selectors and fail on ambiguity.
 
 ## Act
 
@@ -58,6 +50,7 @@ deskctl press enter
 deskctl hotkey ctrl shift t
 deskctl mouse move 500 300
 deskctl mouse scroll 3
+deskctl mouse scroll 3 --axis horizontal
 deskctl mouse drag 100 100 500 500
 deskctl move-window @w1 100 120
 deskctl resize-window @w1 1280 720
@@ -65,11 +58,12 @@ deskctl close @w3
 deskctl launch firefox
 ```
 
-## Agent loop
+## Daemon
 
-The safe pattern is:
+```bash
+deskctl daemon start
+deskctl daemon stop
+deskctl daemon status
+```
 
-1. Observe with `snapshot`, `list-windows`, or `get ...`
-2. Wait for the target window if needed
-3. Act using explicit selectors or refs
-4. Snapshot again to verify the result
+The daemon starts automatically on first command. Manual control is rarely needed.
diff --git a/skills/deskctl/references/install.md b/skills/deskctl/references/install.md
deleted file mode 100644
index cb97a5c..0000000
--- a/skills/deskctl/references/install.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Install `deskctl`
-
-`deskctl` is designed to be used non-interactively by agents. The easiest install path is the npm package because it installs the `deskctl` command directly from GitHub Release assets without needing Cargo on the target machine.
-
-## Preferred: npm global install
-
-```bash
-npm install -g deskctl-cli
-deskctl --help
-```
-
-This is the preferred path for sandboxes, VMs, and sandbox-agent sessions where Node/npm already exists.
-
-## User-prefix npm install
-
-If global npm installs are not writable:
-
-```bash
-mkdir -p "$HOME/.local/bin"
-npm install -g --prefix "$HOME/.local" deskctl-cli
-export PATH="$HOME/.local/bin:$PATH"
-deskctl --help
-```
-
-This avoids `sudo` and keeps the install inside the user home directory.
-
-## One-shot npm execution
-
-```bash
-npx deskctl-cli --help
-```
-
-Use this for quick testing. For repeated desktop control, install the command once so the runtime is predictable.
-
-## Fallback: Cargo
-
-```bash
-cargo install deskctl
-```
-
-Use this only when the machine already has a Rust toolchain or when you explicitly want a source build.
-
-## Fallback: local Docker build
-
-If you need a Linux binary from macOS or another non-Linux host:
-
-```bash
-docker compose -f docker/docker-compose.yml run --rm build
-```
-
-Then copy `dist/deskctl-linux-x86_64` into the target machine.
-
-## Runtime prerequisites
-
-`deskctl` needs:
-
-- Linux
-- X11
-- a valid `DISPLAY`
-- a working desktop/window-manager session
-
-Quick verification:
-
-```bash
-printenv DISPLAY
-printenv XDG_SESSION_TYPE
-deskctl doctor
-```
-
-Inside sandbox-agent, you may need to install desktop dependencies first:
-
-```bash
-sandbox-agent install desktop --yes
-deskctl doctor
-```
diff --git a/skills/deskctl/references/runtime-contract.md b/skills/deskctl/references/runtime-contract.md
new file mode 120000
index 0000000..8de0781
--- /dev/null
+++ b/skills/deskctl/references/runtime-contract.md
@@ -0,0 +1 @@
+../../../docs/runtime-contract.md
\ No newline at end of file
diff --git a/skills/deskctl/references/sandbox-agent.md b/skills/deskctl/references/sandbox-agent.md
deleted file mode 100644
index d994062..0000000
--- a/skills/deskctl/references/sandbox-agent.md
+++ /dev/null
@@ -1,61 +0,0 @@
-# deskctl inside sandbox-agent
-
-Use `deskctl` when the sandbox-agent session includes a Linux desktop and you want a tight local desktop-control loop from the shell.
-
-## When it fits
-
-`deskctl` is a good fit when:
-
-- the sandbox already has an X11 desktop session
-- you want fast local desktop control from inside the sandbox
-- you want short-lived refs like `@w1` and grouped `get` or `wait` primitives
-
-It is not a replacement for sandbox-agent session orchestration itself. Use sandbox-agent to provision the sandbox and desktop runtime, then use `deskctl` inside that environment to control the GUI.
-
-## Minimal bootstrap
-
-```bash
-sandbox-agent install desktop --yes
-npm install -g deskctl-cli
-deskctl doctor
-deskctl snapshot --annotate
-```
-
-If npm global installs are not writable:
-
-```bash
-mkdir -p "$HOME/.local/bin"
-npm install -g --prefix "$HOME/.local" deskctl-cli
-export PATH="$HOME/.local/bin:$PATH"
-deskctl doctor
-```
-
-## Expected environment
-
-Check:
-
-```bash
-printenv DISPLAY
-printenv XDG_SESSION_TYPE
-deskctl --json get systeminfo
-```
-
-Healthy `deskctl` usage usually means:
-
-- `DISPLAY` is set
-- `XDG_SESSION_TYPE=x11`
-- `deskctl doctor` succeeds
-
-## Recommended workflow
-
-```bash
-deskctl snapshot --annotate
-deskctl wait window --selector 'class=firefox' --timeout 10
-deskctl focus 'class=firefox'
-deskctl hotkey ctrl l
-deskctl type "https://example.com"
-deskctl press enter
-deskctl snapshot
-```
-
-Prefer `--json` for strict machine parsing and explicit selectors for deterministic targeting.
diff --git a/skills/deskctl/templates/install-deskctl-npm.sh b/skills/deskctl/templates/install-deskctl-npm.sh
deleted file mode 100644
index a0ab596..0000000
--- a/skills/deskctl/templates/install-deskctl-npm.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-if command -v deskctl >/dev/null 2>&1; then
-  echo "deskctl already installed: $(command -v deskctl)"
-  exit 0
-fi
-
-if ! command -v npm >/dev/null 2>&1; then
-  echo "npm is required for the preferred deskctl install path"
-  exit 1
-fi
-
-prefix="${DESKCTL_NPM_PREFIX:-$HOME/.local}"
-bin_dir="$prefix/bin"
-
-mkdir -p "$bin_dir"
-npm install -g --prefix "$prefix" deskctl-cli
-
-if ! command -v deskctl >/dev/null 2>&1; then
-  echo "deskctl installed to $bin_dir"
-  echo "add this to PATH if needed:"
-  echo "export PATH=\"$bin_dir:\$PATH\""
-fi
-
-"$bin_dir/deskctl" --help >/dev/null 2>&1 || true
-echo "deskctl bootstrap complete"
diff --git a/skills/deskctl/templates/sandbox-agent-desktop-loop.sh b/skills/deskctl/templates/sandbox-agent-desktop-loop.sh
deleted file mode 100644
index f47dbb8..0000000
--- a/skills/deskctl/templates/sandbox-agent-desktop-loop.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-deskctl doctor
-deskctl snapshot --annotate
-deskctl get active-window
-deskctl wait window --selector "${1:-focused}" --timeout "${2:-5}"
diff --git a/skills/deskctl/workflows/observe-act.sh b/skills/deskctl/workflows/observe-act.sh
new file mode 100755
index 0000000..0e336ae
--- /dev/null
+++ b/skills/deskctl/workflows/observe-act.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# observe-act.sh - one observe -> wait -> act -> verify pass against a single window
+# usage: ./observe-act.sh <selector> [action] [action-args...]
+# example: ./observe-act.sh 'title=Firefox' click
+# example: ./observe-act.sh 'class=terminal' type "ls -la"
+set -euo pipefail
+
+SELECTOR="${1:?usage: observe-act.sh <selector> [action] [action-args...]}"
+ACTION="${2:-click}"
+shift "$(( $# < 2 ? $# : 2 ))" # drop selector and action; "$@" now holds only action-args
+
+# 1. observe - first line of the annotated snapshot JSON, then the active window
+echo "--- observe ---"
+deskctl snapshot --annotate --json | sed -n '1p' # sed reads to EOF, so no SIGPIPE trips pipefail
+deskctl get active-window
+
+# 2. wait - ensure target exists
+echo "--- wait ---"
+deskctl wait window --selector "$SELECTOR" --timeout 10
+
+# 3. act - steps are joined with ';' (not '&&') so a failed focus aborts under set -e
+echo "--- act ---"
+case "$ACTION" in
+  click)    deskctl click "$SELECTOR" ;;
+  dblclick) deskctl dblclick "$SELECTOR" ;;
+  focus)    deskctl focus "$SELECTOR" ;;
+  type)     deskctl focus "$SELECTOR"; deskctl type "$@" ;;
+  press)    deskctl focus "$SELECTOR"; deskctl press "$@" ;;
+  hotkey)   deskctl focus "$SELECTOR"; deskctl hotkey "$@" ;;
+  close)    deskctl close "$SELECTOR" ;;
+  *)        echo "unknown action: $ACTION" >&2; exit 1 ;;
+esac
+
+# 4. verify - snapshot again to confirm result
+echo "--- verify ---"
+sleep 0.5
+deskctl snapshot --json | sed -n '1p'
diff --git a/skills/deskctl/workflows/poll-condition.sh b/skills/deskctl/workflows/poll-condition.sh
new file mode 100755
index 0000000..e173bf5
--- /dev/null
+++ b/skills/deskctl/workflows/poll-condition.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# poll-condition.sh - poll the desktop until a condition is met
+# usage: ./poll-condition.sh <match-string> [interval-seconds] [max-attempts]
+# example: ./poll-condition.sh "Tickets Available" 5 60
+# example: ./poll-condition.sh "Order Confirmed" 3 20
+# example: ./poll-condition.sh "Download Complete" 10 30
+#
+# searches the list-windows and active-window JSON for the literal match string (case-insensitive) every N seconds.
+# exits 0 when found, exits 1 after max attempts.
+set -euo pipefail
+
+MATCH="${1:?usage: poll-condition.sh <match-string> [interval] [max-attempts]}"
+INTERVAL="${2:-5}"
+MAX="${3:-60}"
+
+attempt=0
+while [ "$attempt" -lt "$MAX" ]; do
+  attempt=$((attempt + 1))
+
+  # list windows; a failed query yields an empty string so it can never produce a match
+  windows=$(deskctl list-windows --json 2>/dev/null) || windows=''
+  if grep -qiF -- "$MATCH" <<<"$windows"; then
+    echo "FOUND: '$MATCH' detected on attempt $attempt"
+    deskctl snapshot --annotate
+    exit 0
+  fi
+
+  # also check the active window's JSON (no screenshot text is read)
+  active=$(deskctl get active-window --json 2>/dev/null) || active=''
+  if grep -qiF -- "$MATCH" <<<"$active"; then
+    echo "FOUND: '$MATCH' in active window on attempt $attempt"
+    deskctl snapshot --annotate
+    exit 0
+  fi
+
+  echo "attempt $attempt/$MAX - '$MATCH' not found, waiting ${INTERVAL}s..."
+  sleep "$INTERVAL"
+done
+
+echo "NOT FOUND: '$MATCH' after $MAX attempts"
+deskctl snapshot --annotate
+exit 1