Merge pull request #305 from getcompanion-ai/computer

computer use tool
2026-04-15 05:02:07 +00:00 · 2026-03-11 15:01:34 -04:00 · 2026-03-11 15:01:34 -04:00 · a5d70ce55e
commit a5d70ce55e
parent 3919bbf708 3ff9f21a4f
9 changed files with 1086 additions and 17 deletions
--- a/packages/coding-agent/src/cli/args.ts
+++ b/packages/coding-agent/src/cli/args.ts
@ -325,6 +325,7 @@ ${chalk.bold("Environment Variables:")}
  AWS_REGION                       - AWS region for Amazon Bedrock (e.g., us-east-1)
  ${ENV_AGENT_DIR.padEnd(32)} - Session storage directory (default: ~/${CONFIG_DIR_NAME}/agent)
  COMPANION_PACKAGE_DIR                   - Override package directory (for Nix/Guix store paths)
+  COMPANION_AGENT_COMPUTER_COMMAND        - Override the computer helper command (default: agent-computer)
  COMPANION_OFFLINE                       - Disable startup network operations when set to 1/true/yes
  COMPANION_SHARE_VIEWER_URL              - Base URL for /share command (default: https://companion.dev/session/)
  COMPANION_AI_ANTIGRAVITY_VERSION        - Override Antigravity User-Agent version (e.g., 1.23.0)
@ -333,6 +334,7 @@ ${chalk.bold(`Available Tools (default: ${defaultToolsText}):`)}
  read   - Read file contents
  bash   - Execute bash commands
  browser - Browser automation with persistent state
+  computer - Desktop computer automation with screen observation and native UI control
  edit   - Edit files with find/replace
  write  - Write files (creates/overwrites)
  grep   - Search file contents (read-only, off by default)
--- a/packages/coding-agent/src/core/sdk.ts
+++ b/packages/coding-agent/src/core/sdk.ts
@ -26,10 +26,12 @@ import {
  allTools,
  bashTool,
  browserTool,
+  computerTool,
  codingTools,
  defaultCodingToolNames,
  createBashTool,
  createBrowserTool,
+  createComputerTool,
  createCodingTools,
  createEditTool,
  createFindTool,
@ -67,7 +69,7 @@ export interface CreateAgentSessionOptions {
  /** Models available for cycling (Ctrl+P in interactive mode) */
  scopedModels?: Array<{ model: Model<any>; thinkingLevel?: ThinkingLevel }>;

-  /** Built-in tools to use. Default: codingTools [read, bash, browser, edit, write] */
+  /** Built-in tools to use. Default: codingTools [read, bash, browser, computer, edit, write] */
  tools?: Tool[];
  /** Custom tools to register (in addition to built-in tools). */
  customTools?: ToolDefinition[];
@ -113,6 +115,7 @@ export {
  readTool,
  bashTool,
  browserTool,
+  computerTool,
  editTool,
  writeTool,
  grepTool,
@ -127,6 +130,7 @@ export {
  createReadTool,
  createBashTool,
  createBrowserTool,
+  createComputerTool,
  createEditTool,
  createWriteTool,
  createGrepTool,
--- a/packages/coding-agent/src/core/system-prompt.ts
+++ b/packages/coding-agent/src/core/system-prompt.ts
@ -11,6 +11,8 @@ const toolDescriptions: Record<string, string> = {
  bash: "Run shell commands",
  browser:
    "Browse the web: open, snapshot, click, fill, wait, screenshot, save/load state",
+  computer:
+    "Use the desktop computer: observe the screen, click, type, send hotkeys, manage apps/windows, wait for native UI, and read/write the clipboard",
  edit: "Surgical file edits (find exact text, replace it)",
  write: "Create new files or completely rewrite existing ones",
  grep: "Search file contents by regex (respects .gitignore)",
@ -167,6 +169,7 @@ export function buildSystemPrompt(

  const hasBash = tools.includes("bash");
  const hasBrowser = tools.includes("browser");
+  const hasComputer = tools.includes("computer");
  const hasEdit = tools.includes("edit");
  const hasWrite = tools.includes("write");
  const hasGrep = tools.includes("grep");
@ -215,6 +218,16 @@ export function buildSystemPrompt(
      "Browser: snapshot before interacting with elements. Use it for research and learning too, not just automation",
    );
  }
+  if (hasComputer) {
+    addGuideline(
+      "Computer: observe before interacting. Use it for native UI, desktop apps, file pickers, downloads, and OS dialogs",
+    );
+  }
+  if (hasBrowser && hasComputer) {
+    addGuideline(
+      "Prefer browser for websites and DOM-aware tasks. Switch to computer when native UI or desktop state matters",
+    );
+  }

  // Output hygiene
  if (hasEdit || hasWrite) {
--- a/packages/coding-agent/src/core/tools/computer.ts
+++ b/packages/coding-agent/src/core/tools/computer.ts
@ -0,0 +1,677 @@
+import { spawn } from "node:child_process";
+import { mkdirSync } from "node:fs";
+import { join, resolve } from "node:path";
+import type { AgentTool } from "@mariozechner/companion-agent-core";
+import { type Static, Type } from "@sinclair/typebox";
+import { getAgentDir } from "../../config.js";
+import {
+  getShellEnv,
+  killProcessTree,
+  sanitizeBinaryOutput,
+} from "../../utils/shell.js";
+
+/**
+ * Every action name the computer tool accepts. Declared `as const` so the
+ * literal union type `ComputerToolAction` can be derived from the array.
+ */
+const computerActions = [
+  "observe",
+  "click",
+  "type",
+  "hotkey",
+  "scroll",
+  "drag",
+  "wait",
+  "app_list",
+  "app_open",
+  "app_focus",
+  "window_list",
+  "window_focus",
+  "window_move",
+  "window_resize",
+  "window_close",
+  "clipboard_read",
+  "clipboard_write",
+] as const;
+
+/** Observation modes accepted by the optional `mode` input field. */
+const computerObservationModes = ["hybrid", "ocr"] as const;
+/** Snapshot IDs may contain only ASCII letters, digits, `_` and `-`. */
+const computerSnapshotIdPattern = /^[A-Za-z0-9_-]+$/;
+
+/**
+ * Helper executable to run. Read from COMPANION_AGENT_COMPUTER_COMMAND once,
+ * at module load; because `||` is used, an empty value also falls back to
+ * "agent-computer".
+ */
+const DEFAULT_COMPUTER_COMMAND =
+  process.env.COMPANION_AGENT_COMPUTER_COMMAND || "agent-computer";
+/** Default timeout, in seconds, applied to each helper invocation. */
+const DEFAULT_COMPUTER_TIMEOUT_SECONDS = 90;
+
+/**
+ * TypeBox schema for the computer tool's input.
+ *
+ * Only `action` is required at the schema level; every other field is
+ * optional. Which fields a given action actually needs is enforced at runtime
+ * by `validateComputerInput`, not by this schema.
+ */
+const computerSchema = Type.Object({
+  action: Type.Union(
+    computerActions.map((action) => Type.Literal(action)),
+    { description: "Computer action to execute" },
+  ),
+  snapshotId: Type.Optional(
+    Type.String({ description: "Snapshot ID returned from observe" }),
+  ),
+  ref: Type.Optional(
+    Type.String({
+      description:
+        "Target ref from observe output, such as w1 for a window or t3 for OCR text",
+    }),
+  ),
+  x: Type.Optional(Type.Number({ description: "Target x coordinate" })),
+  y: Type.Optional(Type.Number({ description: "Target y coordinate" })),
+  toRef: Type.Optional(
+    Type.String({ description: "Destination ref for drag actions" }),
+  ),
+  toX: Type.Optional(
+    Type.Number({ description: "Destination x coordinate for drag actions" }),
+  ),
+  toY: Type.Optional(
+    Type.Number({ description: "Destination y coordinate for drag actions" }),
+  ),
+  text: Type.Optional(
+    Type.String({
+      description:
+        "Text to type, text to wait for, or clipboard contents depending on action",
+    }),
+  ),
+  keys: Type.Optional(
+    Type.Array(Type.String(), {
+      description: "Hotkey chord or key sequence, for example ['ctrl', 'l']",
+      minItems: 1,
+    }),
+  ),
+  app: Type.Optional(
+    Type.String({
+      description:
+        "Installed app or running app name/class for app_open, app_focus, and wait",
+    }),
+  ),
+  windowId: Type.Optional(
+    Type.String({ description: "Window ID, such as 0x04200007" }),
+  ),
+  windowTitle: Type.Optional(
+    Type.String({ description: "Window title substring to match" }),
+  ),
+  mode: Type.Optional(
+    Type.Union(
+      computerObservationModes.map((mode) => Type.Literal(mode)),
+      { description: "Observation mode. Defaults to hybrid." },
+    ),
+  ),
+  amount: Type.Optional(
+    Type.Number({
+      description:
+        "Scroll amount in wheel steps. Positive scrolls down/right, negative scrolls up/left.",
+    }),
+  ),
+  width: Type.Optional(
+    Type.Number({ description: "Target window width for resize actions" }),
+  ),
+  height: Type.Optional(
+    Type.Number({ description: "Target window height for resize actions" }),
+  ),
+  clear: Type.Optional(
+    Type.Boolean({
+      description: "Clear the active input field before typing",
+    }),
+  ),
+  button: Type.Optional(
+    Type.Number({
+      description: "Mouse button for click or drag. Defaults to 1.",
+      minimum: 1,
+      maximum: 7,
+    }),
+  ),
+  // NOTE(review): this value is forwarded to the helper inside the JSON input,
+  // but nothing in this file uses it to extend the process timeout passed to
+  // exec — a wait longer than that timeout may be killed early. Confirm.
+  timeoutMs: Type.Optional(
+    Type.Number({
+      description: "Wait timeout in milliseconds for observe-derived waits",
+      minimum: 0,
+    }),
+  ),
+  intervalMs: Type.Optional(
+    Type.Number({
+      description: "Polling interval for wait actions in milliseconds",
+      minimum: 10,
+    }),
+  ),
+});
+
+/** Union of the literal action names listed in `computerActions`. */
+export type ComputerToolAction = (typeof computerActions)[number];
+/** Union of the literal mode names listed in `computerObservationModes`. */
+export type ComputerObservationMode = (typeof computerObservationModes)[number];
+/** Static TypeScript type of an input object described by `computerSchema`. */
+export type ComputerToolInput = Static<typeof computerSchema>;
+
+/**
+ * Structured metadata attached to progress updates and to the final result
+ * of a computer tool call.
+ */
+export interface ComputerToolDetails {
+  /** Action that was requested. */
+  action: ComputerToolAction;
+  /** Helper executable that was invoked. */
+  command: string;
+  /** Arguments passed to the helper executable. */
+  args: string[];
+  /** Resolved state directory handed to the helper via `--state-dir`. */
+  stateDir: string;
+  /** Snapshot ID extracted from the helper's JSON output, when present. */
+  snapshotId?: string;
+  /** Screenshot path extracted from the helper's JSON output, when present. */
+  screenshotPath?: string;
+}
+
+/**
+ * Pluggable process-execution backend for the computer tool, so callers
+ * (for example tests) can replace real process spawning.
+ */
+export interface ComputerOperations {
+  /**
+   * Runs `command` with `args` and resolves with its exit code.
+   *
+   * `onData` receives output chunks as they arrive. `timeout` is interpreted
+   * as seconds by the default implementation in this file. The default
+   * implementation rejects with `Error("aborted")` or
+   * `Error("timeout:<n>")`; `createComputerTool` recognises those messages.
+   */
+  exec: (
+    command: string,
+    args: string[],
+    options: {
+      cwd: string;
+      env: NodeJS.ProcessEnv;
+      onData: (data: Buffer) => void;
+      signal?: AbortSignal;
+      timeout?: number;
+    },
+  ) => Promise<{ exitCode: number | null }>;
+}
+
+/**
+ * Default `ComputerOperations` implementation backed by `child_process.spawn`.
+ *
+ * The child runs with stdin ignored, and stdout and stderr are both forwarded
+ * to the same `onData` callback. The returned promise:
+ * - rejects with the spawn error if the child emits "error";
+ * - rejects with `Error("aborted")` if `signal` is aborted by the time the
+ *   child closes (checked before the timeout flag);
+ * - rejects with `Error("timeout:<timeout>")` if the timeout fired;
+ * - otherwise resolves with the child's exit code.
+ */
+const defaultComputerOperations: ComputerOperations = {
+  exec: (command, args, { cwd, env, onData, signal, timeout }) => {
+    return new Promise((resolvePromise, rejectPromise) => {
+      // NOTE(review): presumably detached so killProcessTree can take down the
+      // helper together with anything it spawns — confirm in utils/shell.
+      const child = spawn(command, args, {
+        cwd,
+        detached: true,
+        env,
+        stdio: ["ignore", "pipe", "pipe"],
+      });
+
+      let timedOut = false;
+      let timeoutHandle: NodeJS.Timeout | undefined;
+
+      // `timeout` is in seconds; undefined or non-positive disables the timer.
+      if (timeout !== undefined && timeout > 0) {
+        timeoutHandle = setTimeout(() => {
+          timedOut = true;
+          if (child.pid) {
+            killProcessTree(child.pid);
+          }
+        }, timeout * 1000);
+      }
+
+      // stdout and stderr are merged into a single stream of chunks.
+      if (child.stdout) {
+        child.stdout.on("data", onData);
+      }
+      if (child.stderr) {
+        child.stderr.on("data", onData);
+      }
+
+      const onAbort = () => {
+        if (child.pid) {
+          killProcessTree(child.pid);
+        }
+      };
+
+      if (signal) {
+        // The child has already been spawned at this point, so an
+        // already-aborted signal kills it immediately rather than
+        // preventing the spawn.
+        if (signal.aborted) {
+          onAbort();
+        } else {
+          signal.addEventListener("abort", onAbort, { once: true });
+        }
+      }
+
+      child.on("error", (error) => {
+        if (timeoutHandle) clearTimeout(timeoutHandle);
+        if (signal) signal.removeEventListener("abort", onAbort);
+        rejectPromise(error);
+      });
+
+      child.on("close", (code) => {
+        if (timeoutHandle) clearTimeout(timeoutHandle);
+        if (signal) signal.removeEventListener("abort", onAbort);
+
+        if (signal?.aborted) {
+          rejectPromise(new Error("aborted"));
+          return;
+        }
+
+        if (timedOut) {
+          rejectPromise(new Error(`timeout:${timeout}`));
+          return;
+        }
+
+        resolvePromise({ exitCode: code });
+      });
+    });
+  },
+};
+
+/** Options accepted by `createComputerTool`. All fields are optional. */
+export interface ComputerToolOptions {
+  /** Execution backend. Defaults to the spawn-based implementation. */
+  operations?: ComputerOperations;
+  /** Helper executable to run. Defaults to `DEFAULT_COMPUTER_COMMAND`. */
+  command?: string;
+  /** Per-call timeout in seconds. Defaults to `DEFAULT_COMPUTER_TIMEOUT_SECONDS`. */
+  defaultTimeoutSeconds?: number;
+  /** State directory override; resolved against the tool's cwd. */
+  stateDir?: string;
+  /**
+   * Base agent directory; the state directory defaults to its "computer"
+   * subdirectory. Falls back to `getAgentDir()` when omitted.
+   */
+  agentDir?: string;
+}
+
+/** Everything `createComputerTool` needs to run and report on one action. */
+interface ComputerCommandContext {
+  /** Action being executed. */
+  action: ComputerToolAction;
+  /** Command-line arguments for the helper executable. */
+  args: string[];
+  /** Progress text emitted before the helper runs. */
+  statusMessage: string;
+  /** Fallback result text used when the helper prints nothing. */
+  successMessage: string;
+  /** Resolved state directory (created before the helper runs). */
+  stateDir: string;
+}
+
+/**
+ * Resolves `inputPath` against `cwd` using `path.resolve`, so an absolute
+ * `inputPath` is returned as-is (normalized) and a relative one is anchored
+ * at `cwd`.
+ */
+function resolveCommandPath(cwd: string, inputPath: string): string {
+  return resolve(cwd, inputPath);
+}
+
+/**
+ * Returns the "computer" subdirectory of the agent directory, using
+ * `options.agentDir` when given and `getAgentDir()` otherwise.
+ */
+function getComputerRootDir(options?: ComputerToolOptions): string {
+  const baseAgentDir = options?.agentDir ?? getAgentDir();
+  return join(baseAgentDir, "computer");
+}
+
+/**
+ * Returns the state directory for the helper: `options.stateDir` when set,
+ * otherwise the computer root directory, resolved against `cwd`.
+ */
+function getComputerStateDir(
+  cwd: string,
+  options?: ComputerToolOptions,
+): string {
+  const stateDir = options?.stateDir ?? getComputerRootDir(options);
+  return resolveCommandPath(cwd, stateDir);
+}
+
+/**
+ * Synchronously creates `stateDir`, including missing parent directories.
+ * With `recursive: true`, an already-existing directory is not an error.
+ */
+function ensureComputerDir(stateDir: string): void {
+  mkdirSync(stateDir, { recursive: true });
+}
+
+/**
+ * Concatenates the collected output chunks, decodes them as UTF-8, passes the
+ * text through `sanitizeBinaryOutput`, and trims surrounding whitespace.
+ */
+function normalizeOutput(chunks: Buffer[]): string {
+  return sanitizeBinaryOutput(Buffer.concat(chunks).toString("utf-8")).trim();
+}
+
+/** True only when both `x` and `y` are provided. */
+function hasCoordinateTarget(input: ComputerToolInput): boolean {
+  return input.x !== undefined && input.y !== undefined;
+}
+
+/** True only when both `snapshotId` and `ref` are provided. */
+function hasRefTarget(input: ComputerToolInput): boolean {
+  return input.snapshotId !== undefined && input.ref !== undefined;
+}
+
+/** True when at least one of `windowId` or `windowTitle` is provided. */
+function hasWindowTarget(input: ComputerToolInput): boolean {
+  return input.windowId !== undefined || input.windowTitle !== undefined;
+}
+
+/**
+ * True when a drag destination is given either as `toRef` or as the
+ * coordinate pair `toX` and `toY` (both required).
+ */
+function hasDragDestination(input: ComputerToolInput): boolean {
+  return (
+    input.toRef !== undefined ||
+    (input.toX !== undefined && input.toY !== undefined)
+  );
+}
+
+/**
+ * Rejects snapshot IDs that do not match `computerSnapshotIdPattern`.
+ *
+ * @throws Error when `snapshotId` is empty or contains any character other
+ *   than ASCII letters, digits, `_` or `-`.
+ */
+function validateSnapshotId(snapshotId: string): void {
+  if (!computerSnapshotIdPattern.test(snapshotId)) {
+    throw new Error(`Invalid computer snapshotId: "${snapshotId}"`);
+  }
+}
+
+/**
+ * Validates the target selection for a `wait` action.
+ *
+ * At most one of `ref`, `text`, `app`, `windowId`, `windowTitle` may be set.
+ * Having none of them is allowed only when `timeoutMs` is provided.
+ *
+ * @throws Error when no target and no `timeoutMs` is given, or when more
+ *   than one target is given.
+ */
+function validateWaitInput(input: ComputerToolInput): void {
+  // Count how many mutually exclusive wait targets were supplied.
+  const targetCount =
+    (input.ref !== undefined ? 1 : 0) +
+    (input.text !== undefined ? 1 : 0) +
+    (input.app !== undefined ? 1 : 0) +
+    (input.windowId !== undefined ? 1 : 0) +
+    (input.windowTitle !== undefined ? 1 : 0);
+
+  if (targetCount === 0 && input.timeoutMs === undefined) {
+    throw new Error(
+      "computer wait requires one of ref, text, app, windowId, windowTitle, or timeoutMs",
+    );
+  }
+
+  if (targetCount > 1) {
+    throw new Error(
+      "computer wait requires exactly one of ref, text, app, windowId, or windowTitle",
+    );
+  }
+}
+
+/**
+ * Enforces the per-action field requirements that the schema leaves optional.
+ *
+ * A provided `snapshotId` is format-checked for every action. After that each
+ * action checks only the fields it needs; extra, unrelated fields are not
+ * rejected here.
+ *
+ * @throws Error describing the first missing or invalid field.
+ */
+function validateComputerInput(input: ComputerToolInput): void {
+  if (input.snapshotId !== undefined) {
+    validateSnapshotId(input.snapshotId);
+  }
+
+  switch (input.action) {
+    // These actions have no required fields.
+    case "observe":
+    case "app_list":
+    case "window_list":
+    case "clipboard_read":
+      return;
+    case "click":
+      if (!hasRefTarget(input) && !hasCoordinateTarget(input)) {
+        throw new Error(
+          "computer click requires snapshotId and ref, or explicit x and y coordinates",
+        );
+      }
+      return;
+    case "type":
+      if (input.text === undefined) {
+        throw new Error("computer type requires text");
+      }
+      // A ref target is optional for typing, but needs its snapshot when used.
+      if (input.ref !== undefined && input.snapshotId === undefined) {
+        throw new Error("computer type with ref requires snapshotId");
+      }
+      return;
+    case "hotkey":
+      if (!input.keys || input.keys.length === 0) {
+        throw new Error("computer hotkey requires keys");
+      }
+      return;
+    case "scroll":
+      if (input.amount === undefined || input.amount === 0) {
+        throw new Error("computer scroll requires a non-zero amount");
+      }
+      if (input.ref !== undefined && input.snapshotId === undefined) {
+        throw new Error("computer scroll with ref requires snapshotId");
+      }
+      return;
+    case "drag":
+      // A drag needs both a starting target and a destination.
+      if (!hasRefTarget(input) && !hasCoordinateTarget(input)) {
+        throw new Error(
+          "computer drag requires a starting target via snapshotId and ref, or x and y coordinates",
+        );
+      }
+      if (!hasDragDestination(input)) {
+        throw new Error(
+          "computer drag requires a destination via toRef, or explicit toX and toY coordinates",
+        );
+      }
+      if (input.toRef !== undefined && input.snapshotId === undefined) {
+        throw new Error("computer drag with toRef requires snapshotId");
+      }
+      return;
+    case "wait":
+      validateWaitInput(input);
+      if (input.ref !== undefined && input.snapshotId === undefined) {
+        throw new Error("computer wait with ref requires snapshotId");
+      }
+      return;
+    case "app_open":
+    case "app_focus":
+      // Truthiness check: an empty app name is rejected as well.
+      if (!input.app) {
+        throw new Error(`computer ${input.action} requires app`);
+      }
+      return;
+    case "window_focus":
+    case "window_close":
+      if (!hasWindowTarget(input)) {
+        throw new Error(
+          `computer ${input.action} requires windowId or windowTitle`,
+        );
+      }
+      return;
+    case "window_move":
+      if (!hasWindowTarget(input)) {
+        throw new Error(
+          "computer window_move requires windowId or windowTitle",
+        );
+      }
+      if (input.x === undefined || input.y === undefined) {
+        throw new Error("computer window_move requires x and y");
+      }
+      return;
+    case "window_resize":
+      if (!hasWindowTarget(input)) {
+        throw new Error(
+          "computer window_resize requires windowId or windowTitle",
+        );
+      }
+      if (input.width === undefined || input.height === undefined) {
+        throw new Error("computer window_resize requires width and height");
+      }
+      return;
+    case "clipboard_write":
+      if (input.text === undefined) {
+        throw new Error("computer clipboard_write requires text");
+      }
+      return;
+    default: {
+      // Compile-time exhaustiveness check: this assignment stops type-checking
+      // if a new action is added without a matching case above.
+      const unsupportedAction: never = input.action;
+      throw new Error(`Unsupported computer action: ${unsupportedAction}`);
+    }
+  }
+}
+
+/**
+ * Returns the human-readable progress and success texts for an action.
+ *
+ * Only `app_open` and `app_focus` interpolate input (the `app` name); all
+ * other messages are fixed strings. The switch has no default branch and
+ * covers every member of `ComputerToolAction`.
+ *
+ * @returns `statusMessage` shown while the action runs and `successMessage`
+ *   used as the result text when the helper produces no output.
+ */
+function describeAction(input: ComputerToolInput): {
+  statusMessage: string;
+  successMessage: string;
+} {
+  switch (input.action) {
+    case "observe":
+      return {
+        statusMessage: "Observing desktop...",
+        successMessage: "Captured desktop snapshot",
+      };
+    case "click":
+      return {
+        statusMessage: "Clicking desktop target...",
+        successMessage: "Clicked desktop target",
+      };
+    case "type":
+      return {
+        statusMessage: "Typing into desktop...",
+        successMessage: "Typed into desktop",
+      };
+    case "hotkey":
+      return {
+        statusMessage: "Sending hotkey...",
+        successMessage: "Sent hotkey",
+      };
+    case "scroll":
+      return {
+        statusMessage: "Scrolling desktop...",
+        successMessage: "Scrolled desktop",
+      };
+    case "drag":
+      return {
+        statusMessage: "Dragging desktop target...",
+        successMessage: "Dragged desktop target",
+      };
+    case "wait":
+      return {
+        statusMessage: "Waiting for desktop state...",
+        successMessage: "Desktop wait condition satisfied",
+      };
+    case "app_list":
+      return {
+        statusMessage: "Listing apps...",
+        successMessage: "Listed apps",
+      };
+    case "app_open":
+      return {
+        statusMessage: `Opening app ${input.app}...`,
+        successMessage: `Opened app ${input.app}`,
+      };
+    case "app_focus":
+      return {
+        statusMessage: `Focusing app ${input.app}...`,
+        successMessage: `Focused app ${input.app}`,
+      };
+    case "window_list":
+      return {
+        statusMessage: "Listing windows...",
+        successMessage: "Listed windows",
+      };
+    case "window_focus":
+      return {
+        statusMessage: "Focusing window...",
+        successMessage: "Focused window",
+      };
+    case "window_move":
+      return {
+        statusMessage: "Moving window...",
+        successMessage: "Moved window",
+      };
+    case "window_resize":
+      return {
+        statusMessage: "Resizing window...",
+        successMessage: "Resized window",
+      };
+    case "window_close":
+      return {
+        statusMessage: "Closing window...",
+        successMessage: "Closed window",
+      };
+    case "clipboard_read":
+      return {
+        statusMessage: "Reading clipboard...",
+        successMessage: "Read clipboard",
+      };
+    case "clipboard_write":
+      return {
+        statusMessage: "Writing clipboard...",
+        successMessage: "Wrote clipboard",
+      };
+  }
+}
+
+/**
+ * Validates `input` and assembles the helper invocation for it.
+ *
+ * Side effect: creates the state directory on disk before returning.
+ * The entire input object is passed to the helper as one JSON string after
+ * `--input`, alongside `--state-dir <dir>`.
+ *
+ * @throws Error from `validateComputerInput` when the input is invalid; in
+ *   that case the state directory is not created.
+ */
+function buildComputerCommand(
+  cwd: string,
+  input: ComputerToolInput,
+  options?: ComputerToolOptions,
+): ComputerCommandContext {
+  validateComputerInput(input);
+
+  const stateDir = getComputerStateDir(cwd, options);
+  ensureComputerDir(stateDir);
+  const actionDescription = describeAction(input);
+
+  return {
+    action: input.action,
+    args: ["--state-dir", stateDir, "--input", JSON.stringify(input)],
+    statusMessage: actionDescription.statusMessage,
+    successMessage: actionDescription.successMessage,
+    stateDir,
+  };
+}
+
+/**
+ * Builds the error text for a helper run that did not exit with code 0.
+ *
+ * The summary line says "failed" when there is no exit code (`null`) and
+ * reports the code otherwise. When the helper produced output, that output
+ * comes first, followed by a blank line and the summary line.
+ */
+function buildComputerErrorMessage(
+  action: ComputerToolAction,
+  output: string,
+  exitCode: number | null,
+): string {
+  const base =
+    exitCode === null
+      ? `Computer action "${action}" failed`
+      : `Computer action "${action}" exited with code ${exitCode}`;
+  return output.length > 0 ? `${output}\n\n${base}` : base;
+}
+
+/**
+ * Returns the multi-line, newline-joined help text shown when spawning the
+ * helper fails with ENOENT. Only the first line mentions `command`; the
+ * remaining setup hints are fixed text.
+ */
+function getMissingComputerCommandMessage(command: string): string {
+  return [
+    `Computer tool could not find "${command}".`,
+    "Desktop sandboxes install agent-computer alongside the browser tool.",
+    "If you are running locally, either install the helper or omit the computer tool.",
+    "Recommended setup inside a sandbox image: copy agent-computer into /usr/local/bin and install xdotool, wmctrl, tesseract-ocr, and xclip.",
+  ].join("\n");
+}
+
+/**
+ * Interprets the helper's (already normalized) output.
+ *
+ * - Empty output yields empty `text`.
+ * - Output that parses as JSON is re-serialized with 2-space indentation and
+ *   `snapshotId` / `screenshotPath` are extracted, preferring the values
+ *   nested under `snapshot` over the top-level ones.
+ * - If anything inside the `try` throws (for example invalid JSON), the raw
+ *   output is returned unchanged as `text`.
+ *
+ * NOTE(review): the payload shape is asserted with `as`, not validated, so
+ * extracted fields are only as trustworthy as the helper's output. The
+ * declared `summary` field is not read here.
+ */
+function parseComputerPayload(output: string): {
+  text: string;
+  snapshotId?: string;
+  screenshotPath?: string;
+} {
+  if (output.length === 0) {
+    return { text: "" };
+  }
+
+  try {
+    const payload = JSON.parse(output) as {
+      snapshot?: { snapshotId?: string; screenshotPath?: string };
+      summary?: string;
+      screenshotPath?: string;
+      snapshotId?: string;
+    };
+    return {
+      text: JSON.stringify(payload, null, 2),
+      snapshotId: payload.snapshot?.snapshotId ?? payload.snapshotId,
+      screenshotPath:
+        payload.snapshot?.screenshotPath ?? payload.screenshotPath,
+    };
+  } catch {
+    // Not JSON (or not an object we can read): hand back the raw text.
+    return { text: output };
+  }
+}
+
+/**
+ * Creates the "computer" agent tool bound to a working directory.
+ *
+ * Each `execute` call validates the input, ensures the state directory
+ * exists, emits one progress update, runs the helper executable through
+ * `operations.exec`, and returns the helper's output as a single text block
+ * (or the action's success message when the helper printed nothing).
+ *
+ * @param cwd Working directory for the helper process; also the base for
+ *   resolving a relative state directory.
+ * @param options Optional overrides for the backend, command, timeout and
+ *   state directory.
+ * @returns The tool definition; `execute` throws an Error on invalid input,
+ *   a missing helper executable, abort, timeout, or any exit code other
+ *   than 0 (including `null`).
+ */
+export function createComputerTool(
+  cwd: string,
+  options?: ComputerToolOptions,
+): AgentTool<typeof computerSchema> {
+  const operations = options?.operations ?? defaultComputerOperations;
+  const command = options?.command ?? DEFAULT_COMPUTER_COMMAND;
+  const defaultTimeoutSeconds =
+    options?.defaultTimeoutSeconds ?? DEFAULT_COMPUTER_TIMEOUT_SECONDS;
+
+  return {
+    name: "computer",
+    label: "computer",
+    description:
+      "Use the desktop computer when browser DOM control is not enough: observe the screen, interact with windows and apps, type, click, drag, scroll, wait for native UI changes, and read or write the clipboard.",
+    parameters: computerSchema,
+    execute: async (_toolCallId, input, signal, onUpdate) => {
+      // Runs outside the try block, so validation errors propagate unchanged.
+      const commandContext = buildComputerCommand(cwd, input, options);
+      const details: ComputerToolDetails = {
+        action: commandContext.action,
+        command,
+        args: commandContext.args,
+        stateDir: commandContext.stateDir,
+      };
+
+      onUpdate?.({
+        content: [{ type: "text", text: commandContext.statusMessage }],
+        details,
+      });
+
+      const chunks: Buffer[] = [];
+
+      try {
+        // NOTE(review): the same timeout is used for every action; a `wait`
+        // whose `timeoutMs` exceeds it is not given extra time here — confirm
+        // whether that is intended.
+        const { exitCode } = await operations.exec(
+          command,
+          commandContext.args,
+          {
+            cwd,
+            env: getShellEnv(),
+            onData: (data) => chunks.push(data),
+            signal,
+            timeout: defaultTimeoutSeconds,
+          },
+        );
+
+        const output = normalizeOutput(chunks);
+        if (exitCode !== 0) {
+          throw new Error(
+            buildComputerErrorMessage(commandContext.action, output, exitCode),
+          );
+        }
+
+        const parsed = parseComputerPayload(output);
+        if (parsed.snapshotId) {
+          details.snapshotId = parsed.snapshotId;
+        }
+        if (parsed.screenshotPath) {
+          details.screenshotPath = parsed.screenshotPath;
+        }
+
+        return {
+          content: [
+            {
+              type: "text",
+              text:
+                parsed.text.length > 0
+                  ? parsed.text
+                  : commandContext.successMessage,
+            },
+          ],
+          details,
+        };
+      } catch (error) {
+        // Translate the backend's low-level failures into readable messages;
+        // anything unrecognised (including the non-zero-exit error thrown
+        // above) is rethrown as-is.
+        if (
+          error instanceof Error &&
+          "code" in error &&
+          error.code === "ENOENT"
+        ) {
+          throw new Error(getMissingComputerCommandMessage(command));
+        }
+        if (error instanceof Error && error.message === "aborted") {
+          throw new Error(`Computer action "${commandContext.action}" aborted`);
+        }
+        if (error instanceof Error && error.message.startsWith("timeout:")) {
+          // Message format is "timeout:<seconds>".
+          const seconds = error.message.split(":")[1];
+          throw new Error(
+            `Computer action "${commandContext.action}" timed out after ${seconds} seconds`,
+          );
+        }
+        throw error;
+      }
+    },
+  };
+}
+
+/**
+ * Default computer tool instance with default options, bound to the process
+ * working directory as it is when this module is first loaded.
+ */
+export const computerTool = createComputerTool(process.cwd());
--- a/packages/coding-agent/src/core/tools/index.ts
+++ b/packages/coding-agent/src/core/tools/index.ts
@ -19,6 +19,16 @@ export {
  browserTool,
  createBrowserTool,
 } from "./browser.js";
+export {
+  type ComputerObservationMode,
+  type ComputerOperations,
+  type ComputerToolAction,
+  type ComputerToolDetails,
+  type ComputerToolInput,
+  type ComputerToolOptions,
+  computerTool,
+  createComputerTool,
+} from "./computer.js";
 export {
  createEditTool,
  type EditOperations,
@ -84,6 +94,11 @@ import {
  createBrowserTool,
  type BrowserToolOptions,
 } from "./browser.js";
+import {
+  computerTool,
+  createComputerTool,
+  type ComputerToolOptions,
+} from "./computer.js";
 import { createEditTool, editTool } from "./edit.js";
 import { createFindTool, findTool } from "./find.js";
 import { createGrepTool, grepTool } from "./grep.js";
@ -102,6 +117,7 @@ export const allTools = {
  read: readTool,
  bash: bashTool,
  browser: browserTool,
+  computer: computerTool,
  edit: editTool,
  write: writeTool,
  grep: grepTool,
@ -115,6 +131,7 @@ export const defaultCodingToolNames: ToolName[] = [
  "read",
  "bash",
  "browser",
+  "computer",
  "edit",
  "write",
 ];
@ -131,19 +148,16 @@ export interface ToolsOptions {
  bash?: BashToolOptions;
  /** Options for the browser tool */
  browser?: BrowserToolOptions;
+  /** Options for the computer tool */
+  computer?: ComputerToolOptions;
 }

 /**
 * Create coding tools configured for a specific working directory.
 */
 export function createCodingTools(cwd: string, options?: ToolsOptions): Tool[] {
-  return [
-    createReadTool(cwd, options?.read),
-    createBashTool(cwd, options?.bash),
-    createBrowserTool(cwd, options?.browser),
-    createEditTool(cwd),
-    createWriteTool(cwd),
-  ];
+  const tools = createAllTools(cwd, options);
+  return defaultCodingToolNames.map((toolName) => tools[toolName]);
 }

 /**
@ -172,6 +186,7 @@ export function createAllTools(
    read: createReadTool(cwd, options?.read),
    bash: createBashTool(cwd, options?.bash),
    browser: createBrowserTool(cwd, options?.browser),
+    computer: createComputerTool(cwd, options?.computer),
    edit: createEditTool(cwd),
    write: createWriteTool(cwd),
    grep: createGrepTool(cwd),
--- a/packages/coding-agent/src/index.ts
+++ b/packages/coding-agent/src/index.ts
@ -182,6 +182,7 @@ export {
  createAgentSession,
  createBashTool,
  createBrowserTool,
+  createComputerTool,
  // Tool factories (for custom cwd)
  createCodingTools,
  createEditTool,
@ -253,6 +254,13 @@ export {
  type BrowserToolInput,
  type BrowserToolOptions,
  browserTool,
+  type ComputerObservationMode,
+  type ComputerOperations,
+  type ComputerToolAction,
+  type ComputerToolDetails,
+  type ComputerToolInput,
+  type ComputerToolOptions,
+  computerTool,
  codingTools,
  defaultCodingToolNames,
  DEFAULT_MAX_BYTES,
--- a/packages/coding-agent/test/computer-tool.test.ts
+++ b/packages/coding-agent/test/computer-tool.test.ts
@ -0,0 +1,339 @@
+import { spawnSync } from "node:child_process";
+import {
+  chmodSync,
+  existsSync,
+  mkdtempSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { tmpdir } from "node:os";
+import { join, resolve } from "node:path";
+import { afterEach, describe, expect, it } from "vitest";
+import { parseArgs } from "../src/cli/args.js";
+import { buildSystemPrompt } from "../src/core/system-prompt.js";
+import {
+  type ComputerOperations,
+  type ComputerToolDetails,
+  createAllTools,
+  createComputerTool,
+  defaultCodingToolNames,
+} from "../src/core/tools/index.js";
+
+interface TextBlock { // Content block carrying plain text; the only block kind getTextOutput reads.
+  type: "text";
+  text: string;
+}
+
+type ToolContentBlock = TextBlock | { type: string }; // Either a text block or any other block, known only by its type tag.
+
+interface ToolResultLike { // Subset of a tool result that these tests inspect.
+  content: ToolContentBlock[];
+  details?: unknown; // Left as unknown; tests cast it to ComputerToolDetails before reading fields.
+}
+
+interface ComputerExecCall { // One recorded invocation of the mocked ComputerOperations.exec.
+  command: string;
+  args: string[];
+  cwd: string;
+  env: NodeJS.ProcessEnv;
+  timeout?: number; // Copied from options.timeout as-is; the unit is not visible here — TODO confirm.
+}
+
+function getTextOutput(result: ToolResultLike): string { // Returns the text of every "text" block, newline-joined ("" when there are none).
+  return result.content
+    .filter((block): block is TextBlock => block.type === "text") // Type guard narrows to TextBlock so .text is accessible below.
+    .map((block) => block.text)
+    .join("\n");
+}
+
+function createMockComputerOperations( // Builds a fake ComputerOperations whose exec records each call instead of spawning a process.
+  output = "", // Text delivered to options.onData on each exec; nothing is emitted when empty.
+  exitCode: number | null = 0, // Exit code every exec call resolves with.
+): {
+  calls: ComputerExecCall[]; // The same array the mock appends to, in call order.
+  operations: ComputerOperations;
+} {
+  const calls: ComputerExecCall[] = [];
+
+  return {
+    calls,
+    operations: {
+      exec: async (command, args, options) => {
+        calls.push({
+          command,
+          args,
+          cwd: options.cwd,
+          env: options.env,
+          timeout: options.timeout,
+        });
+        if (output.length > 0) {
+          options.onData(Buffer.from(output, "utf-8")); // The whole output arrives as a single UTF-8 chunk.
+        }
+        return { exitCode };
+      },
+    },
+  };
+}
+
+function getAgentComputerScriptPath(): string { // Absolute path of the agent-computer.js helper script run by the spawnSync tests.
+  return resolve(
+    process.cwd(), // NOTE(review): resolved from the current working directory, so the result depends on where the test runner is launched — confirm.
+    "../../../../docker/companion/agent-computer.js",
+  );
+}
+
+describe("computer tool", () => {
+  const tempDirs: string[] = []; // Directories handed out by createTempDir and not yet cleaned up.
+
+  afterEach(() => { // After each test, removes every tracked temp directory and leaves the list empty.
+    while (tempDirs.length > 0) {
+      const tempDir = tempDirs.pop();
+      if (tempDir) {
+        rmSync(tempDir, { recursive: true, force: true }); // force: no error if the directory is already gone.
+      }
+    }
+  });
+
+  function createTempDir(prefix: string): string { // Creates a unique directory under the OS temp dir and registers it for afterEach cleanup.
+    const tempDir = mkdtempSync(join(tmpdir(), prefix));
+    tempDirs.push(tempDir);
+    return tempDir;
+  }
+
+  it("observes the desktop through the agent-computer helper", async () => { // Happy path: the mocked helper emits an observe result as JSON.
+    const cwd = createTempDir("coding-agent-computer-observe-");
+    const stateDir = join(cwd, "computer-state");
+    const { calls, operations } = createMockComputerOperations(
+      JSON.stringify({
+        ok: true,
+        action: "observe",
+        summary: "Captured desktop snapshot snap-1",
+        snapshot: {
+          snapshotId: "snap-1",
+          screenshotPath: "/tmp/snap-1.png",
+          backend: "hybrid",
+          activeWindow: null,
+          windows: [],
+          refs: [],
+        },
+      }),
+    );
+
+    const computerTool = createComputerTool(cwd, {
+      operations,
+      command: "agent-computer-test",
+      stateDir,
+    });
+
+    const result = (await computerTool.execute("computer-observe", {
+      action: "observe",
+    })) as ToolResultLike;
+
+    expect(calls).toHaveLength(1); // Exactly one helper invocation for one observe.
+    expect(calls[0]).toMatchObject({
+      command: "agent-computer-test",
+      args: ["--state-dir", stateDir, "--input", '{"action":"observe"}'], // The tool input is expected as a compact JSON argument after --input.
+      cwd,
+      timeout: 90,
+    });
+
+    const details = result.details as ComputerToolDetails | undefined;
+    expect(details?.stateDir).toBe(stateDir);
+    expect(details?.snapshotId).toBe("snap-1");
+    expect(details?.screenshotPath).toBe("/tmp/snap-1.png");
+    expect(getTextOutput(result)).toContain('"snapshotId": "snap-1"'); // Note the space after the colon: the expected text is not the compact JSON the mock emitted.
+  });
+
+  it("validates click targets before spawning the helper", async () => { // A click with no target must reject without any exec call.
+    const cwd = createTempDir("coding-agent-computer-click-");
+    const stateDir = join(cwd, "computer-state");
+    const { calls, operations } = createMockComputerOperations();
+
+    const computerTool = createComputerTool(cwd, {
+      operations,
+      stateDir,
+    });
+
+    await expect(
+      computerTool.execute("computer-click-missing-target", {
+        action: "click", // No snapshotId/ref and no x/y supplied.
+      }),
+    ).rejects.toThrow(
+      "computer click requires snapshotId and ref, or explicit x and y coordinates",
+    );
+
+    expect(calls).toHaveLength(0); // The mock recorded nothing, so the helper was never invoked.
+  });
+
+  it("rejects unsafe snapshot ids before spawning the helper", async () => { // Tool-side check: a traversal-style id must reject without any exec call.
+    const cwd = createTempDir("coding-agent-computer-snapshot-id-");
+    const stateDir = join(cwd, "computer-state");
+    const { calls, operations } = createMockComputerOperations();
+
+    const computerTool = createComputerTool(cwd, {
+      operations,
+      stateDir,
+    });
+
+    await expect(
+      computerTool.execute("computer-click-invalid-snapshot", {
+        action: "click",
+        snapshotId: "../../auth", // Path-traversal style id.
+        ref: "w1",
+      }),
+    ).rejects.toThrow('Invalid computer snapshotId: "../../auth"');
+
+    expect(calls).toHaveLength(0); // The mock recorded nothing, so the helper was never invoked.
+  });
+
+  it("accepts computer in --tools and exposes it in built-in tool wiring", () => { // Covers CLI parsing, the default tool list and the createAllTools map.
+    const parsed = parseArgs(["--tools", "computer,read"]);
+    expect(parsed.tools).toEqual(["computer", "read"]); // Order of the comma-separated list is preserved.
+
+    expect(defaultCodingToolNames).toContain("computer");
+    expect(createAllTools(process.cwd()).computer.name).toBe("computer");
+  });
+
+  it("mentions computer in the default system prompt", () => { // Checks three prompt fragments by substring, so exact wording matters.
+    const prompt = buildSystemPrompt(); // Called with no arguments, i.e. default prompt settings.
+
+    expect(prompt).toContain(
+      "- computer: Use the desktop computer: observe the screen",
+    );
+    expect(prompt).toContain(
+      "Computer: observe before interacting. Use it for native UI",
+    );
+    expect(prompt).toContain(
+      "Prefer browser for websites and DOM-aware tasks. Switch to computer",
+    );
+  });
+
+  it("rejects accessibility observe mode until a non-screenshot backend exists", () => { // Runs the real helper script, not the mock.
+    const stateDir = createTempDir(
+      "coding-agent-computer-helper-accessibility-",
+    );
+    const result = spawnSync(
+      process.execPath, // Same Node binary that is running the tests.
+      [
+        "--no-warnings",
+        getAgentComputerScriptPath(),
+        "--state-dir",
+        stateDir,
+        "--input",
+        JSON.stringify({
+          action: "observe",
+          mode: "accessibility",
+        }),
+      ],
+      {
+        encoding: "utf8", // Makes result.stderr a string for the substring check below.
+      },
+    );
+
+    expect(result.status).not.toBe(0);
+    expect(result.stderr).toContain(
+      "backend_unavailable: accessibility observe mode is not implemented",
+    );
+  });
+
+  it("refuses to shell out when app_open cannot match an installed app", () => { // Runs the real helper script with a hostile app name.
+    const stateDir = createTempDir("coding-agent-computer-helper-app-open-");
+    const markerPath = join(stateDir, "should-not-exist");
+    const result = spawnSync(
+      process.execPath,
+      [
+        "--no-warnings",
+        getAgentComputerScriptPath(),
+        "--state-dir",
+        stateDir,
+        "--input",
+        JSON.stringify({
+          action: "app_open",
+          app: `definitely-not-an-installed-app && touch ${markerPath}`, // Injection probe: if this string reached a shell, touch would create markerPath.
+        }),
+      ],
+      {
+        encoding: "utf8",
+      },
+    );
+
+    expect(result.status).not.toBe(0);
+    expect(result.stderr).toContain("app_not_found:");
+    expect(existsSync(markerPath)).toBe(false); // The marker must not exist, i.e. the touch never ran.
+  });
+
+  it("rejects snapshot path traversal inside the helper", () => { // Helper-side counterpart of the tool-side snapshot id check.
+    const stateDir = createTempDir("coding-agent-computer-helper-snapshot-id-");
+    const result = spawnSync(
+      process.execPath,
+      [
+        "--no-warnings",
+        getAgentComputerScriptPath(),
+        "--state-dir",
+        stateDir,
+        "--input",
+        JSON.stringify({
+          action: "click",
+          snapshotId: "../../auth", // Path-traversal style id passed straight to the helper.
+          ref: "w1",
+        }),
+      ],
+      {
+        encoding: "utf8",
+      },
+    );
+
+    expect(result.status).not.toBe(0);
+    expect(result.stderr).toContain("invalid_snapshot_id: ../../auth");
+  });
+
+  it("passes typed text after the xdotool option separator", () => { // Uses a fake xdotool executable to capture the arguments the helper passes.
+    const stateDir = createTempDir("coding-agent-computer-helper-type-");
+    const binDir = createTempDir("coding-agent-computer-helper-bin-");
+    const argsPath = join(stateDir, "xdotool-args.json");
+    const xdotoolPath = join(binDir, "xdotool");
+    writeFileSync( // Fake xdotool: a Node script that dumps its argv as JSON to TEST_XDOTOOL_ARGS_PATH.
+      xdotoolPath,
+      `#!/usr/bin/env node
+const { writeFileSync } = require("node:fs");
+writeFileSync(process.env.TEST_XDOTOOL_ARGS_PATH, JSON.stringify(process.argv.slice(2)));
+`,
+      "utf8",
+    );
+    chmodSync(xdotoolPath, 0o755); // Make the fake script executable.
+
+    const result = spawnSync(
+      process.execPath,
+      [
+        "--no-warnings",
+        getAgentComputerScriptPath(),
+        "--state-dir",
+        stateDir,
+        "--input",
+        JSON.stringify({
+          action: "type",
+          text: "--delay", // Typed text that looks like an xdotool option.
+        }),
+      ],
+      {
+        encoding: "utf8",
+        env: {
+          ...process.env,
+          PATH: `${binDir}:${process.env.PATH ?? ""}`, // binDir goes first on PATH, ahead of any existing entries.
+          TEST_XDOTOOL_ARGS_PATH: argsPath,
+        },
+      },
+    );
+
+    expect(result.status).toBe(0);
+    expect(JSON.parse(readFileSync(argsPath, "utf8"))).toEqual([
+      "type",
+      "--delay",
+      "12",
+      "--clearmodifiers",
+      "--", // Separator is expected before the text, so the text is the final argument.
+      "--delay",
+    ]);
+  });
+});
--- a/packages/companion-teams/src/adapters/tmux-adapter.test.ts
+++ b/packages/companion-teams/src/adapters/tmux-adapter.test.ts
@ -4,11 +4,10 @@ import { TmuxAdapter } from "./tmux-adapter";

 describe("TmuxAdapter", () => {
  let adapter: TmuxAdapter;
-  let mockExecCommand: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    adapter = new TmuxAdapter();
-    mockExecCommand = vi.spyOn(terminalAdapter, "execCommand");
+    vi.spyOn(terminalAdapter, "execCommand");
    delete process.env.TMUX;
    delete process.env.ZELLIJ;
    delete process.env.WEZTERM_PANE;
@ -21,6 +20,7 @@ describe("TmuxAdapter", () => {
  });

  it("detects tmux in headless runtimes when the binary is available", () => {
+    const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
    mockExecCommand.mockReturnValue({
      stdout: "tmux 3.4",
      stderr: "",
@ -33,6 +33,7 @@ describe("TmuxAdapter", () => {

  it("does not detect tmux in GUI terminals just because the binary exists", () => {
    process.env.COLORTERM = "truecolor";
+    const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
    mockExecCommand.mockReturnValue({
      stdout: "tmux 3.4",
      stderr: "",
@ -44,7 +45,8 @@ describe("TmuxAdapter", () => {
  });

  it("creates a detached team session when not already inside tmux", () => {
-    mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
+    const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
+    mockExecCommand.mockImplementation((_bin, args) => {
      if (args[0] === "has-session") {
        return { stdout: "", stderr: "missing", status: 1 };
      }
@ -65,12 +67,18 @@ describe("TmuxAdapter", () => {

    expect(mockExecCommand).toHaveBeenCalledWith(
      "tmux",
-      expect.arrayContaining(["new-session", "-d", "-s", "companion-teams-demo"]),
+      expect.arrayContaining([
+        "new-session",
+        "-d",
+        "-s",
+        "companion-teams-demo",
+      ]),
    );
  });

  it("splits an existing detached session when not already inside tmux", () => {
-    mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
+    const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
+    mockExecCommand.mockImplementation((_bin, args) => {
      if (args[0] === "has-session") {
        return { stdout: "", stderr: "", status: 0 };
      }
@ -96,6 +104,7 @@ describe("TmuxAdapter", () => {
  });

  it("checks pane liveness by pane id", () => {
+    const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
    mockExecCommand.mockReturnValue({
      stdout: "%1\n%7\n",
      stderr: "",
--- a/packages/companion-teams/src/adapters/wezterm-adapter.test.ts
+++ b/packages/companion-teams/src/adapters/wezterm-adapter.test.ts
@ -8,11 +8,10 @@ import { WezTermAdapter } from "./wezterm-adapter";

 describe("WezTermAdapter", () => {
  let adapter: WezTermAdapter;
-  let mockExecCommand: ReturnType<typeof vi.spyOn>;

  beforeEach(() => {
    adapter = new WezTermAdapter();
-    mockExecCommand = vi.spyOn(terminalAdapter, "execCommand");
+    vi.spyOn(terminalAdapter, "execCommand");
    delete process.env.WEZTERM_PANE;
    delete process.env.TMUX;
    delete process.env.ZELLIJ;
@ -31,6 +30,7 @@ describe("WezTermAdapter", () => {

  describe("detect", () => {
    it("should detect when WEZTERM_PANE is set", () => {
+      const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
      mockExecCommand.mockReturnValue({
        stdout: "version 1.0",
        stderr: "",
@ -43,7 +43,8 @@ describe("WezTermAdapter", () => {
  describe("spawn", () => {
    it("should spawn first pane to the right with 50%", () => {
      // Mock getPanes finding only current pane
-      mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
+      const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
+      mockExecCommand.mockImplementation((_bin, args) => {
        if (args.includes("list")) {
          return {
            stdout: JSON.stringify([{ pane_id: 0, tab_id: 0 }]),
@ -79,7 +80,8 @@ describe("WezTermAdapter", () => {

    it("should spawn subsequent panes by splitting the sidebar", () => {
      // Mock getPanes finding current pane (0) and sidebar pane (1)
-      mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
+      const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
+      mockExecCommand.mockImplementation((_bin, args) => {
        if (args.includes("list")) {
          return {
            stdout: JSON.stringify([