diff --git a/packages/coding-agent/src/cli/args.ts b/packages/coding-agent/src/cli/args.ts index 71cb16f..f36aa6d 100644 --- a/packages/coding-agent/src/cli/args.ts +++ b/packages/coding-agent/src/cli/args.ts @@ -325,6 +325,7 @@ ${chalk.bold("Environment Variables:")} AWS_REGION - AWS region for Amazon Bedrock (e.g., us-east-1) ${ENV_AGENT_DIR.padEnd(32)} - Session storage directory (default: ~/${CONFIG_DIR_NAME}/agent) COMPANION_PACKAGE_DIR - Override package directory (for Nix/Guix store paths) + COMPANION_AGENT_COMPUTER_COMMAND - Override the computer helper command (default: agent-computer) COMPANION_OFFLINE - Disable startup network operations when set to 1/true/yes COMPANION_SHARE_VIEWER_URL - Base URL for /share command (default: https://companion.dev/session/) COMPANION_AI_ANTIGRAVITY_VERSION - Override Antigravity User-Agent version (e.g., 1.23.0) @@ -333,6 +334,7 @@ ${chalk.bold(`Available Tools (default: ${defaultToolsText}):`)} read - Read file contents bash - Execute bash commands browser - Browser automation with persistent state + computer - Desktop computer automation with screen observation and native UI control edit - Edit files with find/replace write - Write files (creates/overwrites) grep - Search file contents (read-only, off by default) diff --git a/packages/coding-agent/src/core/sdk.ts b/packages/coding-agent/src/core/sdk.ts index 2796394..054fceb 100644 --- a/packages/coding-agent/src/core/sdk.ts +++ b/packages/coding-agent/src/core/sdk.ts @@ -26,10 +26,12 @@ import { allTools, bashTool, browserTool, + computerTool, codingTools, defaultCodingToolNames, createBashTool, createBrowserTool, + createComputerTool, createCodingTools, createEditTool, createFindTool, @@ -67,7 +69,7 @@ export interface CreateAgentSessionOptions { /** Models available for cycling (Ctrl+P in interactive mode) */ scopedModels?: Array<{ model: Model; thinkingLevel?: ThinkingLevel }>; - /** Built-in tools to use. Default: codingTools [read, bash, browser, edit, write] */ + /** Built-in tools to use. Default: codingTools [read, bash, browser, computer, edit, write] */ tools?: Tool[]; /** Custom tools to register (in addition to built-in tools). */ customTools?: ToolDefinition[]; @@ -113,6 +115,7 @@ export { readTool, bashTool, browserTool, + computerTool, editTool, writeTool, grepTool, @@ -127,6 +130,7 @@ export { createReadTool, createBashTool, createBrowserTool, + createComputerTool, createEditTool, createWriteTool, createGrepTool, diff --git a/packages/coding-agent/src/core/system-prompt.ts b/packages/coding-agent/src/core/system-prompt.ts index e2c825b..c72ea82 100644 --- a/packages/coding-agent/src/core/system-prompt.ts +++ b/packages/coding-agent/src/core/system-prompt.ts @@ -11,6 +11,8 @@ const toolDescriptions: Record = { bash: "Run shell commands", browser: "Browse the web: open, snapshot, click, fill, wait, screenshot, save/load state", + computer: + "Use the desktop computer: observe the screen, click, type, send hotkeys, manage apps/windows, wait for native UI, and read/write the clipboard", edit: "Surgical file edits (find exact text, replace it)", write: "Create new files or completely rewrite existing ones", grep: "Search file contents by regex (respects .gitignore)", @@ -167,6 +169,7 @@ export function buildSystemPrompt( const hasBash = tools.includes("bash"); const hasBrowser = tools.includes("browser"); + const hasComputer = tools.includes("computer"); const hasEdit = tools.includes("edit"); const hasWrite = tools.includes("write"); const hasGrep = tools.includes("grep"); @@ -215,6 +218,16 @@ export function buildSystemPrompt( "Browser: snapshot before interacting with elements. Use it for research and learning too, not just automation", ); } + if (hasComputer) { + addGuideline( + "Computer: observe before interacting. Use it for native UI, desktop apps, file pickers, downloads, and OS dialogs", + ); + } + if (hasBrowser && hasComputer) { + addGuideline( + "Prefer browser for websites and DOM-aware tasks. Switch to computer when native UI or desktop state matters", + ); + } // Output hygiene if (hasEdit || hasWrite) { diff --git a/packages/coding-agent/src/core/tools/computer.ts b/packages/coding-agent/src/core/tools/computer.ts new file mode 100644 index 0000000..f8ea171 --- /dev/null +++ b/packages/coding-agent/src/core/tools/computer.ts @@ -0,0 +1,677 @@ +import { spawn } from "node:child_process"; +import { mkdirSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { AgentTool } from "@mariozechner/companion-agent-core"; +import { type Static, Type } from "@sinclair/typebox"; +import { getAgentDir } from "../../config.js"; +import { + getShellEnv, + killProcessTree, + sanitizeBinaryOutput, +} from "../../utils/shell.js"; + +const computerActions = [ + "observe", + "click", + "type", + "hotkey", + "scroll", + "drag", + "wait", + "app_list", + "app_open", + "app_focus", + "window_list", + "window_focus", + "window_move", + "window_resize", + "window_close", + "clipboard_read", + "clipboard_write", +] as const; + +const computerObservationModes = ["hybrid", "ocr"] as const; +const computerSnapshotIdPattern = /^[A-Za-z0-9_-]+$/; + +const DEFAULT_COMPUTER_COMMAND = + process.env.COMPANION_AGENT_COMPUTER_COMMAND || "agent-computer"; +const DEFAULT_COMPUTER_TIMEOUT_SECONDS = 90; + +const computerSchema = Type.Object({ + action: Type.Union( + computerActions.map((action) => Type.Literal(action)), + { description: "Computer action to execute" }, + ), + snapshotId: Type.Optional( + Type.String({ description: "Snapshot ID returned from observe" }), + ), + ref: Type.Optional( + Type.String({ + description: + "Target ref from observe output, such as w1 for a window or t3 for OCR text", + }), + ), + x: Type.Optional(Type.Number({ description: "Target x coordinate" })), + y: Type.Optional(Type.Number({ description: "Target y coordinate" })), + toRef: Type.Optional( + Type.String({ description: "Destination ref for drag actions" }), + ), + toX: Type.Optional( + Type.Number({ description: "Destination x coordinate for drag actions" }), + ), + toY: Type.Optional( + Type.Number({ description: "Destination y coordinate for drag actions" }), + ), + text: Type.Optional( + Type.String({ + description: + "Text to type, text to wait for, or clipboard contents depending on action", + }), + ), + keys: Type.Optional( + Type.Array(Type.String(), { + description: "Hotkey chord or key sequence, for example ['ctrl', 'l']", + minItems: 1, + }), + ), + app: Type.Optional( + Type.String({ + description: + "Installed app or running app name/class for app_open, app_focus, and wait", + }), + ), + windowId: Type.Optional( + Type.String({ description: "Window ID, such as 0x04200007" }), + ), + windowTitle: Type.Optional( + Type.String({ description: "Window title substring to match" }), + ), + mode: Type.Optional( + Type.Union( + computerObservationModes.map((mode) => Type.Literal(mode)), + { description: "Observation mode. Defaults to hybrid." }, + ), + ), + amount: Type.Optional( + Type.Number({ + description: + "Scroll amount in wheel steps. Positive scrolls down/right, negative scrolls up/left.", + }), + ), + width: Type.Optional( + Type.Number({ description: "Target window width for resize actions" }), + ), + height: Type.Optional( + Type.Number({ description: "Target window height for resize actions" }), + ), + clear: Type.Optional( + Type.Boolean({ + description: "Clear the active input field before typing", + }), + ), + button: Type.Optional( + Type.Number({ + description: "Mouse button for click or drag. Defaults to 1.", + minimum: 1, + maximum: 7, + }), + ), + timeoutMs: Type.Optional( + Type.Number({ + description: "Wait timeout in milliseconds for observe-derived waits", + minimum: 0, + }), + ), + intervalMs: Type.Optional( + Type.Number({ + description: "Polling interval for wait actions in milliseconds", + minimum: 10, + }), + ), +}); + +export type ComputerToolAction = (typeof computerActions)[number]; +export type ComputerObservationMode = (typeof computerObservationModes)[number]; +export type ComputerToolInput = Static; + +export interface ComputerToolDetails { + action: ComputerToolAction; + command: string; + args: string[]; + stateDir: string; + snapshotId?: string; + screenshotPath?: string; +} + +export interface ComputerOperations { + exec: ( + command: string, + args: string[], + options: { + cwd: string; + env: NodeJS.ProcessEnv; + onData: (data: Buffer) => void; + signal?: AbortSignal; + timeout?: number; + }, + ) => Promise<{ exitCode: number | null }>; +} + +const defaultComputerOperations: ComputerOperations = { + exec: (command, args, { cwd, env, onData, signal, timeout }) => { + return new Promise((resolvePromise, rejectPromise) => { + const child = spawn(command, args, { + cwd, + detached: true, + env, + stdio: ["ignore", "pipe", "pipe"], + }); + + let timedOut = false; + let timeoutHandle: NodeJS.Timeout | undefined; + + if (timeout !== undefined && timeout > 0) { + timeoutHandle = setTimeout(() => { + timedOut = true; + if (child.pid) { + killProcessTree(child.pid); + } + }, timeout * 1000); + } + + if (child.stdout) { + child.stdout.on("data", onData); + } + if (child.stderr) { + child.stderr.on("data", onData); + } + + const onAbort = () => { + if (child.pid) { + killProcessTree(child.pid); + } + }; + + if (signal) { + if (signal.aborted) { + onAbort(); + } else { + signal.addEventListener("abort", onAbort, { once: true }); + } + } + + child.on("error", (error) => { + if (timeoutHandle) clearTimeout(timeoutHandle); + if (signal) signal.removeEventListener("abort", onAbort); + rejectPromise(error); + }); + + child.on("close", (code) => { + if (timeoutHandle) clearTimeout(timeoutHandle); + if (signal) signal.removeEventListener("abort", onAbort); + + if (signal?.aborted) { + rejectPromise(new Error("aborted")); + return; + } + + if (timedOut) { + rejectPromise(new Error(`timeout:${timeout}`)); + return; + } + + resolvePromise({ exitCode: code }); + }); + }); + }, +}; + +export interface ComputerToolOptions { + operations?: ComputerOperations; + command?: string; + defaultTimeoutSeconds?: number; + stateDir?: string; + agentDir?: string; +} + +interface ComputerCommandContext { + action: ComputerToolAction; + args: string[]; + statusMessage: string; + successMessage: string; + stateDir: string; +} + +function resolveCommandPath(cwd: string, inputPath: string): string { + return resolve(cwd, inputPath); +} + +function getComputerRootDir(options?: ComputerToolOptions): string { + const baseAgentDir = options?.agentDir ?? getAgentDir(); + return join(baseAgentDir, "computer"); +} + +function getComputerStateDir( + cwd: string, + options?: ComputerToolOptions, +): string { + const stateDir = options?.stateDir ?? getComputerRootDir(options); + return resolveCommandPath(cwd, stateDir); +} + +function ensureComputerDir(stateDir: string): void { + mkdirSync(stateDir, { recursive: true }); +} + +function normalizeOutput(chunks: Buffer[]): string { + return sanitizeBinaryOutput(Buffer.concat(chunks).toString("utf-8")).trim(); +} + +function hasCoordinateTarget(input: ComputerToolInput): boolean { + return input.x !== undefined && input.y !== undefined; +} + +function hasRefTarget(input: ComputerToolInput): boolean { + return input.snapshotId !== undefined && input.ref !== undefined; +} + +function hasWindowTarget(input: ComputerToolInput): boolean { + return input.windowId !== undefined || input.windowTitle !== undefined; +} + +function hasDragDestination(input: ComputerToolInput): boolean { + return ( + input.toRef !== undefined || + (input.toX !== undefined && input.toY !== undefined) + ); +} + +function validateSnapshotId(snapshotId: string): void { + if (!computerSnapshotIdPattern.test(snapshotId)) { + throw new Error(`Invalid computer snapshotId: "${snapshotId}"`); + } +} + +function validateWaitInput(input: ComputerToolInput): void { + const targetCount = + (input.ref !== undefined ? 1 : 0) + + (input.text !== undefined ? 1 : 0) + + (input.app !== undefined ? 1 : 0) + + (input.windowId !== undefined ? 1 : 0) + + (input.windowTitle !== undefined ? 1 : 0); + + if (targetCount === 0 && input.timeoutMs === undefined) { + throw new Error( + "computer wait requires one of ref, text, app, windowId, windowTitle, or timeoutMs", + ); + } + + if (targetCount > 1) { + throw new Error( + "computer wait requires exactly one of ref, text, app, windowId, or windowTitle", + ); + } +} + +function validateComputerInput(input: ComputerToolInput): void { + if (input.snapshotId !== undefined) { + validateSnapshotId(input.snapshotId); + } + + switch (input.action) { + case "observe": + case "app_list": + case "window_list": + case "clipboard_read": + return; + case "click": + if (!hasRefTarget(input) && !hasCoordinateTarget(input)) { + throw new Error( + "computer click requires snapshotId and ref, or explicit x and y coordinates", + ); + } + return; + case "type": + if (input.text === undefined) { + throw new Error("computer type requires text"); + } + if (input.ref !== undefined && input.snapshotId === undefined) { + throw new Error("computer type with ref requires snapshotId"); + } + return; + case "hotkey": + if (!input.keys || input.keys.length === 0) { + throw new Error("computer hotkey requires keys"); + } + return; + case "scroll": + if (input.amount === undefined || input.amount === 0) { + throw new Error("computer scroll requires a non-zero amount"); + } + if (input.ref !== undefined && input.snapshotId === undefined) { + throw new Error("computer scroll with ref requires snapshotId"); + } + return; + case "drag": + if (!hasRefTarget(input) && !hasCoordinateTarget(input)) { + throw new Error( + "computer drag requires a starting target via snapshotId and ref, or x and y coordinates", + ); + } + if (!hasDragDestination(input)) { + throw new Error( + "computer drag requires a destination via toRef, or explicit toX and toY coordinates", + ); + } + if (input.toRef !== undefined && input.snapshotId === undefined) { + throw new Error("computer drag with toRef requires snapshotId"); + } + return; + case "wait": + validateWaitInput(input); + if (input.ref !== undefined && input.snapshotId === undefined) { + throw new Error("computer wait with ref requires snapshotId"); + } + return; + case "app_open": + case "app_focus": + if (!input.app) { + throw new Error(`computer ${input.action} requires app`); + } + return; + case "window_focus": + case "window_close": + if (!hasWindowTarget(input)) { + throw new Error( + `computer ${input.action} requires windowId or windowTitle`, + ); + } + return; + case "window_move": + if (!hasWindowTarget(input)) { + throw new Error( + "computer window_move requires windowId or windowTitle", + ); + } + if (input.x === undefined || input.y === undefined) { + throw new Error("computer window_move requires x and y"); + } + return; + case "window_resize": + if (!hasWindowTarget(input)) { + throw new Error( + "computer window_resize requires windowId or windowTitle", + ); + } + if (input.width === undefined || input.height === undefined) { + throw new Error("computer window_resize requires width and height"); + } + return; + case "clipboard_write": + if (input.text === undefined) { + throw new Error("computer clipboard_write requires text"); + } + return; + default: { + const unsupportedAction: never = input.action; + throw new Error(`Unsupported computer action: ${unsupportedAction}`); + } + } +} + +function describeAction(input: ComputerToolInput): { + statusMessage: string; + successMessage: string; +} { + switch (input.action) { + case "observe": + return { + statusMessage: "Observing desktop...", + successMessage: "Captured desktop snapshot", + }; + case "click": + return { + statusMessage: "Clicking desktop target...", + successMessage: "Clicked desktop target", + }; + case "type": + return { + statusMessage: "Typing into desktop...", + successMessage: "Typed into desktop", + }; + case "hotkey": + return { + statusMessage: "Sending hotkey...", + successMessage: "Sent hotkey", + }; + case "scroll": + return { + statusMessage: "Scrolling desktop...", + successMessage: "Scrolled desktop", + }; + case "drag": + return { + statusMessage: "Dragging desktop target...", + successMessage: "Dragged desktop target", + }; + case "wait": + return { + statusMessage: "Waiting for desktop state...", + successMessage: "Desktop wait condition satisfied", + }; + case "app_list": + return { + statusMessage: "Listing apps...", + successMessage: "Listed apps", + }; + case "app_open": + return { + statusMessage: `Opening app ${input.app}...`, + successMessage: `Opened app ${input.app}`, + }; + case "app_focus": + return { + statusMessage: `Focusing app ${input.app}...`, + successMessage: `Focused app ${input.app}`, + }; + case "window_list": + return { + statusMessage: "Listing windows...", + successMessage: "Listed windows", + }; + case "window_focus": + return { + statusMessage: "Focusing window...", + successMessage: "Focused window", + }; + case "window_move": + return { + statusMessage: "Moving window...", + successMessage: "Moved window", + }; + case "window_resize": + return { + statusMessage: "Resizing window...", + successMessage: "Resized window", + }; + case "window_close": + return { + statusMessage: "Closing window...", + successMessage: "Closed window", + }; + case "clipboard_read": + return { + statusMessage: "Reading clipboard...", + successMessage: "Read clipboard", + }; + case "clipboard_write": + return { + statusMessage: "Writing clipboard...", + successMessage: "Wrote clipboard", + }; + } +} + +function buildComputerCommand( + cwd: string, + input: ComputerToolInput, + options?: ComputerToolOptions, +): ComputerCommandContext { + validateComputerInput(input); + + const stateDir = getComputerStateDir(cwd, options); + ensureComputerDir(stateDir); + const actionDescription = describeAction(input); + + return { + action: input.action, + args: ["--state-dir", stateDir, "--input", JSON.stringify(input)], + statusMessage: actionDescription.statusMessage, + successMessage: actionDescription.successMessage, + stateDir, + }; +} + +function buildComputerErrorMessage( + action: ComputerToolAction, + output: string, + exitCode: number | null, +): string { + const base = + exitCode === null + ? `Computer action "${action}" failed` + : `Computer action "${action}" exited with code ${exitCode}`; + return output.length > 0 ? `${output}\n\n${base}` : base; +} + +function getMissingComputerCommandMessage(command: string): string { + return [ + `Computer tool could not find "${command}".`, + "Desktop sandboxes install agent-computer alongside the browser tool.", + "If you are running locally, either install the helper or omit the computer tool.", + "Recommended setup inside a sandbox image: copy agent-computer into /usr/local/bin and install xdotool, wmctrl, tesseract-ocr, and xclip.", + ].join("\n"); +} + +function parseComputerPayload(output: string): { + text: string; + snapshotId?: string; + screenshotPath?: string; +} { + if (output.length === 0) { + return { text: "" }; + } + + try { + const payload = JSON.parse(output) as { + snapshot?: { snapshotId?: string; screenshotPath?: string }; + summary?: string; + screenshotPath?: string; + snapshotId?: string; + }; + return { + text: JSON.stringify(payload, null, 2), + snapshotId: payload.snapshot?.snapshotId ?? payload.snapshotId, + screenshotPath: + payload.snapshot?.screenshotPath ?? payload.screenshotPath, + }; + } catch { + return { text: output }; + } +} + +export function createComputerTool( + cwd: string, + options?: ComputerToolOptions, +): AgentTool { + const operations = options?.operations ?? defaultComputerOperations; + const command = options?.command ?? DEFAULT_COMPUTER_COMMAND; + const defaultTimeoutSeconds = + options?.defaultTimeoutSeconds ?? DEFAULT_COMPUTER_TIMEOUT_SECONDS; + + return { + name: "computer", + label: "computer", + description: + "Use the desktop computer when browser DOM control is not enough: observe the screen, interact with windows and apps, type, click, drag, scroll, wait for native UI changes, and read or write the clipboard.", + parameters: computerSchema, + execute: async (_toolCallId, input, signal, onUpdate) => { + const commandContext = buildComputerCommand(cwd, input, options); + const details: ComputerToolDetails = { + action: commandContext.action, + command, + args: commandContext.args, + stateDir: commandContext.stateDir, + }; + + onUpdate?.({ + content: [{ type: "text", text: commandContext.statusMessage }], + details, + }); + + const chunks: Buffer[] = []; + + try { + const { exitCode } = await operations.exec( + command, + commandContext.args, + { + cwd, + env: getShellEnv(), + onData: (data) => chunks.push(data), + signal, + timeout: defaultTimeoutSeconds, + }, + ); + + const output = normalizeOutput(chunks); + if (exitCode !== 0) { + throw new Error( + buildComputerErrorMessage(commandContext.action, output, exitCode), + ); + } + + const parsed = parseComputerPayload(output); + if (parsed.snapshotId) { + details.snapshotId = parsed.snapshotId; + } + if (parsed.screenshotPath) { + details.screenshotPath = parsed.screenshotPath; + } + + return { + content: [ + { + type: "text", + text: + parsed.text.length > 0 + ? parsed.text + : commandContext.successMessage, + }, + ], + details, + }; + } catch (error) { + if ( + error instanceof Error && + "code" in error && + error.code === "ENOENT" + ) { + throw new Error(getMissingComputerCommandMessage(command)); + } + if (error instanceof Error && error.message === "aborted") { + throw new Error(`Computer action "${commandContext.action}" aborted`); + } + if (error instanceof Error && error.message.startsWith("timeout:")) { + const seconds = error.message.split(":")[1]; + throw new Error( + `Computer action "${commandContext.action}" timed out after ${seconds} seconds`, + ); + } + throw error; + } + }, + }; +} + +export const computerTool = createComputerTool(process.cwd()); diff --git a/packages/coding-agent/src/core/tools/index.ts b/packages/coding-agent/src/core/tools/index.ts index 7f89267..d2e1588 100644 --- a/packages/coding-agent/src/core/tools/index.ts +++ b/packages/coding-agent/src/core/tools/index.ts @@ -19,6 +19,16 @@ export { browserTool, createBrowserTool, } from "./browser.js"; +export { + type ComputerObservationMode, + type ComputerOperations, + type ComputerToolAction, + type ComputerToolDetails, + type ComputerToolInput, + type ComputerToolOptions, + computerTool, + createComputerTool, +} from "./computer.js"; export { createEditTool, type EditOperations, @@ -84,6 +94,11 @@ import { createBrowserTool, type BrowserToolOptions, } from "./browser.js"; +import { + computerTool, + createComputerTool, + type ComputerToolOptions, +} from "./computer.js"; import { createEditTool, editTool } from "./edit.js"; import { createFindTool, findTool } from "./find.js"; import { createGrepTool, grepTool } from "./grep.js"; @@ -102,6 +117,7 @@ export const allTools = { read: readTool, bash: bashTool, browser: browserTool, + computer: computerTool, edit: editTool, write: writeTool, grep: grepTool, @@ -115,6 +131,7 @@ export const defaultCodingToolNames: ToolName[] = [ "read", "bash", "browser", + "computer", "edit", "write", ]; @@ -131,19 +148,16 @@ export interface ToolsOptions { bash?: BashToolOptions; /** Options for the browser tool */ browser?: BrowserToolOptions; + /** Options for the computer tool */ + computer?: ComputerToolOptions; } /** * Create coding tools configured for a specific working directory. */ export function createCodingTools(cwd: string, options?: ToolsOptions): Tool[] { - return [ - createReadTool(cwd, options?.read), - createBashTool(cwd, options?.bash), - createBrowserTool(cwd, options?.browser), - createEditTool(cwd), - createWriteTool(cwd), - ]; + const tools = createAllTools(cwd, options); + return defaultCodingToolNames.map((toolName) => tools[toolName]); } /** @@ -172,6 +186,7 @@ export function createAllTools( read: createReadTool(cwd, options?.read), bash: createBashTool(cwd, options?.bash), browser: createBrowserTool(cwd, options?.browser), + computer: createComputerTool(cwd, options?.computer), edit: createEditTool(cwd), write: createWriteTool(cwd), grep: createGrepTool(cwd), diff --git a/packages/coding-agent/src/index.ts b/packages/coding-agent/src/index.ts index ed278e6..4e42eaf 100644 --- a/packages/coding-agent/src/index.ts +++ b/packages/coding-agent/src/index.ts @@ -182,6 +182,7 @@ export { createAgentSession, createBashTool, createBrowserTool, + createComputerTool, // Tool factories (for custom cwd) createCodingTools, createEditTool, @@ -253,6 +254,13 @@ export { type BrowserToolInput, type BrowserToolOptions, browserTool, + type ComputerObservationMode, + type ComputerOperations, + type ComputerToolAction, + type ComputerToolDetails, + type ComputerToolInput, + type ComputerToolOptions, + computerTool, codingTools, defaultCodingToolNames, DEFAULT_MAX_BYTES, diff --git a/packages/coding-agent/test/computer-tool.test.ts b/packages/coding-agent/test/computer-tool.test.ts new file mode 100644 index 0000000..446c064 --- /dev/null +++ b/packages/coding-agent/test/computer-tool.test.ts @@ -0,0 +1,339 @@ +import { spawnSync } from "node:child_process"; +import { + chmodSync, + existsSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { parseArgs } from "../src/cli/args.js"; +import { buildSystemPrompt } from "../src/core/system-prompt.js"; +import { + type ComputerOperations, + type ComputerToolDetails, + createAllTools, + createComputerTool, + defaultCodingToolNames, +} from "../src/core/tools/index.js"; + +interface TextBlock { + type: "text"; + text: string; +} + +type ToolContentBlock = TextBlock | { type: string }; + +interface ToolResultLike { + content: ToolContentBlock[]; + details?: unknown; +} + +interface ComputerExecCall { + command: string; + args: string[]; + cwd: string; + env: NodeJS.ProcessEnv; + timeout?: number; +} + +function getTextOutput(result: ToolResultLike): string { + return result.content + .filter((block): block is TextBlock => block.type === "text") + .map((block) => block.text) + .join("\n"); +} + +function createMockComputerOperations( + output = "", + exitCode: number | null = 0, +): { + calls: ComputerExecCall[]; + operations: ComputerOperations; +} { + const calls: ComputerExecCall[] = []; + + return { + calls, + operations: { + exec: async (command, args, options) => { + calls.push({ + command, + args, + cwd: options.cwd, + env: options.env, + timeout: options.timeout, + }); + if (output.length > 0) { + options.onData(Buffer.from(output, "utf-8")); + } + return { exitCode }; + }, + }, + }; +} + +function getAgentComputerScriptPath(): string { + return resolve( + process.cwd(), + "../../../../docker/companion/agent-computer.js", + ); +} + +describe("computer tool", () => { + const tempDirs: string[] = []; + + afterEach(() => { + while (tempDirs.length > 0) { + const tempDir = tempDirs.pop(); + if (tempDir) { + rmSync(tempDir, { recursive: true, force: true }); + } + } + }); + + function createTempDir(prefix: string): string { + const tempDir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(tempDir); + return tempDir; + } + + it("observes the desktop through the agent-computer helper", async () => { + const cwd = createTempDir("coding-agent-computer-observe-"); + const stateDir = join(cwd, "computer-state"); + const { calls, operations } = createMockComputerOperations( + JSON.stringify({ + ok: true, + action: "observe", + summary: "Captured desktop snapshot snap-1", + snapshot: { + snapshotId: "snap-1", + screenshotPath: "/tmp/snap-1.png", + backend: "hybrid", + activeWindow: null, + windows: [], + refs: [], + }, + }), + ); + + const computerTool = createComputerTool(cwd, { + operations, + command: "agent-computer-test", + stateDir, + }); + + const result = (await computerTool.execute("computer-observe", { + action: "observe", + })) as ToolResultLike; + + expect(calls).toHaveLength(1); + expect(calls[0]).toMatchObject({ + command: "agent-computer-test", + args: ["--state-dir", stateDir, "--input", '{"action":"observe"}'], + cwd, + timeout: 90, + }); + + const details = result.details as ComputerToolDetails | undefined; + expect(details?.stateDir).toBe(stateDir); + expect(details?.snapshotId).toBe("snap-1"); + expect(details?.screenshotPath).toBe("/tmp/snap-1.png"); + expect(getTextOutput(result)).toContain('"snapshotId": "snap-1"'); + }); + + it("validates click targets before spawning the helper", async () => { + const cwd = createTempDir("coding-agent-computer-click-"); + const stateDir = join(cwd, "computer-state"); + const { calls, operations } = createMockComputerOperations(); + + const computerTool = createComputerTool(cwd, { + operations, + stateDir, + }); + + await expect( + computerTool.execute("computer-click-missing-target", { + action: "click", + }), + ).rejects.toThrow( + "computer click requires snapshotId and ref, or explicit x and y coordinates", + ); + + expect(calls).toHaveLength(0); + }); + + it("rejects unsafe snapshot ids before spawning the helper", async () => { + const cwd = createTempDir("coding-agent-computer-snapshot-id-"); + const stateDir = join(cwd, "computer-state"); + const { calls, operations } = createMockComputerOperations(); + + const computerTool = createComputerTool(cwd, { + operations, + stateDir, + }); + + await expect( + computerTool.execute("computer-click-invalid-snapshot", { + action: "click", + snapshotId: "../../auth", + ref: "w1", + }), + ).rejects.toThrow('Invalid computer snapshotId: "../../auth"'); + + expect(calls).toHaveLength(0); + }); + + it("accepts computer in --tools and exposes it in built-in tool wiring", () => { + const parsed = parseArgs(["--tools", "computer,read"]); + expect(parsed.tools).toEqual(["computer", "read"]); + + expect(defaultCodingToolNames).toContain("computer"); + expect(createAllTools(process.cwd()).computer.name).toBe("computer"); + }); + + it("mentions computer in the default system prompt", () => { + const prompt = buildSystemPrompt(); + + expect(prompt).toContain( + "- computer: Use the desktop computer: observe the screen", + ); + expect(prompt).toContain( + "Computer: observe before interacting. Use it for native UI", + ); + expect(prompt).toContain( + "Prefer browser for websites and DOM-aware tasks. Switch to computer", + ); + }); + + it("rejects accessibility observe mode until a non-screenshot backend exists", () => { + const stateDir = createTempDir( + "coding-agent-computer-helper-accessibility-", + ); + const result = spawnSync( + process.execPath, + [ + "--no-warnings", + getAgentComputerScriptPath(), + "--state-dir", + stateDir, + "--input", + JSON.stringify({ + action: "observe", + mode: "accessibility", + }), + ], + { + encoding: "utf8", + }, + ); + + expect(result.status).not.toBe(0); + expect(result.stderr).toContain( + "backend_unavailable: accessibility observe mode is not implemented", + ); + }); + + it("refuses to shell out when app_open cannot match an installed app", () => { + const stateDir = createTempDir("coding-agent-computer-helper-app-open-"); + const markerPath = join(stateDir, "should-not-exist"); + const result = spawnSync( + process.execPath, + [ + "--no-warnings", + getAgentComputerScriptPath(), + "--state-dir", + stateDir, + "--input", + JSON.stringify({ + action: "app_open", + app: `definitely-not-an-installed-app && touch ${markerPath}`, + }), + ], + { + encoding: "utf8", + }, + ); + + expect(result.status).not.toBe(0); + expect(result.stderr).toContain("app_not_found:"); + expect(existsSync(markerPath)).toBe(false); + }); + + it("rejects snapshot path traversal inside the helper", () => { + const stateDir = createTempDir("coding-agent-computer-helper-snapshot-id-"); + const result = spawnSync( + process.execPath, + [ + "--no-warnings", + getAgentComputerScriptPath(), + "--state-dir", + stateDir, + "--input", + JSON.stringify({ + action: "click", + snapshotId: "../../auth", + ref: "w1", + }), + ], + { + encoding: "utf8", + }, + ); + + expect(result.status).not.toBe(0); + expect(result.stderr).toContain("invalid_snapshot_id: ../../auth"); + }); + + it("passes typed text after the xdotool option separator", () => { + const stateDir = createTempDir("coding-agent-computer-helper-type-"); + const binDir = createTempDir("coding-agent-computer-helper-bin-"); + const argsPath = join(stateDir, "xdotool-args.json"); + const xdotoolPath = join(binDir, "xdotool"); + writeFileSync( + xdotoolPath, + `#!/usr/bin/env node +const { writeFileSync } = require("node:fs"); +writeFileSync(process.env.TEST_XDOTOOL_ARGS_PATH, JSON.stringify(process.argv.slice(2))); +`, + "utf8", + ); + chmodSync(xdotoolPath, 0o755); + + const result = spawnSync( + process.execPath, + [ + "--no-warnings", + getAgentComputerScriptPath(), + "--state-dir", + stateDir, + "--input", + JSON.stringify({ + action: "type", + text: "--delay", + }), + ], + { + encoding: "utf8", + env: { + ...process.env, + PATH: `${binDir}:${process.env.PATH ?? ""}`, + TEST_XDOTOOL_ARGS_PATH: argsPath, + }, + }, + ); + + expect(result.status).toBe(0); + expect(JSON.parse(readFileSync(argsPath, "utf8"))).toEqual([ + "type", + "--delay", + "12", + "--clearmodifiers", + "--", + "--delay", + ]); + }); +}); diff --git a/packages/companion-teams/src/adapters/tmux-adapter.test.ts b/packages/companion-teams/src/adapters/tmux-adapter.test.ts index 82ea26c..f7b6d6e 100644 --- a/packages/companion-teams/src/adapters/tmux-adapter.test.ts +++ b/packages/companion-teams/src/adapters/tmux-adapter.test.ts @@ -4,11 +4,10 @@ import { TmuxAdapter } from "./tmux-adapter"; describe("TmuxAdapter", () => { let adapter: TmuxAdapter; - let mockExecCommand: ReturnType; beforeEach(() => { adapter = new TmuxAdapter(); - mockExecCommand = vi.spyOn(terminalAdapter, "execCommand"); + vi.spyOn(terminalAdapter, "execCommand"); delete process.env.TMUX; delete process.env.ZELLIJ; delete process.env.WEZTERM_PANE; @@ -21,6 +20,7 @@ describe("TmuxAdapter", () => { }); it("detects tmux in headless runtimes when the binary is available", () => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); mockExecCommand.mockReturnValue({ stdout: "tmux 3.4", stderr: "", @@ -33,6 +33,7 @@ describe("TmuxAdapter", () => { it("does not detect tmux in GUI terminals just because the binary exists", () => { process.env.COLORTERM = "truecolor"; + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); mockExecCommand.mockReturnValue({ stdout: "tmux 3.4", stderr: "", @@ -44,7 +45,8 @@ describe("TmuxAdapter", () => { }); it("creates a detached team session when not already inside tmux", () => { - mockExecCommand.mockImplementation((_bin: string, args: string[]) => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); + mockExecCommand.mockImplementation((_bin, args) => { if (args[0] === "has-session") { return { stdout: "", stderr: "missing", status: 1 }; } @@ -65,12 +67,18 @@ describe("TmuxAdapter", () => { expect(mockExecCommand).toHaveBeenCalledWith( "tmux", - expect.arrayContaining(["new-session", "-d", "-s", "companion-teams-demo"]), + expect.arrayContaining([ + "new-session", + "-d", + "-s", + "companion-teams-demo", + ]), ); }); it("splits an existing detached session when not already inside tmux", () => { - mockExecCommand.mockImplementation((_bin: string, args: string[]) => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); + mockExecCommand.mockImplementation((_bin, args) => { if (args[0] === "has-session") { return { stdout: "", stderr: "", status: 0 }; } @@ -96,6 +104,7 @@ describe("TmuxAdapter", () => { }); it("checks pane liveness by pane id", () => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); mockExecCommand.mockReturnValue({ stdout: "%1\n%7\n", stderr: "", diff --git a/packages/companion-teams/src/adapters/wezterm-adapter.test.ts b/packages/companion-teams/src/adapters/wezterm-adapter.test.ts index af8bf62..7db2fa4 100644 --- a/packages/companion-teams/src/adapters/wezterm-adapter.test.ts +++ b/packages/companion-teams/src/adapters/wezterm-adapter.test.ts @@ -8,11 +8,10 @@ import { WezTermAdapter } from "./wezterm-adapter"; describe("WezTermAdapter", () => { let adapter: WezTermAdapter; - let mockExecCommand: ReturnType; beforeEach(() => { adapter = new WezTermAdapter(); - mockExecCommand = vi.spyOn(terminalAdapter, "execCommand"); + vi.spyOn(terminalAdapter, "execCommand"); delete process.env.WEZTERM_PANE; delete process.env.TMUX; delete process.env.ZELLIJ; @@ -31,6 +30,7 @@ describe("WezTermAdapter", () => { describe("detect", () => { it("should detect when WEZTERM_PANE is set", () => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); mockExecCommand.mockReturnValue({ stdout: "version 1.0", stderr: "", @@ -43,7 +43,8 @@ describe("WezTermAdapter", () => { describe("spawn", () => { it("should spawn first pane to the right with 50%", () => { // Mock getPanes finding only current pane - mockExecCommand.mockImplementation((_bin: string, args: string[]) => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); + mockExecCommand.mockImplementation((_bin, args) => { if (args.includes("list")) { return { stdout: JSON.stringify([{ pane_id: 0, tab_id: 0 }]), @@ -79,7 +80,8 @@ describe("WezTermAdapter", () => { it("should spawn subsequent panes by splitting the sidebar", () => { // Mock getPanes finding current pane (0) and sidebar pane (1) - mockExecCommand.mockImplementation((_bin: string, args: string[]) => { + const mockExecCommand = vi.mocked(terminalAdapter.execCommand); + mockExecCommand.mockImplementation((_bin, args) => { if (args.includes("list")) { return { stdout: JSON.stringify([