mirror of
https://github.com/harivansh-afk/clanker-agent.git
synced 2026-04-15 03:00:44 +00:00
computer use tool
This commit is contained in:
parent
3919bbf708
commit
e1bba1c1a5
9 changed files with 911 additions and 17 deletions
|
|
@ -325,6 +325,7 @@ ${chalk.bold("Environment Variables:")}
|
|||
AWS_REGION - AWS region for Amazon Bedrock (e.g., us-east-1)
|
||||
${ENV_AGENT_DIR.padEnd(32)} - Session storage directory (default: ~/${CONFIG_DIR_NAME}/agent)
|
||||
COMPANION_PACKAGE_DIR - Override package directory (for Nix/Guix store paths)
|
||||
COMPANION_AGENT_COMPUTER_COMMAND - Override the computer helper command (default: agent-computer)
|
||||
COMPANION_OFFLINE - Disable startup network operations when set to 1/true/yes
|
||||
COMPANION_SHARE_VIEWER_URL - Base URL for /share command (default: https://companion.dev/session/)
|
||||
COMPANION_AI_ANTIGRAVITY_VERSION - Override Antigravity User-Agent version (e.g., 1.23.0)
|
||||
|
|
@ -333,6 +334,7 @@ ${chalk.bold(`Available Tools (default: ${defaultToolsText}):`)}
|
|||
read - Read file contents
|
||||
bash - Execute bash commands
|
||||
browser - Browser automation with persistent state
|
||||
computer - Desktop computer automation with screen observation and native UI control
|
||||
edit - Edit files with find/replace
|
||||
write - Write files (creates/overwrites)
|
||||
grep - Search file contents (read-only, off by default)
|
||||
|
|
|
|||
|
|
@ -26,10 +26,12 @@ import {
|
|||
allTools,
|
||||
bashTool,
|
||||
browserTool,
|
||||
computerTool,
|
||||
codingTools,
|
||||
defaultCodingToolNames,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
createComputerTool,
|
||||
createCodingTools,
|
||||
createEditTool,
|
||||
createFindTool,
|
||||
|
|
@ -67,7 +69,7 @@ export interface CreateAgentSessionOptions {
|
|||
/** Models available for cycling (Ctrl+P in interactive mode) */
|
||||
scopedModels?: Array<{ model: Model<any>; thinkingLevel?: ThinkingLevel }>;
|
||||
|
||||
/** Built-in tools to use. Default: codingTools [read, bash, browser, computer, edit, write] */
|
||||
tools?: Tool[];
|
||||
/** Custom tools to register (in addition to built-in tools). */
|
||||
customTools?: ToolDefinition[];
|
||||
|
|
@ -113,6 +115,7 @@ export {
|
|||
readTool,
|
||||
bashTool,
|
||||
browserTool,
|
||||
computerTool,
|
||||
editTool,
|
||||
writeTool,
|
||||
grepTool,
|
||||
|
|
@ -127,6 +130,7 @@ export {
|
|||
createReadTool,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
createComputerTool,
|
||||
createEditTool,
|
||||
createWriteTool,
|
||||
createGrepTool,
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ const toolDescriptions: Record<string, string> = {
|
|||
bash: "Run shell commands",
|
||||
browser:
|
||||
"Browse the web: open, snapshot, click, fill, wait, screenshot, save/load state",
|
||||
computer:
|
||||
"Use the desktop computer: observe the screen, click, type, send hotkeys, manage apps/windows, wait for native UI, and read/write the clipboard",
|
||||
edit: "Surgical file edits (find exact text, replace it)",
|
||||
write: "Create new files or completely rewrite existing ones",
|
||||
grep: "Search file contents by regex (respects .gitignore)",
|
||||
|
|
@ -167,6 +169,7 @@ export function buildSystemPrompt(
|
|||
|
||||
const hasBash = tools.includes("bash");
|
||||
const hasBrowser = tools.includes("browser");
|
||||
const hasComputer = tools.includes("computer");
|
||||
const hasEdit = tools.includes("edit");
|
||||
const hasWrite = tools.includes("write");
|
||||
const hasGrep = tools.includes("grep");
|
||||
|
|
@ -215,6 +218,16 @@ export function buildSystemPrompt(
|
|||
"Browser: snapshot before interacting with elements. Use it for research and learning too, not just automation",
|
||||
);
|
||||
}
|
||||
if (hasComputer) {
|
||||
addGuideline(
|
||||
"Computer: observe before interacting. Use it for native UI, desktop apps, file pickers, downloads, and OS dialogs",
|
||||
);
|
||||
}
|
||||
if (hasBrowser && hasComputer) {
|
||||
addGuideline(
|
||||
"Prefer browser for websites and DOM-aware tasks. Switch to computer when native UI or desktop state matters",
|
||||
);
|
||||
}
|
||||
|
||||
// Output hygiene
|
||||
if (hasEdit || hasWrite) {
|
||||
|
|
|
|||
666
packages/coding-agent/src/core/tools/computer.ts
Normal file
666
packages/coding-agent/src/core/tools/computer.ts
Normal file
|
|
@ -0,0 +1,666 @@
|
|||
import { spawn } from "node:child_process";
|
||||
import { mkdirSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AgentTool } from "@mariozechner/companion-agent-core";
|
||||
import { type Static, Type } from "@sinclair/typebox";
|
||||
import { getAgentDir } from "../../config.js";
|
||||
import {
|
||||
getShellEnv,
|
||||
killProcessTree,
|
||||
sanitizeBinaryOutput,
|
||||
} from "../../utils/shell.js";
|
||||
|
||||
/**
 * Every action the computer tool accepts. The tool schema and the
 * `ComputerToolAction` type are both derived from this list.
 */
const computerActions = [
  "observe",
  "click",
  "type",
  "hotkey",
  "scroll",
  "drag",
  "wait",
  "app_list",
  "app_open",
  "app_focus",
  "window_list",
  "window_focus",
  "window_move",
  "window_resize",
  "window_close",
  "clipboard_read",
  "clipboard_write",
] as const;

/** Values accepted by the optional `mode` input of the tool schema. */
const computerObservationModes = ["hybrid", "ocr", "accessibility"] as const;

/**
 * Helper executable used when no `command` option is supplied. Read once at
 * module load; an unset or empty COMPANION_AGENT_COMPUTER_COMMAND falls back
 * to "agent-computer".
 */
const DEFAULT_COMPUTER_COMMAND =
  process.env.COMPANION_AGENT_COMPUTER_COMMAND || "agent-computer";

/** Per-call timeout, in seconds, used when `defaultTimeoutSeconds` is not supplied. */
const DEFAULT_COMPUTER_TIMEOUT_SECONDS = 90;
|
||||
|
||||
/**
 * TypeBox schema for the computer tool's input. Only `action` is required at
 * the schema level; which optional fields each action actually needs is
 * enforced separately by `validateComputerInput`.
 */
const computerSchema = Type.Object({
  action: Type.Union(
    computerActions.map((action) => Type.Literal(action)),
    { description: "Computer action to execute" },
  ),

  // Targeting: a ref from an earlier observe snapshot, or raw coordinates.
  snapshotId: Type.Optional(
    Type.String({ description: "Snapshot ID returned from observe" }),
  ),
  ref: Type.Optional(
    Type.String({
      description:
        "Target ref from observe output, such as w1 for a window or t3 for OCR text",
    }),
  ),
  x: Type.Optional(Type.Number({ description: "Target x coordinate" })),
  y: Type.Optional(Type.Number({ description: "Target y coordinate" })),

  // Drag destination.
  toRef: Type.Optional(
    Type.String({ description: "Destination ref for drag actions" }),
  ),
  toX: Type.Optional(
    Type.Number({ description: "Destination x coordinate for drag actions" }),
  ),
  toY: Type.Optional(
    Type.Number({ description: "Destination y coordinate for drag actions" }),
  ),

  // Keyboard and clipboard payloads.
  text: Type.Optional(
    Type.String({
      description:
        "Text to type, text to wait for, or clipboard contents depending on action",
    }),
  ),
  keys: Type.Optional(
    Type.Array(Type.String(), {
      description: "Hotkey chord or key sequence, for example ['ctrl', 'l']",
      minItems: 1,
    }),
  ),

  // App and window selection.
  app: Type.Optional(
    Type.String({
      description:
        "Installed app or running app name/class for app_open, app_focus, and wait",
    }),
  ),
  windowId: Type.Optional(
    Type.String({ description: "Window ID, such as 0x04200007" }),
  ),
  windowTitle: Type.Optional(
    Type.String({ description: "Window title substring to match" }),
  ),

  // Action-specific tuning.
  mode: Type.Optional(
    Type.Union(
      computerObservationModes.map((mode) => Type.Literal(mode)),
      { description: "Observation mode. Defaults to hybrid." },
    ),
  ),
  amount: Type.Optional(
    Type.Number({
      description:
        "Scroll amount in wheel steps. Positive scrolls down/right, negative scrolls up/left.",
    }),
  ),
  width: Type.Optional(
    Type.Number({ description: "Target window width for resize actions" }),
  ),
  height: Type.Optional(
    Type.Number({ description: "Target window height for resize actions" }),
  ),
  clear: Type.Optional(
    Type.Boolean({
      description: "Clear the active input field before typing",
    }),
  ),
  button: Type.Optional(
    Type.Number({
      description: "Mouse button for click or drag. Defaults to 1.",
      minimum: 1,
      maximum: 7,
    }),
  ),
  timeoutMs: Type.Optional(
    Type.Number({
      description: "Wait timeout in milliseconds for observe-derived waits",
      minimum: 0,
    }),
  ),
  intervalMs: Type.Optional(
    Type.Number({
      description: "Polling interval for wait actions in milliseconds",
      minimum: 10,
    }),
  ),
});
|
||||
|
||||
/** Union of the literal action names listed in `computerActions`. */
export type ComputerToolAction = (typeof computerActions)[number];

/** Union of the literal mode names listed in `computerObservationModes`. */
export type ComputerObservationMode = (typeof computerObservationModes)[number];

/** Static TypeScript shape of a value matching `computerSchema`. */
export type ComputerToolInput = Static<typeof computerSchema>;
|
||||
|
||||
/** Structured details attached to every progress update and result of the computer tool. */
export interface ComputerToolDetails {
  /** Action that was requested. */
  action: ComputerToolAction;
  /** Helper executable that was invoked. */
  command: string;
  /** Arguments passed to the helper executable. */
  args: string[];
  /** Resolved state directory passed to the helper via `--state-dir`. */
  stateDir: string;
  /** Snapshot ID found in the helper's JSON output, when present. */
  snapshotId?: string;
  /** Screenshot path found in the helper's JSON output, when present. */
  screenshotPath?: string;
}
|
||||
|
||||
/**
 * Pluggable process runner for the computer helper. Supplying a custom
 * implementation lets callers (for example tests) avoid spawning a real
 * process.
 */
export interface ComputerOperations {
  /**
   * Run `command` with `args` and resolve with its exit code.
   *
   * `createComputerTool` recognises three rejection shapes: an error whose
   * `code` is "ENOENT" (helper missing), an error whose message is exactly
   * "aborted", and an error whose message starts with "timeout:".
   */
  exec: (
    command: string,
    args: string[],
    options: {
      /** Working directory for the helper process. */
      cwd: string;
      /** Environment for the helper process. */
      env: NodeJS.ProcessEnv;
      /** Receives output chunks; the default runner forwards both stdout and stderr here. */
      onData: (data: Buffer) => void;
      /** Cancels the run when aborted. */
      signal?: AbortSignal;
      /** Time limit; the default runner interprets it as seconds. */
      timeout?: number;
    },
  ) => Promise<{ exitCode: number | null }>;
}
|
||||
|
||||
/**
 * Default runner: spawns the helper as a child process, streams stdout and
 * stderr to `onData`, and settles when the process closes.
 *
 * Rejections:
 * - the spawn error itself (for example `code === "ENOENT"`),
 * - `Error("aborted")` when `signal` is (or becomes) aborted,
 * - `Error("timeout:<seconds>")` when the time limit elapses.
 *
 * `timeout` is in seconds; `undefined` or a non-positive value disables it.
 */
const defaultComputerOperations: ComputerOperations = {
  exec: (command, args, { cwd, env, onData, signal, timeout }) => {
    return new Promise((resolvePromise, rejectPromise) => {
      // A call cancelled before it started must not launch the helper at all:
      // spawning first and killing afterwards could let a desktop action run.
      if (signal?.aborted) {
        rejectPromise(new Error("aborted"));
        return;
      }

      const child = spawn(command, args, {
        cwd,
        // NOTE(review): presumably detached so killProcessTree can take down
        // the helper together with its descendants — confirm in utils/shell.
        detached: true,
        env,
        stdio: ["ignore", "pipe", "pipe"],
      });

      const killChild = (): void => {
        if (child.pid) {
          killProcessTree(child.pid);
        }
      };

      let timedOut = false;
      let timeoutHandle: NodeJS.Timeout | undefined;
      if (timeout !== undefined && timeout > 0) {
        timeoutHandle = setTimeout(() => {
          // Only flag and kill here; the "close" handler performs the rejection.
          timedOut = true;
          killChild();
        }, timeout * 1000);
      }

      child.stdout?.on("data", onData);
      child.stderr?.on("data", onData);

      signal?.addEventListener("abort", killChild, { once: true });

      const cleanup = (): void => {
        if (timeoutHandle) clearTimeout(timeoutHandle);
        signal?.removeEventListener("abort", killChild);
      };

      child.on("error", (error) => {
        cleanup();
        rejectPromise(error);
      });

      child.on("close", (code) => {
        cleanup();

        // Abort takes precedence over timeout when both happened.
        if (signal?.aborted) {
          rejectPromise(new Error("aborted"));
          return;
        }

        if (timedOut) {
          rejectPromise(new Error(`timeout:${timeout}`));
          return;
        }

        resolvePromise({ exitCode: code });
      });
    });
  },
};
|
||||
|
||||
/** Optional overrides accepted by `createComputerTool`. */
export interface ComputerToolOptions {
  /** Process runner to use instead of the default spawn-based one. */
  operations?: ComputerOperations;
  /** Helper executable to run instead of the module default. */
  command?: string;
  /** Time limit passed to `operations.exec` instead of the module default. */
  defaultTimeoutSeconds?: number;
  /** State directory; resolved against the tool's cwd. Takes precedence over `agentDir`. */
  stateDir?: string;
  /** Base directory under which a "computer" state directory is used when `stateDir` is not set. */
  agentDir?: string;
}
|
||||
|
||||
/** Everything `createComputerTool` needs to run and report on one helper invocation. */
interface ComputerCommandContext {
  /** Action being executed. */
  action: ComputerToolAction;
  /** Arguments for the helper executable. */
  args: string[];
  /** Progress text emitted before the helper runs. */
  statusMessage: string;
  /** Result text used when the helper produces no output. */
  successMessage: string;
  /** Resolved state directory passed to the helper. */
  stateDir: string;
}
|
||||
|
||||
/**
 * Resolve `inputPath` against `cwd`.
 *
 * @returns An absolute path; an `inputPath` that is already absolute is
 * returned normalised and `cwd` is ignored.
 */
function resolveCommandPath(cwd: string, inputPath: string): string {
  return resolve(cwd, inputPath);
}
|
||||
|
||||
/**
 * Default state directory for the computer tool: the "computer" folder inside
 * `options.agentDir`, or inside the configured agent directory when no
 * override is given.
 */
function getComputerRootDir(options?: ComputerToolOptions): string {
  const baseAgentDir = options?.agentDir ?? getAgentDir();
  return join(baseAgentDir, "computer");
}
|
||||
|
||||
/**
 * Pick the state directory for a tool instance: an explicit
 * `options.stateDir` wins, otherwise the default computer root is used.
 * The result is resolved against `cwd`, so a relative `stateDir` lands under
 * the working directory.
 */
function getComputerStateDir(
  cwd: string,
  options?: ComputerToolOptions,
): string {
  const stateDir = options?.stateDir ?? getComputerRootDir(options);
  return resolveCommandPath(cwd, stateDir);
}
|
||||
|
||||
/** Create `stateDir` and any missing parents; a no-op when it already exists. */
function ensureComputerDir(stateDir: string): void {
  mkdirSync(stateDir, { recursive: true });
}
|
||||
|
||||
/**
 * Join the captured output chunks, decode them as UTF-8, pass the text
 * through `sanitizeBinaryOutput`, and trim surrounding whitespace.
 */
function normalizeOutput(chunks: Buffer[]): string {
  const decoded = Buffer.concat(chunks).toString("utf-8");
  return sanitizeBinaryOutput(decoded).trim();
}
|
||||
|
||||
/** True when both `x` and `y` are supplied (0 counts as supplied). */
function hasCoordinateTarget(input: ComputerToolInput): boolean {
  return input.x !== undefined && input.y !== undefined;
}
|
||||
|
||||
/** True when a `ref` is supplied together with the `snapshotId` it belongs to. */
function hasRefTarget(input: ComputerToolInput): boolean {
  return input.snapshotId !== undefined && input.ref !== undefined;
}
|
||||
|
||||
/** True when the window is identified by either `windowId` or `windowTitle`. */
function hasWindowTarget(input: ComputerToolInput): boolean {
  return input.windowId !== undefined || input.windowTitle !== undefined;
}
|
||||
|
||||
/** True when a drag destination is given as `toRef`, or as both `toX` and `toY`. */
function hasDragDestination(input: ComputerToolInput): boolean {
  if (input.toRef !== undefined) {
    return true;
  }
  return input.toX !== undefined && input.toY !== undefined;
}
|
||||
|
||||
/**
 * Check the condition of a `wait` action.
 *
 * A wait may name at most one of `ref`, `text`, `app`, `windowId`,
 * `windowTitle`. With no condition at all, `timeoutMs` must be given.
 *
 * @throws Error when no condition and no `timeoutMs` is given, or when more
 * than one condition is given.
 */
function validateWaitInput(input: ComputerToolInput): void {
  const conditions = [
    input.ref,
    input.text,
    input.app,
    input.windowId,
    input.windowTitle,
  ];
  const targetCount = conditions.filter(
    (condition) => condition !== undefined,
  ).length;

  if (targetCount === 0 && input.timeoutMs === undefined) {
    throw new Error(
      "computer wait requires one of ref, text, app, windowId, windowTitle, or timeoutMs",
    );
  }

  if (targetCount > 1) {
    throw new Error(
      "computer wait requires exactly one of ref, text, app, windowId, or windowTitle",
    );
  }
}
|
||||
|
||||
/**
 * Enforce the per-action requirements that the schema cannot express (the
 * schema marks every field except `action` as optional).
 *
 * @throws Error naming the missing or conflicting fields for the action.
 */
function validateComputerInput(input: ComputerToolInput): void {
  // A ref only makes sense together with the snapshot it came from.
  const requireSnapshotForRef = (): void => {
    if (input.ref !== undefined && input.snapshotId === undefined) {
      throw new Error(`computer ${input.action} with ref requires snapshotId`);
    }
  };

  const requireWindowTarget = (): void => {
    if (!hasWindowTarget(input)) {
      throw new Error(
        `computer ${input.action} requires windowId or windowTitle`,
      );
    }
  };

  switch (input.action) {
    // These actions need no arguments.
    case "observe":
    case "app_list":
    case "window_list":
    case "clipboard_read":
      return;

    case "click":
      if (!hasRefTarget(input) && !hasCoordinateTarget(input)) {
        throw new Error(
          "computer click requires snapshotId and ref, or explicit x and y coordinates",
        );
      }
      return;

    case "type":
      if (input.text === undefined) {
        throw new Error("computer type requires text");
      }
      requireSnapshotForRef();
      return;

    case "hotkey":
      if (!input.keys || input.keys.length === 0) {
        throw new Error("computer hotkey requires keys");
      }
      return;

    case "scroll":
      if (input.amount === undefined || input.amount === 0) {
        throw new Error("computer scroll requires a non-zero amount");
      }
      requireSnapshotForRef();
      return;

    case "drag":
      if (!hasRefTarget(input) && !hasCoordinateTarget(input)) {
        throw new Error(
          "computer drag requires a starting target via snapshotId and ref, or x and y coordinates",
        );
      }
      if (!hasDragDestination(input)) {
        throw new Error(
          "computer drag requires a destination via toRef, or explicit toX and toY coordinates",
        );
      }
      if (input.toRef !== undefined && input.snapshotId === undefined) {
        throw new Error("computer drag with toRef requires snapshotId");
      }
      return;

    case "wait":
      validateWaitInput(input);
      requireSnapshotForRef();
      return;

    case "app_open":
    case "app_focus":
      // An empty app name is rejected as well as a missing one.
      if (!input.app) {
        throw new Error(`computer ${input.action} requires app`);
      }
      return;

    case "window_focus":
    case "window_close":
      requireWindowTarget();
      return;

    case "window_move":
      requireWindowTarget();
      if (input.x === undefined || input.y === undefined) {
        throw new Error("computer window_move requires x and y");
      }
      return;

    case "window_resize":
      requireWindowTarget();
      if (input.width === undefined || input.height === undefined) {
        throw new Error("computer window_resize requires width and height");
      }
      return;

    case "clipboard_write":
      if (input.text === undefined) {
        throw new Error("computer clipboard_write requires text");
      }
      return;

    default: {
      // Compile-time exhaustiveness check; also guards unvalidated runtime input.
      const unsupportedAction: never = input.action;
      throw new Error(`Unsupported computer action: ${unsupportedAction}`);
    }
  }
}
|
||||
|
||||
/**
 * Human-readable progress and fallback-result text for an action.
 *
 * @returns `statusMessage` (shown while the helper runs) and `successMessage`
 * (used as the result text when the helper prints nothing).
 */
function describeAction(input: ComputerToolInput): {
  statusMessage: string;
  successMessage: string;
} {
  // The two app actions are the only ones whose text embeds an input value.
  if (input.action === "app_open") {
    return {
      statusMessage: `Opening app ${input.app}...`,
      successMessage: `Opened app ${input.app}`,
    };
  }
  if (input.action === "app_focus") {
    return {
      statusMessage: `Focusing app ${input.app}...`,
      successMessage: `Focused app ${input.app}`,
    };
  }

  // [statusMessage, successMessage] for every remaining action.
  const fixedMessages: Record<
    Exclude<ComputerToolAction, "app_open" | "app_focus">,
    readonly [string, string]
  > = {
    observe: ["Observing desktop...", "Captured desktop snapshot"],
    click: ["Clicking desktop target...", "Clicked desktop target"],
    type: ["Typing into desktop...", "Typed into desktop"],
    hotkey: ["Sending hotkey...", "Sent hotkey"],
    scroll: ["Scrolling desktop...", "Scrolled desktop"],
    drag: ["Dragging desktop target...", "Dragged desktop target"],
    wait: ["Waiting for desktop state...", "Desktop wait condition satisfied"],
    app_list: ["Listing apps...", "Listed apps"],
    window_list: ["Listing windows...", "Listed windows"],
    window_focus: ["Focusing window...", "Focused window"],
    window_move: ["Moving window...", "Moved window"],
    window_resize: ["Resizing window...", "Resized window"],
    window_close: ["Closing window...", "Closed window"],
    clipboard_read: ["Reading clipboard...", "Read clipboard"],
    clipboard_write: ["Writing clipboard...", "Wrote clipboard"],
  };

  const [statusMessage, successMessage] = fixedMessages[input.action];
  return { statusMessage, successMessage };
}
|
||||
|
||||
/**
 * Validate `input` and assemble everything needed to run the helper for it.
 *
 * Side effect: creates the state directory if it does not exist.
 *
 * @returns The helper arguments (`--state-dir <dir> --input <json>`, where the
 * JSON is the whole input object) plus the messages for the action.
 * @throws Error when the input fails validation; nothing is created in that case.
 */
function buildComputerCommand(
  cwd: string,
  input: ComputerToolInput,
  options?: ComputerToolOptions,
): ComputerCommandContext {
  // Validate first so invalid calls never touch the filesystem.
  validateComputerInput(input);

  const stateDir = getComputerStateDir(cwd, options);
  ensureComputerDir(stateDir);

  const { statusMessage, successMessage } = describeAction(input);

  return {
    action: input.action,
    args: ["--state-dir", stateDir, "--input", JSON.stringify(input)],
    statusMessage,
    successMessage,
    stateDir,
  };
}
|
||||
|
||||
/**
 * Build the error text for a helper run that did not exit with code 0.
 *
 * @param output Captured helper output; when non-empty it is placed first,
 * followed by a blank line and the summary.
 * @param exitCode Exit code, or `null` when the process reported none.
 */
function buildComputerErrorMessage(
  action: ComputerToolAction,
  output: string,
  exitCode: number | null,
): string {
  const summary =
    exitCode === null
      ? `Computer action "${action}" failed`
      : `Computer action "${action}" exited with code ${exitCode}`;

  if (output.length === 0) {
    return summary;
  }
  return `${output}\n\n${summary}`;
}
|
||||
|
||||
/** Multi-line guidance shown when the helper executable `command` cannot be found. */
function getMissingComputerCommandMessage(command: string): string {
  const lines = [
    `Computer tool could not find "${command}".`,
    "Desktop sandboxes install agent-computer alongside the browser tool.",
    "If you are running locally, either install the helper or omit the computer tool.",
    "Recommended setup inside a sandbox image: copy agent-computer into /usr/local/bin and install xdotool, wmctrl, tesseract-ocr, and xclip.",
  ];
  return lines.join("\n");
}
|
||||
|
||||
/**
 * Interpret the helper's output.
 *
 * - Empty output yields empty text.
 * - Output that is not valid JSON (or is the JSON literal `null`) is returned
 *   verbatim as text.
 * - Valid JSON is re-serialised with 2-space indentation as the text, and
 *   `snapshotId` / `screenshotPath` are extracted, preferring the values
 *   nested under `snapshot` over the top-level ones.
 *
 * The helper's output is untrusted: an ID or path is only returned when it is
 * actually a string, so a malformed payload cannot put a non-string value
 * into a field typed as `string`.
 */
function parseComputerPayload(output: string): {
  text: string;
  snapshotId?: string;
  screenshotPath?: string;
} {
  if (output.length === 0) {
    return { text: "" };
  }

  let payload: unknown;
  try {
    payload = JSON.parse(output);
  } catch {
    return { text: output };
  }

  if (payload === null) {
    return { text: output };
  }

  const text = JSON.stringify(payload, null, 2);
  if (typeof payload !== "object") {
    // Bare JSON scalars carry no snapshot information.
    return { text };
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  const firstString = (...candidates: unknown[]): string | undefined =>
    candidates.find(
      (candidate): candidate is string => typeof candidate === "string",
    );

  const root = payload as Record<string, unknown>;
  const snapshot = isRecord(root.snapshot) ? root.snapshot : undefined;

  return {
    text,
    snapshotId: firstString(snapshot?.snapshotId, root.snapshotId),
    screenshotPath: firstString(snapshot?.screenshotPath, root.screenshotPath),
  };
}
|
||||
|
||||
/**
 * Create the `computer` agent tool bound to a working directory.
 *
 * Each call validates the input, emits a progress update, runs the helper
 * executable with `--state-dir` and `--input`, and returns the helper's output
 * (pretty-printed when it is JSON) or a generic success message when the
 * helper printed nothing.
 *
 * @param cwd Working directory for the helper; also the base for a relative `stateDir`.
 * @param options Overrides for the runner, helper command, timeout and state directory.
 * @throws (from `execute`) Error on invalid input, non-zero exit, missing
 * helper executable, abort, or timeout.
 */
export function createComputerTool(
  cwd: string,
  options?: ComputerToolOptions,
): AgentTool<typeof computerSchema> {
  const operations = options?.operations ?? defaultComputerOperations;
  const command = options?.command ?? DEFAULT_COMPUTER_COMMAND;
  const defaultTimeoutSeconds =
    options?.defaultTimeoutSeconds ?? DEFAULT_COMPUTER_TIMEOUT_SECONDS;

  return {
    name: "computer",
    label: "computer",
    description:
      "Use the desktop computer when browser DOM control is not enough: observe the screen, interact with windows and apps, type, click, drag, scroll, wait for native UI changes, and read or write the clipboard.",
    parameters: computerSchema,
    execute: async (_toolCallId, input, signal, onUpdate) => {
      // Validation errors are thrown from here, before anything is executed.
      const commandContext = buildComputerCommand(cwd, input, options);
      const details: ComputerToolDetails = {
        action: commandContext.action,
        command,
        args: commandContext.args,
        stateDir: commandContext.stateDir,
      };

      onUpdate?.({
        content: [{ type: "text", text: commandContext.statusMessage }],
        details,
      });

      const chunks: Buffer[] = [];

      try {
        const { exitCode } = await operations.exec(
          command,
          commandContext.args,
          {
            cwd,
            env: getShellEnv(),
            onData: (data) => chunks.push(data),
            signal,
            timeout: defaultTimeoutSeconds,
          },
        );

        const output = normalizeOutput(chunks);
        if (exitCode !== 0) {
          throw new Error(
            buildComputerErrorMessage(commandContext.action, output, exitCode),
          );
        }

        const parsed = parseComputerPayload(output);
        if (parsed.snapshotId) {
          details.snapshotId = parsed.snapshotId;
        }
        if (parsed.screenshotPath) {
          details.screenshotPath = parsed.screenshotPath;
        }

        const text =
          parsed.text.length > 0 ? parsed.text : commandContext.successMessage;
        return {
          content: [{ type: "text", text }],
          details,
        };
      } catch (error) {
        // Translate the runner's low-level failures into user-facing messages;
        // anything unrecognised (including the non-zero-exit error above) is rethrown.
        if (!(error instanceof Error)) {
          throw error;
        }
        if ("code" in error && error.code === "ENOENT") {
          throw new Error(getMissingComputerCommandMessage(command));
        }
        if (error.message === "aborted") {
          throw new Error(`Computer action "${commandContext.action}" aborted`);
        }
        if (error.message.startsWith("timeout:")) {
          const seconds = error.message.split(":")[1];
          throw new Error(
            `Computer action "${commandContext.action}" timed out after ${seconds} seconds`,
          );
        }
        throw error;
      }
    },
  };
}
|
||||
|
||||
/** Computer tool with default options, bound to the process working directory as of module load. */
export const computerTool = createComputerTool(process.cwd());
|
||||
|
|
@ -19,6 +19,16 @@ export {
|
|||
browserTool,
|
||||
createBrowserTool,
|
||||
} from "./browser.js";
|
||||
export {
|
||||
type ComputerObservationMode,
|
||||
type ComputerOperations,
|
||||
type ComputerToolAction,
|
||||
type ComputerToolDetails,
|
||||
type ComputerToolInput,
|
||||
type ComputerToolOptions,
|
||||
computerTool,
|
||||
createComputerTool,
|
||||
} from "./computer.js";
|
||||
export {
|
||||
createEditTool,
|
||||
type EditOperations,
|
||||
|
|
@ -84,6 +94,11 @@ import {
|
|||
createBrowserTool,
|
||||
type BrowserToolOptions,
|
||||
} from "./browser.js";
|
||||
import {
|
||||
computerTool,
|
||||
createComputerTool,
|
||||
type ComputerToolOptions,
|
||||
} from "./computer.js";
|
||||
import { createEditTool, editTool } from "./edit.js";
|
||||
import { createFindTool, findTool } from "./find.js";
|
||||
import { createGrepTool, grepTool } from "./grep.js";
|
||||
|
|
@ -102,6 +117,7 @@ export const allTools = {
|
|||
read: readTool,
|
||||
bash: bashTool,
|
||||
browser: browserTool,
|
||||
computer: computerTool,
|
||||
edit: editTool,
|
||||
write: writeTool,
|
||||
grep: grepTool,
|
||||
|
|
@ -115,6 +131,7 @@ export const defaultCodingToolNames: ToolName[] = [
|
|||
"read",
|
||||
"bash",
|
||||
"browser",
|
||||
"computer",
|
||||
"edit",
|
||||
"write",
|
||||
];
|
||||
|
|
@ -131,19 +148,16 @@ export interface ToolsOptions {
|
|||
bash?: BashToolOptions;
|
||||
/** Options for the browser tool */
|
||||
browser?: BrowserToolOptions;
|
||||
/** Options for the computer tool */
|
||||
computer?: ComputerToolOptions;
|
||||
}
|
||||
|
||||
/**
 * Create coding tools configured for a specific working directory.
 *
 * Builds every tool for `cwd` and returns the ones named in
 * `defaultCodingToolNames`, in that order.
 */
export function createCodingTools(cwd: string, options?: ToolsOptions): Tool[] {
  const tools = createAllTools(cwd, options);
  return defaultCodingToolNames.map((toolName) => tools[toolName]);
}
|
||||
|
||||
/**
|
||||
|
|
@ -172,6 +186,7 @@ export function createAllTools(
|
|||
read: createReadTool(cwd, options?.read),
|
||||
bash: createBashTool(cwd, options?.bash),
|
||||
browser: createBrowserTool(cwd, options?.browser),
|
||||
computer: createComputerTool(cwd, options?.computer),
|
||||
edit: createEditTool(cwd),
|
||||
write: createWriteTool(cwd),
|
||||
grep: createGrepTool(cwd),
|
||||
|
|
|
|||
|
|
@ -182,6 +182,7 @@ export {
|
|||
createAgentSession,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
createComputerTool,
|
||||
// Tool factories (for custom cwd)
|
||||
createCodingTools,
|
||||
createEditTool,
|
||||
|
|
@ -253,6 +254,13 @@ export {
|
|||
type BrowserToolInput,
|
||||
type BrowserToolOptions,
|
||||
browserTool,
|
||||
type ComputerObservationMode,
|
||||
type ComputerOperations,
|
||||
type ComputerToolAction,
|
||||
type ComputerToolDetails,
|
||||
type ComputerToolInput,
|
||||
type ComputerToolOptions,
|
||||
computerTool,
|
||||
codingTools,
|
||||
defaultCodingToolNames,
|
||||
DEFAULT_MAX_BYTES,
|
||||
|
|
|
|||
175
packages/coding-agent/test/computer-tool.test.ts
Normal file
175
packages/coding-agent/test/computer-tool.test.ts
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
import { mkdtempSync, rmSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { parseArgs } from "../src/cli/args.js";
|
||||
import { buildSystemPrompt } from "../src/core/system-prompt.js";
|
||||
import {
|
||||
type ComputerOperations,
|
||||
type ComputerToolDetails,
|
||||
createAllTools,
|
||||
createComputerTool,
|
||||
defaultCodingToolNames,
|
||||
} from "../src/core/tools/index.js";
|
||||
|
||||
/** A text content block of a tool result. */
interface TextBlock {
  type: "text";
  text: string;
}

/** Any content block; only text blocks are inspected by these tests. */
type ToolContentBlock = TextBlock | { type: string };

/** Minimal shape of a tool result as used by these tests. */
interface ToolResultLike {
  content: ToolContentBlock[];
  details?: unknown;
}

/** One recorded invocation of the mocked `exec`. */
interface ComputerExecCall {
  command: string;
  args: string[];
  cwd: string;
  env: NodeJS.ProcessEnv;
  timeout?: number;
}
|
||||
|
||||
/** Join the text of every text block in `result`, one per line; other block types are ignored. */
function getTextOutput(result: ToolResultLike): string {
  return result.content
    .filter((block): block is TextBlock => block.type === "text")
    .map((block) => block.text)
    .join("\n");
}
|
||||
|
||||
/**
 * Build a fake `ComputerOperations` that never spawns a process.
 *
 * Every `exec` call is recorded in `calls`. When `output` is non-empty it is
 * delivered once through `onData`; the call then resolves with `exitCode`.
 */
function createMockComputerOperations(
  output = "",
  exitCode: number | null = 0,
): {
  calls: ComputerExecCall[];
  operations: ComputerOperations;
} {
  const calls: ComputerExecCall[] = [];

  const operations: ComputerOperations = {
    exec: async (command, args, options) => {
      calls.push({
        command,
        args,
        cwd: options.cwd,
        env: options.env,
        timeout: options.timeout,
      });
      if (output.length > 0) {
        options.onData(Buffer.from(output, "utf-8"));
      }
      return { exitCode };
    },
  };

  return { calls, operations };
}
|
||||
|
||||
describe("computer tool", () => {
|
||||
const tempDirs: string[] = [];
|
||||
|
||||
afterEach(() => {
|
||||
while (tempDirs.length > 0) {
|
||||
const tempDir = tempDirs.pop();
|
||||
if (tempDir) {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
function createTempDir(prefix: string): string {
|
||||
const tempDir = mkdtempSync(join(tmpdir(), prefix));
|
||||
tempDirs.push(tempDir);
|
||||
return tempDir;
|
||||
}
|
||||
|
||||
it("observes the desktop through the agent-computer helper", async () => {
  const workDir = createTempDir("coding-agent-computer-observe-");
  const stateDir = join(workDir, "computer-state");

  // Canned helper response handed to the mock operations (presumably replayed
  // to the tool through onData — see createMockComputerOperations).
  const helperResponse = {
    ok: true,
    action: "observe",
    summary: "Captured desktop snapshot snap-1",
    snapshot: {
      snapshotId: "snap-1",
      screenshotPath: "/tmp/snap-1.png",
      backend: "hybrid",
      activeWindow: null,
      windows: [],
      refs: [],
    },
  };
  const { calls, operations } = createMockComputerOperations(
    JSON.stringify(helperResponse),
  );

  const tool = createComputerTool(workDir, {
    operations,
    command: "agent-computer-test",
    stateDir,
  });

  const observeInput = { action: "observe" };
  const result = (await tool.execute(
    "computer-observe",
    observeInput,
  )) as ToolResultLike;

  // Exactly one helper invocation, carrying the configured command, the
  // state directory, and the serialized tool input.
  expect(calls).toHaveLength(1);
  const [firstCall] = calls;
  expect(firstCall).toMatchObject({
    command: "agent-computer-test",
    args: ["--state-dir", stateDir, "--input", '{"action":"observe"}'],
    cwd: workDir,
    timeout: 90,
  });

  // Snapshot metadata must surface in the structured details and in the text output.
  const details = result.details as ComputerToolDetails | undefined;
  expect(details?.stateDir).toBe(stateDir);
  expect(details?.snapshotId).toBe("snap-1");
  expect(details?.screenshotPath).toBe("/tmp/snap-1.png");

  const textOutput = getTextOutput(result);
  expect(textOutput).toContain('"snapshotId": "snap-1"');
});
|
||||
|
||||
it("validates click targets before spawning the helper", async () => {
  const workDir = createTempDir("coding-agent-computer-click-");
  const stateDir = join(workDir, "computer-state");
  const mock = createMockComputerOperations();

  const tool = createComputerTool(workDir, {
    operations: mock.operations,
    stateDir,
  });

  // A click with neither a snapshot ref nor coordinates must be rejected.
  const expectedMessage =
    "computer click requires snapshotId and ref, or explicit x and y coordinates";
  await expect(
    tool.execute("computer-click-missing-target", { action: "click" }),
  ).rejects.toThrow(expectedMessage);

  // The rejection happens before any helper command is executed.
  expect(mock.calls).toHaveLength(0);
});
|
||||
|
||||
it("accepts computer in --tools and exposes it in built-in tool wiring", () => {
  // CLI parsing keeps "computer" as a valid tool name, in the order given.
  const { tools: requestedTools } = parseArgs(["--tools", "computer,read"]);
  expect(requestedTools).toEqual(["computer", "read"]);

  // The tool is part of the default set and is registered under its own name.
  expect(defaultCodingToolNames).toContain("computer");
  const builtIns = createAllTools(process.cwd());
  expect(builtIns.computer.name).toBe("computer");
});
|
||||
|
||||
it("mentions computer in the default system prompt", () => {
  const prompt = buildSystemPrompt();

  // Fragments the default prompt must contain, checked in this order:
  // the tool listing entry, the usage guideline, and the browser-vs-computer hint.
  const requiredFragments = [
    "- computer: Use the desktop computer: observe the screen",
    "Computer: observe before interacting. Use it for native UI",
    "Prefer browser for websites and DOM-aware tasks. Switch to computer",
  ];

  for (const fragment of requiredFragments) {
    expect(prompt).toContain(fragment);
  }
});
|
||||
});
|
||||
|
|
@ -4,11 +4,10 @@ import { TmuxAdapter } from "./tmux-adapter";
|
|||
|
||||
describe("TmuxAdapter", () => {
|
||||
let adapter: TmuxAdapter;
|
||||
let mockExecCommand: ReturnType<typeof vi.spyOn>;
|
||||
|
||||
beforeEach(() => {
|
||||
adapter = new TmuxAdapter();
|
||||
mockExecCommand = vi.spyOn(terminalAdapter, "execCommand");
|
||||
vi.spyOn(terminalAdapter, "execCommand");
|
||||
delete process.env.TMUX;
|
||||
delete process.env.ZELLIJ;
|
||||
delete process.env.WEZTERM_PANE;
|
||||
|
|
@ -21,6 +20,7 @@ describe("TmuxAdapter", () => {
|
|||
});
|
||||
|
||||
it("detects tmux in headless runtimes when the binary is available", () => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockReturnValue({
|
||||
stdout: "tmux 3.4",
|
||||
stderr: "",
|
||||
|
|
@ -33,6 +33,7 @@ describe("TmuxAdapter", () => {
|
|||
|
||||
it("does not detect tmux in GUI terminals just because the binary exists", () => {
|
||||
process.env.COLORTERM = "truecolor";
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockReturnValue({
|
||||
stdout: "tmux 3.4",
|
||||
stderr: "",
|
||||
|
|
@ -44,7 +45,8 @@ describe("TmuxAdapter", () => {
|
|||
});
|
||||
|
||||
it("creates a detached team session when not already inside tmux", () => {
|
||||
mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockImplementation((_bin, args) => {
|
||||
if (args[0] === "has-session") {
|
||||
return { stdout: "", stderr: "missing", status: 1 };
|
||||
}
|
||||
|
|
@ -65,12 +67,18 @@ describe("TmuxAdapter", () => {
|
|||
|
||||
expect(mockExecCommand).toHaveBeenCalledWith(
|
||||
"tmux",
|
||||
expect.arrayContaining(["new-session", "-d", "-s", "companion-teams-demo"]),
|
||||
expect.arrayContaining([
|
||||
"new-session",
|
||||
"-d",
|
||||
"-s",
|
||||
"companion-teams-demo",
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it("splits an existing detached session when not already inside tmux", () => {
|
||||
mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockImplementation((_bin, args) => {
|
||||
if (args[0] === "has-session") {
|
||||
return { stdout: "", stderr: "", status: 0 };
|
||||
}
|
||||
|
|
@ -96,6 +104,7 @@ describe("TmuxAdapter", () => {
|
|||
});
|
||||
|
||||
it("checks pane liveness by pane id", () => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockReturnValue({
|
||||
stdout: "%1\n%7\n",
|
||||
stderr: "",
|
||||
|
|
|
|||
|
|
@ -8,11 +8,10 @@ import { WezTermAdapter } from "./wezterm-adapter";
|
|||
|
||||
describe("WezTermAdapter", () => {
|
||||
let adapter: WezTermAdapter;
|
||||
let mockExecCommand: ReturnType<typeof vi.spyOn>;
|
||||
|
||||
beforeEach(() => {
|
||||
adapter = new WezTermAdapter();
|
||||
mockExecCommand = vi.spyOn(terminalAdapter, "execCommand");
|
||||
vi.spyOn(terminalAdapter, "execCommand");
|
||||
delete process.env.WEZTERM_PANE;
|
||||
delete process.env.TMUX;
|
||||
delete process.env.ZELLIJ;
|
||||
|
|
@ -31,6 +30,7 @@ describe("WezTermAdapter", () => {
|
|||
|
||||
describe("detect", () => {
|
||||
it("should detect when WEZTERM_PANE is set", () => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockReturnValue({
|
||||
stdout: "version 1.0",
|
||||
stderr: "",
|
||||
|
|
@ -43,7 +43,8 @@ describe("WezTermAdapter", () => {
|
|||
describe("spawn", () => {
|
||||
it("should spawn first pane to the right with 50%", () => {
|
||||
// Mock getPanes finding only current pane
|
||||
mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockImplementation((_bin, args) => {
|
||||
if (args.includes("list")) {
|
||||
return {
|
||||
stdout: JSON.stringify([{ pane_id: 0, tab_id: 0 }]),
|
||||
|
|
@ -79,7 +80,8 @@ describe("WezTermAdapter", () => {
|
|||
|
||||
it("should spawn subsequent panes by splitting the sidebar", () => {
|
||||
// Mock getPanes finding current pane (0) and sidebar pane (1)
|
||||
mockExecCommand.mockImplementation((_bin: string, args: string[]) => {
|
||||
const mockExecCommand = vi.mocked(terminalAdapter.execCommand);
|
||||
mockExecCommand.mockImplementation((_bin, args) => {
|
||||
if (args.includes("list")) {
|
||||
return {
|
||||
stdout: JSON.stringify([
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue