From 0720c47495f663f3e4c3b81c52d5b54899b938e6 Mon Sep 17 00:00:00 2001 From: Harivansh Rathi Date: Sun, 8 Mar 2026 12:50:02 -0700 Subject: [PATCH] feat: add built-in browser tool Add a first-class browser tool backed by agent-browser and enable it by default in coding sessions. Include CLI and system-prompt wiring plus focused coverage for the new tool. Co-authored-by: Codex --- packages/coding-agent/src/cli/args.ts | 19 +- .../coding-agent/src/core/agent-session.ts | 6 +- packages/coding-agent/src/core/sdk.ts | 9 +- .../coding-agent/src/core/system-prompt.ts | 14 +- .../coding-agent/src/core/tools/browser.ts | 505 ++++++++++++++++++ packages/coding-agent/src/core/tools/index.ts | 37 +- packages/coding-agent/src/index.ts | 10 + .../coding-agent/test/browser-tool.test.ts | 212 ++++++++ 8 files changed, 797 insertions(+), 15 deletions(-) create mode 100644 packages/coding-agent/src/core/tools/browser.ts create mode 100644 packages/coding-agent/test/browser-tool.test.ts diff --git a/packages/coding-agent/src/cli/args.ts b/packages/coding-agent/src/cli/args.ts index d0a59fe..8d7688b 100644 --- a/packages/coding-agent/src/cli/args.ts +++ b/packages/coding-agent/src/cli/args.ts @@ -5,7 +5,11 @@ import type { ThinkingLevel } from "@mariozechner/pi-agent-core"; import chalk from "chalk"; import { APP_NAME, CONFIG_DIR_NAME, ENV_AGENT_DIR } from "../config.js"; -import { allTools, type ToolName } from "../core/tools/index.js"; +import { + allTools, + defaultCodingToolNames, + type ToolName, +} from "../core/tools/index.js"; export type Mode = "text" | "json" | "rpc"; @@ -193,7 +197,11 @@ export function parseArgs( } export function printHelp(): void { - console.log(`${chalk.bold(APP_NAME)} - AI coding assistant with read, bash, edit, write tools + const defaultToolsText = defaultCodingToolNames.join(","); + const availableToolsText = Object.keys(allTools).join(", "); + const defaultToolsLabel = defaultCodingToolNames.join(", "); + + console.log(`${chalk.bold(APP_NAME)} - AI 
coding assistant with read, bash, browser, edit, write tools ${chalk.bold("Usage:")} ${APP_NAME} [options] [@files...] [messages...] @@ -224,8 +232,8 @@ ${chalk.bold("Options:")} --models Comma-separated model patterns for Ctrl+P cycling Supports globs (anthropic/*, *sonnet*) and fuzzy matching --no-tools Disable all built-in tools - --tools Comma-separated list of tools to enable (default: read,bash,edit,write) - Available: read, bash, edit, write, grep, find, ls + --tools Comma-separated list of tools to enable (default: ${defaultToolsText}) + Available: ${availableToolsText} --thinking Set thinking level: off, minimal, low, medium, high, xhigh --extension, -e Load an extension file (can be used multiple times) --no-extensions, -ne Disable extension discovery (explicit -e paths still work) @@ -322,9 +330,10 @@ ${chalk.bold("Environment Variables:")} PI_SHARE_VIEWER_URL - Base URL for /share command (default: https://pi.dev/session/) PI_AI_ANTIGRAVITY_VERSION - Override Antigravity User-Agent version (e.g., 1.23.0) -${chalk.bold("Available Tools (default: read, bash, edit, write):")} +${chalk.bold(`Available Tools (default: ${defaultToolsLabel}):`)} read - Read file contents bash - Execute bash commands + browser - Browser automation with persistent state edit - Edit files with find/replace write - Write files (creates/overwrites) grep - Search file contents (read-only, off by default) diff --git a/packages/coding-agent/src/core/agent-session.ts b/packages/coding-agent/src/core/agent-session.ts index c4f6282..a4d13d3 100644 --- a/packages/coding-agent/src/core/agent-session.ts +++ b/packages/coding-agent/src/core/agent-session.ts @@ -111,7 +111,7 @@ import { } from "./slash-commands.js"; import { buildSystemPrompt } from "./system-prompt.js"; import type { BashOperations } from "./tools/bash.js"; -import { createAllTools } from "./tools/index.js"; +import { createAllTools, defaultCodingToolNames } from "./tools/index.js"; // 
============================================================================ // Skill Block Parsing @@ -187,7 +187,7 @@ export interface AgentSessionConfig { customTools?: ToolDefinition[]; /** Model registry for API key resolution and model discovery */ modelRegistry: ModelRegistry; - /** Initial active built-in tool names. Default: [read, bash, edit, write] */ + /** Initial active built-in tool names. Default: [read, bash, browser, edit, write] */ initialActiveToolNames?: string[]; /** Override base tools (useful for custom runtimes). */ baseToolsOverride?: Record; @@ -2447,7 +2447,7 @@ export class AgentSession { const defaultActiveToolNames = this._baseToolsOverride ? Object.keys(this._baseToolsOverride) - : ["read", "bash", "edit", "write"]; + : defaultCodingToolNames; const baseActiveToolNames = options.activeToolNames ?? defaultActiveToolNames; this._refreshToolRegistry({ diff --git a/packages/coding-agent/src/core/sdk.ts b/packages/coding-agent/src/core/sdk.ts index 9773f0a..cef1231 100644 --- a/packages/coding-agent/src/core/sdk.ts +++ b/packages/coding-agent/src/core/sdk.ts @@ -25,8 +25,11 @@ import { time } from "./timings.js"; import { allTools, bashTool, + browserTool, codingTools, + defaultCodingToolNames, createBashTool, + createBrowserTool, createCodingTools, createEditTool, createFindTool, @@ -64,7 +67,7 @@ export interface CreateAgentSessionOptions { /** Models available for cycling (Ctrl+P in interactive mode) */ scopedModels?: Array<{ model: Model; thinkingLevel?: ThinkingLevel }>; - /** Built-in tools to use. Default: codingTools [read, bash, edit, write] */ + /** Built-in tools to use. Default: codingTools [read, bash, browser, edit, write] */ tools?: Tool[]; /** Custom tools to register (in addition to built-in tools). 
*/ customTools?: ToolDefinition[]; @@ -109,6 +112,7 @@ export { // Pre-built tools (use process.cwd()) readTool, bashTool, + browserTool, editTool, writeTool, grepTool, @@ -122,6 +126,7 @@ export { createReadOnlyTools, createReadTool, createBashTool, + createBrowserTool, createEditTool, createWriteTool, createGrepTool, @@ -262,7 +267,7 @@ export async function createAgentSession( thinkingLevel = "off"; } - const defaultActiveToolNames: ToolName[] = ["read", "bash", "edit", "write"]; + const defaultActiveToolNames: ToolName[] = [...defaultCodingToolNames]; const initialActiveToolNames: ToolName[] = options.tools ? options.tools .map((t) => t.name) diff --git a/packages/coding-agent/src/core/system-prompt.ts b/packages/coding-agent/src/core/system-prompt.ts index cac3c81..9fb37ac 100644 --- a/packages/coding-agent/src/core/system-prompt.ts +++ b/packages/coding-agent/src/core/system-prompt.ts @@ -4,11 +4,14 @@ import { getDocsPath, getReadmePath } from "../config.js"; import { formatSkillsForPrompt, type Skill } from "./skills.js"; +import { defaultCodingToolNames } from "./tools/index.js"; /** Tool descriptions for system prompt */ const toolDescriptions: Record = { read: "Read file contents", bash: "Execute bash commands (ls, grep, find, etc.)", + browser: + "Open websites, inspect pages with snapshot, click/fill/wait, take screenshots, and save/load browser state", edit: "Make surgical edits to files (find exact text and replace)", write: "Create or overwrite files", grep: "Search file contents for patterns (respects .gitignore)", @@ -19,7 +22,7 @@ const toolDescriptions: Record = { export interface BuildSystemPromptOptions { /** Custom system prompt (replaces default). */ customPrompt?: string; - /** Tools to include in prompt. Default: [read, bash, edit, write] */ + /** Tools to include in prompt. Default: coding tools including browser */ selectedTools?: string[]; /** Optional one-line tool snippets keyed by tool name. 
*/ toolSnippets?: Record; @@ -123,7 +126,7 @@ export function buildSystemPrompt( // Build tools list based on selected tools. // Built-ins use toolDescriptions. Custom tools can provide one-line snippets. - const tools = selectedTools || ["read", "bash", "edit", "write"]; + const tools = selectedTools ?? defaultCodingToolNames; const toolsList = tools.length > 0 ? tools @@ -147,6 +150,7 @@ export function buildSystemPrompt( }; const hasBash = tools.includes("bash"); + const hasBrowser = tools.includes("browser"); const hasEdit = tools.includes("edit"); const hasWrite = tools.includes("write"); const hasGrep = tools.includes("grep"); @@ -180,6 +184,12 @@ export function buildSystemPrompt( addGuideline("Use write only for new files or complete rewrites"); } + if (hasBrowser) { + addGuideline( + "Use browser for website tasks. Open the page, use snapshot to inspect interactive elements, then click, fill, wait, or screenshot as needed", + ); + } + // Output guideline (only when actually writing or executing) if (hasEdit || hasWrite) { addGuideline( diff --git a/packages/coding-agent/src/core/tools/browser.ts b/packages/coding-agent/src/core/tools/browser.ts new file mode 100644 index 0000000..fbe6dc8 --- /dev/null +++ b/packages/coding-agent/src/core/tools/browser.ts @@ -0,0 +1,505 @@ +import { spawn } from "node:child_process"; +import { randomBytes } from "node:crypto"; +import { existsSync, mkdirSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +import type { AgentTool } from "@mariozechner/pi-agent-core"; +import { type Static, Type } from "@sinclair/typebox"; +import { getAgentDir } from "../../config.js"; +import { getShellEnv, killProcessTree, sanitizeBinaryOutput } from "../../utils/shell.js"; + +const browserActions = [ + "open", + "snapshot", + "click", + "fill", + "wait", + "screenshot", + "state_save", + "state_load", + "close", +] as const; + +const browserSnapshotModes = ["interactive", "full"] as const; 
+const browserLoadStates = ["load", "domcontentloaded", "networkidle"] as const; + +const DEFAULT_BROWSER_COMMAND = process.env.PI_AGENT_BROWSER_COMMAND || "agent-browser"; +const DEFAULT_BROWSER_TIMEOUT_SECONDS = 90; + +const browserSchema = Type.Object({ + action: Type.Union( + browserActions.map((action) => Type.Literal(action)), + { description: "Browser action to execute" }, + ), + url: Type.Optional(Type.String({ description: "URL to open, or URL glob to wait for" })), + mode: Type.Optional( + Type.Union( + browserSnapshotModes.map((mode) => Type.Literal(mode)), + { description: "Snapshot mode. Defaults to interactive." }, + ), + ), + ref: Type.Optional( + Type.String({ + description: "Element ref from snapshot output, such as @e2", + }), + ), + value: Type.Optional(Type.String({ description: "Text value to fill into a field" })), + text: Type.Optional(Type.String({ description: "Visible text to wait for" })), + ms: Type.Optional( + Type.Number({ + description: "Milliseconds to wait", + minimum: 0, + }), + ), + loadState: Type.Optional( + Type.Union( + browserLoadStates.map((state) => Type.Literal(state)), + { description: "Page load state to wait for" }, + ), + ), + path: Type.Optional( + Type.String({ + description: "Output path for screenshots, relative to the current working directory if not absolute", + }), + ), + fullPage: Type.Optional(Type.Boolean({ description: "Capture a full-page screenshot" })), + stateName: Type.Optional( + Type.String({ + description: "Named browser state checkpoint stored under ~/.pi/agent/browser/states/", + }), + ), +}); + +export type BrowserToolAction = (typeof browserActions)[number]; +export type BrowserSnapshotMode = (typeof browserSnapshotModes)[number]; +export type BrowserLoadState = (typeof browserLoadStates)[number]; +export type BrowserToolInput = Static; + +export interface BrowserToolDetails { + action: BrowserToolAction; + command: string; + args: string[]; + profilePath: string; + screenshotPath?: string; + 
/**
 * Process-execution backend used by the browser tool.
 *
 * A custom implementation can be supplied through `BrowserToolOptions.operations`
 * to replace how the browser command is run.
 */
export interface BrowserOperations {
	/**
	 * Run `command` with `args` and report how the process ended.
	 *
	 * `options.onData` receives output chunks as they arrive; `options.timeout` is
	 * expressed in seconds (the default implementation multiplies it by 1000).
	 */
	exec: (
		command: string,
		args: string[],
		options: {
			cwd: string;
			env: NodeJS.ProcessEnv;
			onData: (data: Buffer) => void;
			signal?: AbortSignal;
			timeout?: number;
		},
	) => Promise<{ exitCode: number | null }>;
}

/**
 * Default backend: spawns the command as a detached child process.
 *
 * - stdout and stderr are both forwarded to `onData`.
 * - A positive `timeout` (seconds) kills the process tree and rejects with `timeout:<seconds>`.
 * - An aborted `signal` kills the process tree and rejects with `aborted`.
 * - Spawn errors reject with the original error; otherwise resolves with the exit code
 *   reported by the `close` event.
 */
const defaultBrowserOperations: BrowserOperations = {
	exec(command, args, execOptions) {
		const { cwd, env, onData, signal, timeout } = execOptions;

		return new Promise<{ exitCode: number | null }>((settle, fail) => {
			const proc = spawn(command, args, {
				cwd,
				detached: true,
				env,
				stdio: ["ignore", "pipe", "pipe"],
			});

			let hitTimeout = false;

			// Shared by the timeout and abort paths; a child without a pid never started.
			const killTree = (): void => {
				if (proc.pid) {
					killProcessTree(proc.pid);
				}
			};

			const timer =
				timeout !== undefined && timeout > 0
					? setTimeout(() => {
							hitTimeout = true;
							killTree();
						}, timeout * 1000)
					: undefined;

			proc.stdout?.on("data", onData);
			proc.stderr?.on("data", onData);

			if (signal?.aborted) {
				// Already cancelled before we got here: kill right away.
				killTree();
			} else {
				signal?.addEventListener("abort", killTree, { once: true });
			}

			// Undo the timer and abort listener once the process is finished either way.
			const release = (): void => {
				if (timer) clearTimeout(timer);
				signal?.removeEventListener("abort", killTree);
			};

			proc.on("error", (spawnError) => {
				release();
				fail(spawnError);
			});

			proc.on("close", (code) => {
				release();

				// Abort takes precedence over timeout when both happened.
				if (signal?.aborted) {
					fail(new Error("aborted"));
				} else if (hitTimeout) {
					fail(new Error(`timeout:${timeout}`));
				} else {
					settle({ exitCode: code });
				}
			});
		});
	},
};

/** Configuration accepted by `createBrowserTool`. Every field is optional. */
export interface BrowserToolOptions {
	/** Replacement process backend. */
	operations?: BrowserOperations;
	/** Executable to invoke instead of the default browser command. */
	command?: string;
	/** Per-action timeout in seconds. */
	defaultTimeoutSeconds?: number;
	/** Browser profile directory; resolved against the tool's cwd. */
	profileDir?: string;
	/** Directory holding named state checkpoints; resolved against the tool's cwd. */
	stateDir?: string;
	/** Agent directory under which the `browser` folder is placed. */
	agentDir?: string;
}
/** Fully resolved invocation plan for one browser action, built before anything is executed. */
interface BrowserCommandContext {
	action: BrowserToolAction;
	/** Complete argument vector, including the leading `--profile <path>` pair. */
	args: string[];
	/** Progress text emitted before the command runs. */
	statusMessage: string;
	/** Fallback result text used when the command prints nothing. */
	successMessage: string;
	profilePath: string;
	screenshotPath?: string;
	statePath?: string;
}

/** Resolve `inputPath` against `cwd`; absolute inputs are returned as-is by `path.resolve`. */
function resolveCommandPath(cwd: string, inputPath: string): string {
	return resolve(cwd, inputPath);
}

/** Directory that holds all browser data: `<agentDir>/browser`. */
function getBrowserRootDir(options?: BrowserToolOptions): string {
	const baseAgentDir = options?.agentDir ?? getAgentDir();
	return join(baseAgentDir, "browser");
}

/** Profile directory: `options.profileDir` if given, otherwise `<root>/profile`. */
function getBrowserProfilePath(cwd: string, options?: BrowserToolOptions): string {
	const profilePath = options?.profileDir ?? join(getBrowserRootDir(options), "profile");
	return resolveCommandPath(cwd, profilePath);
}

/** State directory: `options.stateDir` if given, otherwise `<root>/states`. */
function getBrowserStateDir(cwd: string, options?: BrowserToolOptions): string {
	const stateDir = options?.stateDir ?? join(getBrowserRootDir(options), "states");
	return resolveCommandPath(cwd, stateDir);
}

/** Fresh screenshot path in the OS temp directory with a random 16-hex-character id. */
function createTempScreenshotPath(): string {
	const id = randomBytes(8).toString("hex");
	return join(tmpdir(), `pi-browser-screenshot-${id}.png`);
}

/** Join the collected output chunks, decode as UTF-8, sanitize, and trim. */
function normalizeOutput(chunks: Buffer[]): string {
	return sanitizeBinaryOutput(Buffer.concat(chunks).toString("utf-8")).trim();
}

/**
 * Turn a user-supplied state name into a safe file stem.
 *
 * A trailing `.json` is dropped, every run of characters outside `[a-zA-Z0-9._-]`
 * becomes a single `-`, and leading/trailing dashes are removed.
 *
 * @throws Error when the name is blank or nothing usable remains after sanitizing.
 */
function sanitizeStateName(stateName: string): string {
	const trimmed = stateName.trim();
	if (trimmed.length === 0) {
		throw new Error("stateName is required for browser state actions");
	}

	const stem = trimmed.endsWith(".json") ? trimmed.slice(0, -".json".length) : trimmed;
	const sanitized = stem.replace(/[^a-zA-Z0-9._-]+/g, "-").replace(/^-+|-+$/g, "");

	if (sanitized.length === 0) {
		throw new Error(`Invalid browser state name: "${stateName}"`);
	}

	return sanitized;
}

/** Create the profile and state directories (and any missing parents). */
function ensureBrowserDirs(profilePath: string, stateDir: string): void {
	for (const dir of [profilePath, stateDir]) {
		mkdirSync(dir, { recursive: true });
	}
}

/**
 * Translate a `wait` request into CLI arguments and a progress message.
 *
 * Exactly one of `ref`, `url`, `text`, `ms`, or `loadState` must be provided.
 * The target is selected with the same `!== undefined` test that is used to count
 * targets, so a counted target can never be skipped by a truthiness check
 * (previously `text: ""` fell through and produced `--load undefined`).
 *
 * @throws Error when zero or several targets are given, or when a string target is empty.
 */
function buildWaitArgs(input: BrowserToolInput): { args: string[]; status: string } {
	const exactlyOneMessage = "browser wait requires exactly one of ref, url, text, ms, or loadState";
	const { ref, url, text, ms, loadState } = input;

	const providedCount = [ref, url, text, ms, loadState].filter((target) => target !== undefined).length;
	if (providedCount !== 1) {
		throw new Error(exactlyOneMessage);
	}

	// An empty string cannot identify an element, URL, or text to wait for.
	const requireNonEmpty = (name: string, value: string): string => {
		if (value.length === 0) {
			throw new Error(`browser wait ${name} must not be empty`);
		}
		return value;
	};

	if (ref !== undefined) {
		const target = requireNonEmpty("ref", ref);
		return { args: ["wait", target], status: `Waiting for ${target}...` };
	}
	if (url !== undefined) {
		const target = requireNonEmpty("url", url);
		return { args: ["wait", "--url", target], status: `Waiting for URL ${target}...` };
	}
	if (text !== undefined) {
		const target = requireNonEmpty("text", text);
		return { args: ["wait", "--text", target], status: `Waiting for text "${target}"...` };
	}
	if (ms !== undefined) {
		return { args: ["wait", String(ms)], status: `Waiting ${ms}ms...` };
	}
	if (loadState !== undefined) {
		return { args: ["wait", "--load", loadState], status: `Waiting for load state ${loadState}...` };
	}

	// Unreachable given the count check above; kept so no non-null assertion is needed.
	throw new Error(exactlyOneMessage);
}
/**
 * Validate a browser tool request and turn it into a ready-to-run command plan.
 *
 * The profile and state directories are created first, then the action-specific
 * arguments are validated. Every argument vector starts with `--profile <profilePath>`.
 *
 * @param cwd Directory against which relative paths (profile, state, screenshot) are resolved.
 * @param input The tool call arguments.
 * @param options Optional directory overrides.
 * @returns The argument vector plus status/success messages and any output paths.
 * @throws Error when a required argument for the action is missing, when a state to
 *   load does not exist on disk, or when the action is not recognised.
 */
function buildBrowserCommand(
	cwd: string,
	input: BrowserToolInput,
	options?: BrowserToolOptions,
): BrowserCommandContext {
	const profilePath = getBrowserProfilePath(cwd, options);
	const stateDir = getBrowserStateDir(cwd, options);
	ensureBrowserDirs(profilePath, stateDir);

	const { action } = input;

	// Prefix every invocation with the shared profile flag.
	const cli = (...actionArgs: string[]): string[] => ["--profile", profilePath, ...actionArgs];

	// Assemble the result; `outputs` carries the optional screenshot/state path.
	const plan = (
		args: string[],
		statusMessage: string,
		successMessage: string,
		outputs: Pick<BrowserCommandContext, "screenshotPath" | "statePath"> = {},
	): BrowserCommandContext => ({
		action,
		args,
		statusMessage,
		successMessage,
		profilePath,
		...outputs,
	});

	// Named checkpoints live in the state directory as `<sanitized-name>.json`.
	const stateFile = (name: string): string => join(stateDir, `${sanitizeStateName(name)}.json`);

	switch (action) {
		case "open": {
			const { url } = input;
			if (!url) {
				throw new Error("browser open requires url");
			}
			return plan(cli("open", url), `Opening ${url}...`, `Opened ${url}`);
		}

		case "snapshot": {
			const interactive = (input.mode ?? "interactive") === "interactive";
			return plan(
				interactive ? cli("snapshot", "-i") : cli("snapshot"),
				"Capturing browser snapshot...",
				"Captured browser snapshot",
			);
		}

		case "click": {
			const { ref } = input;
			if (!ref) {
				throw new Error("browser click requires ref");
			}
			return plan(cli("click", ref), `Clicking ${ref}...`, `Clicked ${ref}`);
		}

		case "fill": {
			const { ref, value } = input;
			// An empty string is a legitimate value (clears the field); only `undefined` is rejected.
			if (!ref || value === undefined) {
				throw new Error("browser fill requires ref and value");
			}
			return plan(cli("fill", ref, value), `Filling ${ref}...`, `Filled ${ref}`);
		}

		case "wait": {
			const { args: waitArgs, status } = buildWaitArgs(input);
			return plan(cli(...waitArgs), status, "Browser wait condition satisfied");
		}

		case "screenshot": {
			// Without an explicit path the screenshot goes to a random temp file.
			const screenshotPath = input.path ? resolveCommandPath(cwd, input.path) : createTempScreenshotPath();
			const flags = input.fullPage ? ["--full"] : [];
			return plan(
				cli("screenshot", ...flags, screenshotPath),
				"Taking browser screenshot...",
				`Saved browser screenshot to ${screenshotPath}`,
				{ screenshotPath },
			);
		}

		case "state_save": {
			const { stateName } = input;
			if (!stateName) {
				throw new Error("browser state_save requires stateName");
			}
			const statePath = stateFile(stateName);
			return plan(
				cli("state", "save", statePath),
				`Saving browser state "${stateName}"...`,
				`Saved browser state "${stateName}" to ${statePath}`,
				{ statePath },
			);
		}

		case "state_load": {
			const { stateName } = input;
			if (!stateName) {
				throw new Error("browser state_load requires stateName");
			}
			const statePath = stateFile(stateName);
			// Fail early with a readable message instead of letting the CLI report it.
			if (!existsSync(statePath)) {
				throw new Error(`Saved browser state "${stateName}" not found at ${statePath}`);
			}
			return plan(
				cli("state", "load", statePath),
				`Loading browser state "${stateName}"...`,
				`Loaded browser state "${stateName}" from ${statePath}`,
				{ statePath },
			);
		}

		case "close":
			return plan(cli("close"), "Closing browser...", "Closed browser");

		default: {
			// Compile-time exhaustiveness check; at runtime this reports the unknown action.
			const unsupportedAction: never = action;
			throw new Error(`Unsupported browser action: ${unsupportedAction}`);
		}
	}
}
/**
 * Compose the error text for a failed browser command.
 *
 * The command's own output (when any) comes first, followed by a summary line that
 * mentions the exit code, or just "failed" when no exit code is available.
 */
function buildBrowserErrorMessage(action: BrowserToolAction, output: string, exitCode: number | null): string {
	const base =
		exitCode === null
			? `Browser action "${action}" failed`
			: `Browser action "${action}" exited with code ${exitCode}`;
	return output.length > 0 ? `${output}\n\n${base}` : base;
}

/** Installation hint shown when the browser executable cannot be found. */
function getMissingBrowserCommandMessage(command: string): string {
	return [
		`Browser tool could not find "${command}".`,
		"Install agent-browser so the first-class browser tool can run.",
		"Recommended setup:",
		"  npm install -g agent-browser",
		"  agent-browser install",
		"If Chromium lives at a custom path, set AGENT_BROWSER_EXECUTABLE_PATH.",
	].join("\n");
}

/**
 * Throw unless the command exited with code 0.
 *
 * A `null` exit code means the process ended without a normal exit status (Node
 * reports `null` when the child was terminated by a signal), so it is treated as a
 * failure rather than silently reported as success.
 *
 * @throws Error built by `buildBrowserErrorMessage` for any exit code other than 0.
 */
function ensureBrowserCommandSucceeded(action: BrowserToolAction, output: string, exitCode: number | null): void {
	if (exitCode !== 0) {
		throw new Error(buildBrowserErrorMessage(action, output, exitCode));
	}
}

/**
 * Create the `browser` tool bound to a working directory.
 *
 * Each call validates the request, emits a status update, runs the browser command
 * through the configured operations backend, and returns the command output (or a
 * canned success message when the command printed nothing).
 *
 * @param cwd Working directory for the spawned command and for resolving relative paths.
 * @param options Optional overrides for the backend, executable, timeout, and directories.
 * @returns The tool definition. Its `execute` rejects when validation fails, when the
 *   executable is missing (`ENOENT`), when the call is aborted or times out, when the
 *   command does not exit with code 0, or when a snapshot produces no output.
 */
export function createBrowserTool(cwd: string, options?: BrowserToolOptions): AgentTool<typeof browserSchema> {
	const operations = options?.operations ?? defaultBrowserOperations;
	const command = options?.command ?? DEFAULT_BROWSER_COMMAND;
	const defaultTimeoutSeconds = options?.defaultTimeoutSeconds ?? DEFAULT_BROWSER_TIMEOUT_SECONDS;

	return {
		name: "browser",
		label: "browser",
		description:
			"Use a persistent browser for websites: open pages, inspect them with snapshot, click or fill elements, wait for changes, take screenshots, and save or load named browser state.",
		parameters: browserSchema,
		execute: async (_toolCallId, input, signal, onUpdate) => {
			// Validation errors are thrown here, before anything is spawned.
			const commandContext = buildBrowserCommand(cwd, input, options);
			const details: BrowserToolDetails = {
				action: commandContext.action,
				command,
				args: commandContext.args,
				profilePath: commandContext.profilePath,
				screenshotPath: commandContext.screenshotPath,
				statePath: commandContext.statePath,
			};

			onUpdate?.({
				content: [{ type: "text", text: commandContext.statusMessage }],
				details,
			});

			const chunks: Buffer[] = [];

			try {
				const { exitCode } = await operations.exec(command, commandContext.args, {
					cwd,
					env: getShellEnv(),
					onData: (data) => chunks.push(data),
					signal,
					timeout: defaultTimeoutSeconds,
				});

				const output = normalizeOutput(chunks);
				ensureBrowserCommandSucceeded(commandContext.action, output, exitCode);

				if (commandContext.action === "snapshot") {
					// A snapshot with no text gives the caller nothing to act on.
					if (output.length === 0) {
						throw new Error("Browser snapshot returned no output");
					}
					return {
						content: [{ type: "text", text: output }],
						details,
					};
				}

				const text = output.length > 0 ? output : commandContext.successMessage;
				return {
					content: [{ type: "text", text }],
					details,
				};
			} catch (error) {
				// Translate the backend's sentinel errors into user-facing messages.
				if (error instanceof Error && "code" in error && error.code === "ENOENT") {
					throw new Error(getMissingBrowserCommandMessage(command));
				}
				if (error instanceof Error && error.message === "aborted") {
					throw new Error(`Browser action "${commandContext.action}" aborted`);
				}
				if (error instanceof Error && error.message.startsWith("timeout:")) {
					const seconds = error.message.split(":")[1];
					throw new Error(`Browser action "${commandContext.action}" timed out after ${seconds} seconds`);
				}
				throw error;
			}
		},
	};
}
createFindTool, findTool } from "./find.js"; import { createGrepTool, grepTool } from "./grep.js"; @@ -78,9 +94,6 @@ import { createWriteTool, writeTool } from "./write.js"; /** Tool type (AgentTool from pi-ai) */ export type Tool = AgentTool; -// Default tools for full access mode (using process.cwd()) -export const codingTools: Tool[] = [readTool, bashTool, editTool, writeTool]; - // Read-only tools for exploration without modification (using process.cwd()) export const readOnlyTools: Tool[] = [readTool, grepTool, findTool, lsTool]; @@ -88,6 +101,7 @@ export const readOnlyTools: Tool[] = [readTool, grepTool, findTool, lsTool]; export const allTools = { read: readTool, bash: bashTool, + browser: browserTool, edit: editTool, write: writeTool, grep: grepTool, @@ -97,11 +111,26 @@ export const allTools = { export type ToolName = keyof typeof allTools; +export const defaultCodingToolNames: ToolName[] = [ + "read", + "bash", + "browser", + "edit", + "write", +]; + +// Default tools for full access mode (using process.cwd()) +export const codingTools: Tool[] = defaultCodingToolNames.map( + (toolName) => allTools[toolName], +); + export interface ToolsOptions { /** Options for the read tool */ read?: ReadToolOptions; /** Options for the bash tool */ bash?: BashToolOptions; + /** Options for the browser tool */ + browser?: BrowserToolOptions; } /** @@ -111,6 +140,7 @@ export function createCodingTools(cwd: string, options?: ToolsOptions): Tool[] { return [ createReadTool(cwd, options?.read), createBashTool(cwd, options?.bash), + createBrowserTool(cwd, options?.browser), createEditTool(cwd), createWriteTool(cwd), ]; @@ -141,6 +171,7 @@ export function createAllTools( return { read: createReadTool(cwd, options?.read), bash: createBashTool(cwd, options?.bash), + browser: createBrowserTool(cwd, options?.browser), edit: createEditTool(cwd), write: createWriteTool(cwd), grep: createGrepTool(cwd), diff --git a/packages/coding-agent/src/index.ts 
b/packages/coding-agent/src/index.ts index 4a33917..ed278e6 100644 --- a/packages/coding-agent/src/index.ts +++ b/packages/coding-agent/src/index.ts @@ -181,6 +181,7 @@ export { // Factory createAgentSession, createBashTool, + createBrowserTool, // Tool factories (for custom cwd) createCodingTools, createEditTool, @@ -244,7 +245,16 @@ export { type BashToolInput, type BashToolOptions, bashTool, + type BrowserLoadState, + type BrowserOperations, + type BrowserSnapshotMode, + type BrowserToolAction, + type BrowserToolDetails, + type BrowserToolInput, + type BrowserToolOptions, + browserTool, codingTools, + defaultCodingToolNames, DEFAULT_MAX_BYTES, DEFAULT_MAX_LINES, type EditOperations, diff --git a/packages/coding-agent/test/browser-tool.test.ts b/packages/coding-agent/test/browser-tool.test.ts new file mode 100644 index 0000000..7c035e4 --- /dev/null +++ b/packages/coding-agent/test/browser-tool.test.ts @@ -0,0 +1,212 @@ +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { parseArgs } from "../src/cli/args.js"; +import { buildSystemPrompt } from "../src/core/system-prompt.js"; +import { + type BrowserOperations, + type BrowserToolDetails, + createAllTools, + createBrowserTool, + defaultCodingToolNames, +} from "../src/core/tools/index.js"; + +interface TextBlock { + type: "text"; + text: string; +} + +type ToolContentBlock = TextBlock | { type: string }; + +interface ToolResultLike { + content: ToolContentBlock[]; + details?: unknown; +} + +interface BrowserExecCall { + command: string; + args: string[]; + cwd: string; + env: NodeJS.ProcessEnv; + timeout?: number; +} + +function getTextOutput(result: ToolResultLike): string { + return result.content + .filter((block): block is TextBlock => block.type === "text") + .map((block) => block.text) + .join("\n"); +} + +function createMockBrowserOperations( + output = "", + exitCode = 0, 
+): { + calls: BrowserExecCall[]; + operations: BrowserOperations; +} { + const calls: BrowserExecCall[] = []; + + return { + calls, + operations: { + exec: async (command, args, options) => { + calls.push({ + command, + args, + cwd: options.cwd, + env: options.env, + timeout: options.timeout, + }); + if (output.length > 0) { + options.onData(Buffer.from(output, "utf-8")); + } + return { exitCode }; + }, + }, + }; +} + +describe("browser tool", () => { + const tempDirs: string[] = []; + + afterEach(() => { + while (tempDirs.length > 0) { + const tempDir = tempDirs.pop(); + if (tempDir) { + rmSync(tempDir, { recursive: true, force: true }); + } + } + }); + + function createTempDir(prefix: string): string { + const tempDir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(tempDir); + return tempDir; + } + + it("opens pages through agent-browser with a shared profile", async () => { + const cwd = createTempDir("coding-agent-browser-open-"); + const profileDir = join(cwd, "profile"); + const stateDir = join(cwd, "states"); + const { calls, operations } = createMockBrowserOperations(); + + const browserTool = createBrowserTool(cwd, { + operations, + command: "agent-browser-test", + profileDir, + stateDir, + }); + + const result = (await browserTool.execute("browser-open", { + action: "open", + url: "https://example.com", + })) as ToolResultLike; + + expect(calls).toHaveLength(1); + expect(calls[0]).toMatchObject({ + command: "agent-browser-test", + args: ["--profile", profileDir, "open", "https://example.com"], + cwd, + timeout: 90, + }); + expect(getTextOutput(result)).toBe("Opened https://example.com"); + + const details = result.details as BrowserToolDetails | undefined; + expect(details?.profilePath).toBe(profileDir); + }); + + it("uses interactive snapshots by default and returns snapshot text", async () => { + const cwd = createTempDir("coding-agent-browser-snapshot-"); + const profileDir = join(cwd, "profile"); + const stateDir = join(cwd, "states"); + const 
{ calls, operations } = createMockBrowserOperations("main [ref=@e1]\nbutton [ref=@e2] Sign in"); + + const browserTool = createBrowserTool(cwd, { + operations, + profileDir, + stateDir, + }); + + const result = (await browserTool.execute("browser-snapshot", { + action: "snapshot", + })) as ToolResultLike; + + expect(calls[0]?.args).toEqual(["--profile", profileDir, "snapshot", "-i"]); + expect(getTextOutput(result)).toContain("button [ref=@e2] Sign in"); + }); + + it("validates wait targets before spawning agent-browser", async () => { + const cwd = createTempDir("coding-agent-browser-wait-"); + const profileDir = join(cwd, "profile"); + const stateDir = join(cwd, "states"); + const { calls, operations } = createMockBrowserOperations(); + + const browserTool = createBrowserTool(cwd, { + operations, + profileDir, + stateDir, + }); + + await expect( + browserTool.execute("browser-wait-missing", { + action: "wait", + }), + ).rejects.toThrow("browser wait requires exactly one of ref, url, text, ms, or loadState"); + + await expect( + browserTool.execute("browser-wait-ambiguous", { + action: "wait", + ref: "@e2", + text: "Done", + }), + ).rejects.toThrow("browser wait requires exactly one of ref, url, text, ms, or loadState"); + + expect(calls).toHaveLength(0); + }); + + it("stores named state under the managed browser state directory", async () => { + const cwd = createTempDir("coding-agent-browser-state-"); + const profileDir = join(cwd, "profile"); + const stateDir = join(cwd, "states"); + const { calls, operations } = createMockBrowserOperations(); + + const browserTool = createBrowserTool(cwd, { + operations, + profileDir, + stateDir, + }); + + const result = (await browserTool.execute("browser-state-save", { + action: "state_save", + stateName: "my session/prod", + })) as ToolResultLike; + + const expectedStatePath = join(stateDir, "my-session-prod.json"); + expect(calls[0]?.args).toEqual(["--profile", profileDir, "state", "save", expectedStatePath]); + + const 
details = result.details as BrowserToolDetails | undefined; + expect(details?.statePath).toBe(expectedStatePath); + expect(getTextOutput(result)).toContain(expectedStatePath); + }); + + it("accepts browser in --tools and exposes it in default tool wiring", () => { + const parsed = parseArgs(["--tools", "browser,read"]); + expect(parsed.tools).toEqual(["browser", "read"]); + + expect(defaultCodingToolNames).toContain("browser"); + expect(createAllTools(process.cwd()).browser.name).toBe("browser"); + }); + + it("mentions browser in the default system prompt", () => { + const prompt = buildSystemPrompt(); + + expect(prompt).toContain( + "- browser: Open websites, inspect pages with snapshot, click/fill/wait, take screenshots, and save/load browser state", + ); + expect(prompt).toContain( + "Use browser for website tasks. Open the page, use snapshot to inspect interactive elements, then click, fill, wait, or screenshot as needed", + ); + }); +});