mirror of
https://github.com/harivansh-afk/clanker-agent.git
synced 2026-04-15 07:04:45 +00:00
feat: add built-in browser tool
Add a first-class browser tool backed by agent-browser and enable it by default in coding sessions. Include CLI and system-prompt wiring plus focused coverage for the new tool. Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
parent
df702d95a3
commit
0720c47495
8 changed files with 797 additions and 15 deletions
|
|
@ -5,7 +5,11 @@
|
|||
import type { ThinkingLevel } from "@mariozechner/pi-agent-core";
|
||||
import chalk from "chalk";
|
||||
import { APP_NAME, CONFIG_DIR_NAME, ENV_AGENT_DIR } from "../config.js";
|
||||
import { allTools, type ToolName } from "../core/tools/index.js";
|
||||
import {
|
||||
allTools,
|
||||
defaultCodingToolNames,
|
||||
type ToolName,
|
||||
} from "../core/tools/index.js";
|
||||
|
||||
export type Mode = "text" | "json" | "rpc";
|
||||
|
||||
|
|
@ -193,7 +197,11 @@ export function parseArgs(
|
|||
}
|
||||
|
||||
export function printHelp(): void {
|
||||
console.log(`${chalk.bold(APP_NAME)} - AI coding assistant with read, bash, edit, write tools
|
||||
const defaultToolsText = defaultCodingToolNames.join(",");
|
||||
const availableToolsText = Object.keys(allTools).join(", ");
|
||||
const defaultToolsLabel = defaultCodingToolNames.join(", ");
|
||||
|
||||
console.log(`${chalk.bold(APP_NAME)} - AI coding assistant with read, bash, browser, edit, write tools
|
||||
|
||||
${chalk.bold("Usage:")}
|
||||
${APP_NAME} [options] [@files...] [messages...]
|
||||
|
|
@ -224,8 +232,8 @@ ${chalk.bold("Options:")}
|
|||
--models <patterns> Comma-separated model patterns for Ctrl+P cycling
|
||||
Supports globs (anthropic/*, *sonnet*) and fuzzy matching
|
||||
--no-tools Disable all built-in tools
|
||||
--tools <tools> Comma-separated list of tools to enable (default: read,bash,edit,write)
|
||||
Available: read, bash, edit, write, grep, find, ls
|
||||
--tools <tools> Comma-separated list of tools to enable (default: ${defaultToolsText})
|
||||
Available: ${availableToolsText}
|
||||
--thinking <level> Set thinking level: off, minimal, low, medium, high, xhigh
|
||||
--extension, -e <path> Load an extension file (can be used multiple times)
|
||||
--no-extensions, -ne Disable extension discovery (explicit -e paths still work)
|
||||
|
|
@ -322,9 +330,10 @@ ${chalk.bold("Environment Variables:")}
|
|||
PI_SHARE_VIEWER_URL - Base URL for /share command (default: https://pi.dev/session/)
|
||||
PI_AI_ANTIGRAVITY_VERSION - Override Antigravity User-Agent version (e.g., 1.23.0)
|
||||
|
||||
${chalk.bold("Available Tools (default: read, bash, edit, write):")}
|
||||
${chalk.bold(`Available Tools (default: ${defaultToolsLabel}):`)}
|
||||
read - Read file contents
|
||||
bash - Execute bash commands
|
||||
browser - Browser automation with persistent state
|
||||
edit - Edit files with find/replace
|
||||
write - Write files (creates/overwrites)
|
||||
grep - Search file contents (read-only, off by default)
|
||||
|
|
|
|||
|
|
@ -111,7 +111,7 @@ import {
|
|||
} from "./slash-commands.js";
|
||||
import { buildSystemPrompt } from "./system-prompt.js";
|
||||
import type { BashOperations } from "./tools/bash.js";
|
||||
import { createAllTools } from "./tools/index.js";
|
||||
import { createAllTools, defaultCodingToolNames } from "./tools/index.js";
|
||||
|
||||
// ============================================================================
|
||||
// Skill Block Parsing
|
||||
|
|
@ -187,7 +187,7 @@ export interface AgentSessionConfig {
|
|||
customTools?: ToolDefinition[];
|
||||
/** Model registry for API key resolution and model discovery */
|
||||
modelRegistry: ModelRegistry;
|
||||
/** Initial active built-in tool names. Default: [read, bash, edit, write] */
|
||||
/** Initial active built-in tool names. Default: [read, bash, browser, edit, write] */
|
||||
initialActiveToolNames?: string[];
|
||||
/** Override base tools (useful for custom runtimes). */
|
||||
baseToolsOverride?: Record<string, AgentTool>;
|
||||
|
|
@ -2447,7 +2447,7 @@ export class AgentSession {
|
|||
|
||||
const defaultActiveToolNames = this._baseToolsOverride
|
||||
? Object.keys(this._baseToolsOverride)
|
||||
: ["read", "bash", "edit", "write"];
|
||||
: defaultCodingToolNames;
|
||||
const baseActiveToolNames =
|
||||
options.activeToolNames ?? defaultActiveToolNames;
|
||||
this._refreshToolRegistry({
|
||||
|
|
|
|||
|
|
@ -25,8 +25,11 @@ import { time } from "./timings.js";
|
|||
import {
|
||||
allTools,
|
||||
bashTool,
|
||||
browserTool,
|
||||
codingTools,
|
||||
defaultCodingToolNames,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
createCodingTools,
|
||||
createEditTool,
|
||||
createFindTool,
|
||||
|
|
@ -64,7 +67,7 @@ export interface CreateAgentSessionOptions {
|
|||
/** Models available for cycling (Ctrl+P in interactive mode) */
|
||||
scopedModels?: Array<{ model: Model<any>; thinkingLevel?: ThinkingLevel }>;
|
||||
|
||||
/** Built-in tools to use. Default: codingTools [read, bash, edit, write] */
|
||||
/** Built-in tools to use. Default: codingTools [read, bash, browser, edit, write] */
|
||||
tools?: Tool[];
|
||||
/** Custom tools to register (in addition to built-in tools). */
|
||||
customTools?: ToolDefinition[];
|
||||
|
|
@ -109,6 +112,7 @@ export {
|
|||
// Pre-built tools (use process.cwd())
|
||||
readTool,
|
||||
bashTool,
|
||||
browserTool,
|
||||
editTool,
|
||||
writeTool,
|
||||
grepTool,
|
||||
|
|
@ -122,6 +126,7 @@ export {
|
|||
createReadOnlyTools,
|
||||
createReadTool,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
createEditTool,
|
||||
createWriteTool,
|
||||
createGrepTool,
|
||||
|
|
@ -262,7 +267,7 @@ export async function createAgentSession(
|
|||
thinkingLevel = "off";
|
||||
}
|
||||
|
||||
const defaultActiveToolNames: ToolName[] = ["read", "bash", "edit", "write"];
|
||||
const defaultActiveToolNames: ToolName[] = [...defaultCodingToolNames];
|
||||
const initialActiveToolNames: ToolName[] = options.tools
|
||||
? options.tools
|
||||
.map((t) => t.name)
|
||||
|
|
|
|||
|
|
@ -4,11 +4,14 @@
|
|||
|
||||
import { getDocsPath, getReadmePath } from "../config.js";
|
||||
import { formatSkillsForPrompt, type Skill } from "./skills.js";
|
||||
import { defaultCodingToolNames } from "./tools/index.js";
|
||||
|
||||
/** Tool descriptions for system prompt */
|
||||
const toolDescriptions: Record<string, string> = {
|
||||
read: "Read file contents",
|
||||
bash: "Execute bash commands (ls, grep, find, etc.)",
|
||||
browser:
|
||||
"Open websites, inspect pages with snapshot, click/fill/wait, take screenshots, and save/load browser state",
|
||||
edit: "Make surgical edits to files (find exact text and replace)",
|
||||
write: "Create or overwrite files",
|
||||
grep: "Search file contents for patterns (respects .gitignore)",
|
||||
|
|
@ -19,7 +22,7 @@ const toolDescriptions: Record<string, string> = {
|
|||
export interface BuildSystemPromptOptions {
|
||||
/** Custom system prompt (replaces default). */
|
||||
customPrompt?: string;
|
||||
/** Tools to include in prompt. Default: [read, bash, edit, write] */
|
||||
/** Tools to include in prompt. Default: coding tools including browser */
|
||||
selectedTools?: string[];
|
||||
/** Optional one-line tool snippets keyed by tool name. */
|
||||
toolSnippets?: Record<string, string>;
|
||||
|
|
@ -123,7 +126,7 @@ export function buildSystemPrompt(
|
|||
|
||||
// Build tools list based on selected tools.
|
||||
// Built-ins use toolDescriptions. Custom tools can provide one-line snippets.
|
||||
const tools = selectedTools || ["read", "bash", "edit", "write"];
|
||||
const tools = selectedTools ?? defaultCodingToolNames;
|
||||
const toolsList =
|
||||
tools.length > 0
|
||||
? tools
|
||||
|
|
@ -147,6 +150,7 @@ export function buildSystemPrompt(
|
|||
};
|
||||
|
||||
const hasBash = tools.includes("bash");
|
||||
const hasBrowser = tools.includes("browser");
|
||||
const hasEdit = tools.includes("edit");
|
||||
const hasWrite = tools.includes("write");
|
||||
const hasGrep = tools.includes("grep");
|
||||
|
|
@ -180,6 +184,12 @@ export function buildSystemPrompt(
|
|||
addGuideline("Use write only for new files or complete rewrites");
|
||||
}
|
||||
|
||||
if (hasBrowser) {
|
||||
addGuideline(
|
||||
"Use browser for website tasks. Open the page, use snapshot to inspect interactive elements, then click, fill, wait, or screenshot as needed",
|
||||
);
|
||||
}
|
||||
|
||||
// Output guideline (only when actually writing or executing)
|
||||
if (hasEdit || hasWrite) {
|
||||
addGuideline(
|
||||
|
|
|
|||
505
packages/coding-agent/src/core/tools/browser.ts
Normal file
505
packages/coding-agent/src/core/tools/browser.ts
Normal file
|
|
@ -0,0 +1,505 @@
|
|||
import { spawn } from "node:child_process";
|
||||
import { randomBytes } from "node:crypto";
|
||||
import { existsSync, mkdirSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join, resolve } from "node:path";
|
||||
import type { AgentTool } from "@mariozechner/pi-agent-core";
|
||||
import { type Static, Type } from "@sinclair/typebox";
|
||||
import { getAgentDir } from "../../config.js";
|
||||
import { getShellEnv, killProcessTree, sanitizeBinaryOutput } from "../../utils/shell.js";
|
||||
|
||||
/**
 * Actions the browser tool accepts. The schema's `action` union and the
 * `BrowserToolAction` type are both derived from this tuple.
 */
const browserActions = ["open", "snapshot", "click", "fill", "wait", "screenshot", "state_save", "state_load", "close"] as const;

/** Snapshot modes; "interactive" is used when the caller does not pick one. */
const browserSnapshotModes = ["interactive", "full"] as const;

/** Page load states that a `wait` action may target. */
const browserLoadStates = ["load", "domcontentloaded", "networkidle"] as const;
|
||||
|
||||
/** Timeout applied to every browser command unless the tool options override it (seconds). */
const DEFAULT_BROWSER_TIMEOUT_SECONDS = 90;

/**
 * Executable used to drive the browser. PI_AGENT_BROWSER_COMMAND overrides it;
 * an unset or empty variable falls back to "agent-browser".
 */
const DEFAULT_BROWSER_COMMAND = process.env.PI_AGENT_BROWSER_COMMAND || "agent-browser";
|
||||
|
||||
/**
 * Parameter schema for the browser tool.
 *
 * Only `action` is required by the schema itself. Which of the optional fields
 * a given action needs is enforced later, when the command line is built.
 */
const browserSchema = Type.Object({
	// Which browser operation to run.
	action: Type.Union(
		browserActions.map((action) => Type.Literal(action)),
		{ description: "Browser action to execute" },
	),

	// Used by "open" (target URL) and by "wait" (URL glob).
	url: Type.Optional(Type.String({ description: "URL to open, or URL glob to wait for" })),

	// Snapshot flavour for the "snapshot" action.
	mode: Type.Optional(
		Type.Union(
			browserSnapshotModes.map((mode) => Type.Literal(mode)),
			{ description: "Snapshot mode. Defaults to interactive." },
		),
	),

	// Element handle for "click", "fill" and "wait".
	ref: Type.Optional(
		Type.String({
			description: "Element ref from snapshot output, such as @e2",
		}),
	),

	// Text typed by "fill".
	value: Type.Optional(Type.String({ description: "Text value to fill into a field" })),

	// "wait" targets: visible text, a fixed delay, or a page load state.
	text: Type.Optional(Type.String({ description: "Visible text to wait for" })),
	ms: Type.Optional(
		Type.Number({
			description: "Milliseconds to wait",
			minimum: 0,
		}),
	),
	loadState: Type.Optional(
		Type.Union(
			browserLoadStates.map((state) => Type.Literal(state)),
			{ description: "Page load state to wait for" },
		),
	),

	// "screenshot" options.
	path: Type.Optional(
		Type.String({
			description: "Output path for screenshots, relative to the current working directory if not absolute",
		}),
	),
	fullPage: Type.Optional(Type.Boolean({ description: "Capture a full-page screenshot" })),

	// Checkpoint name for "state_save" / "state_load".
	stateName: Type.Optional(
		Type.String({
			description: "Named browser state checkpoint stored under ~/.pi/agent/browser/states/",
		}),
	),
});
|
||||
|
||||
/** Union of the action names listed in `browserActions`. */
export type BrowserToolAction = (typeof browserActions)[number];

/** Union of the snapshot modes listed in `browserSnapshotModes`. */
export type BrowserSnapshotMode = (typeof browserSnapshotModes)[number];

/** Union of the load states listed in `browserLoadStates`. */
export type BrowserLoadState = (typeof browserLoadStates)[number];

/** Static shape of the tool input, derived from `browserSchema`. */
export type BrowserToolInput = Static<typeof browserSchema>;
|
||||
|
||||
/** Structured details attached to every browser tool update and result. */
export interface BrowserToolDetails {
	/** Action that was requested. */
	action: BrowserToolAction;
	/** Executable that was (or is about to be) invoked. */
	command: string;
	/** Full argument vector passed to `command`, including the `--profile` pair. */
	args: string[];
	/** Resolved browser profile directory used for the call. */
	profilePath: string;
	/** Resolved output file; only set for "screenshot". */
	screenshotPath?: string;
	/** Resolved state file; only set for "state_save" and "state_load". */
	statePath?: string;
}
|
||||
|
||||
/**
 * Pluggable process runner for the browser tool, so tests or custom runtimes
 * can replace the default spawn-based implementation.
 */
export interface BrowserOperations {
	/**
	 * Run `command` with `args` and resolve with its exit code.
	 *
	 * The tool's error handling recognises three kinds of rejection: an Error
	 * whose `code` is "ENOENT" (command not found), an Error with the message
	 * "aborted", and an Error whose message starts with "timeout:" followed by
	 * the timeout value.
	 */
	exec: (
		command: string,
		args: string[],
		options: {
			/** Working directory for the process. */
			cwd: string;
			/** Environment passed to the process. */
			env: NodeJS.ProcessEnv;
			/** Receives every chunk written to stdout or stderr. */
			onData: (data: Buffer) => void;
			/** Cancels the running command when aborted. */
			signal?: AbortSignal;
			/** Timeout in seconds (the default implementation multiplies it by 1000). */
			timeout?: number;
		},
	) => Promise<{ exitCode: number | null }>;
}
|
||||
|
||||
/**
 * Default process runner: spawns the command as a detached child, streams
 * stdout and stderr into `onData`, and settles once the child closes.
 *
 * Rejections follow the convention the browser tool expects:
 * - spawn failures are passed through unchanged (e.g. `code === "ENOENT"`),
 * - cancellation rejects with `Error("aborted")`,
 * - an expired timeout rejects with `Error("timeout:<seconds>")`.
 */
const defaultBrowserOperations: BrowserOperations = {
	exec: (command, args, { cwd, env, onData, signal, timeout }) => {
		return new Promise((resolvePromise, rejectPromise) => {
			// A call that was cancelled before it started must not launch a process at all.
			if (signal?.aborted) {
				rejectPromise(new Error("aborted"));
				return;
			}

			const child = spawn(command, args, {
				cwd,
				detached: true,
				env,
				stdio: ["ignore", "pipe", "pipe"],
			});

			let settled = false;
			let timedOut = false;
			let timeoutHandle: NodeJS.Timeout | undefined;

			// pid is undefined when the spawn itself failed; the "error" event reports that case.
			const killChild = () => {
				if (child.pid) {
					killProcessTree(child.pid);
				}
			};

			const cleanup = () => {
				if (timeoutHandle) clearTimeout(timeoutHandle);
				signal?.removeEventListener("abort", killChild);
			};

			if (timeout !== undefined && timeout > 0) {
				// `timeout` is in seconds.
				timeoutHandle = setTimeout(() => {
					timedOut = true;
					killChild();
				}, timeout * 1000);
			}

			child.stdout?.on("data", onData);
			child.stderr?.on("data", onData);

			// Abort and timeout only kill the child; the promise settles in the "close" handler.
			signal?.addEventListener("abort", killChild, { once: true });

			child.on("error", (error) => {
				if (settled) return;
				settled = true;
				cleanup();
				rejectPromise(error);
			});

			child.on("close", (code) => {
				// "close" can follow "error"; the first event to arrive wins.
				if (settled) return;
				settled = true;
				cleanup();

				if (signal?.aborted) {
					rejectPromise(new Error("aborted"));
					return;
				}

				if (timedOut) {
					rejectPromise(new Error(`timeout:${timeout}`));
					return;
				}

				resolvePromise({ exitCode: code });
			});
		});
	},
};
|
||||
|
||||
/** Optional overrides accepted by `createBrowserTool`. */
export interface BrowserToolOptions {
	/** Process runner; defaults to the spawn-based implementation in this module. */
	operations?: BrowserOperations;
	/** Executable to invoke; defaults to `DEFAULT_BROWSER_COMMAND`. */
	command?: string;
	/** Timeout for each command, in seconds; defaults to `DEFAULT_BROWSER_TIMEOUT_SECONDS`. */
	defaultTimeoutSeconds?: number;
	/** Browser profile directory, resolved against cwd; defaults to `<agentDir>/browser/profile`. */
	profileDir?: string;
	/** Directory for saved state files, resolved against cwd; defaults to `<agentDir>/browser/states`. */
	stateDir?: string;
	/** Base agent directory for the defaults above; falls back to `getAgentDir()`. */
	agentDir?: string;
}
|
||||
|
||||
/** Everything `buildBrowserCommand` derives from one tool input. */
interface BrowserCommandContext {
	/** Action being executed. */
	action: BrowserToolAction;
	/** Argument vector for the browser command, starting with the `--profile` pair. */
	args: string[];
	/** Progress text emitted before the command runs. */
	statusMessage: string;
	/** Result text used when the command succeeds without printing anything. */
	successMessage: string;
	/** Resolved browser profile directory. */
	profilePath: string;
	/** Resolved screenshot target; only set for "screenshot". */
	screenshotPath?: string;
	/** Resolved state file; only set for "state_save" and "state_load". */
	statePath?: string;
}
|
||||
|
||||
/**
 * Resolve `inputPath` against `cwd`. Absolute inputs come back normalized and
 * otherwise unchanged, because `path.resolve` stops at the first absolute segment.
 */
function resolveCommandPath(cwd: string, inputPath: string): string {
	const absolutePath = resolve(cwd, inputPath);
	return absolutePath;
}
|
||||
|
||||
/**
 * Directory that holds all browser tool data: `<agentDir>/browser`, where
 * `agentDir` is taken from the options or, failing that, from `getAgentDir()`.
 */
function getBrowserRootDir(options?: BrowserToolOptions): string {
	const configuredAgentDir = options?.agentDir;
	return join(configuredAgentDir ?? getAgentDir(), "browser");
}
|
||||
|
||||
/**
 * Absolute path of the browser profile directory: the `profileDir` option when
 * given, otherwise `profile` under the browser root, resolved against `cwd`.
 */
function getBrowserProfilePath(cwd: string, options?: BrowserToolOptions): string {
	const configured = options?.profileDir;
	const candidate = configured ?? join(getBrowserRootDir(options), "profile");
	return resolveCommandPath(cwd, candidate);
}
|
||||
|
||||
/**
 * Absolute path of the directory for saved browser states: the `stateDir`
 * option when given, otherwise `states` under the browser root, resolved
 * against `cwd`.
 */
function getBrowserStateDir(cwd: string, options?: BrowserToolOptions): string {
	const configured = options?.stateDir;
	const candidate = configured ?? join(getBrowserRootDir(options), "states");
	return resolveCommandPath(cwd, candidate);
}
|
||||
|
||||
/**
 * Build a unique PNG path in the OS temp directory, used when a screenshot is
 * requested without an explicit output path. The file itself is not created.
 */
function createTempScreenshotPath(): string {
	const uniqueSuffix = randomBytes(8).toString("hex");
	const fileName = `pi-browser-screenshot-${uniqueSuffix}.png`;
	return join(tmpdir(), fileName);
}
|
||||
|
||||
/**
 * Join the captured output chunks, decode them as UTF-8, pass the text through
 * `sanitizeBinaryOutput`, and trim surrounding whitespace. Concatenating before
 * decoding keeps multi-byte characters that straddle two chunks intact.
 */
function normalizeOutput(chunks: Buffer[]): string {
	const combined = Buffer.concat(chunks);
	const decoded = combined.toString("utf-8");
	return sanitizeBinaryOutput(decoded).trim();
}
|
||||
|
||||
/**
 * Turn a caller-supplied state name into a safe file stem.
 *
 * Surrounding whitespace and one trailing ".json" are removed, every run of
 * characters outside `[a-zA-Z0-9._-]` collapses to a single "-", and leading
 * or trailing dashes are stripped.
 *
 * @throws Error when the name is blank, or when nothing usable is left after sanitizing.
 */
function sanitizeStateName(stateName: string): string {
	const jsonSuffix = ".json";
	const name = stateName.trim();
	if (name === "") {
		throw new Error("stateName is required for browser state actions");
	}

	let stem = name;
	if (stem.endsWith(jsonSuffix)) {
		stem = stem.substring(0, stem.length - jsonSuffix.length);
	}

	// Splitting on disallowed runs and re-joining with "-" replaces each run with one dash.
	const dashed = stem.split(/[^a-zA-Z0-9._-]+/).join("-");
	const cleaned = dashed.replace(/^-+/, "").replace(/-+$/, "");
	if (cleaned === "") {
		throw new Error(`Invalid browser state name: "${stateName}"`);
	}

	return cleaned;
}
|
||||
|
||||
/** Create the profile and state directories, including missing parents; existing directories are left alone. */
function ensureBrowserDirs(profilePath: string, stateDir: string): void {
	for (const directory of [profilePath, stateDir]) {
		mkdirSync(directory, { recursive: true });
	}
}
|
||||
|
||||
/**
 * Build the argument list and status text for a `wait` action.
 *
 * Exactly one of `ref`, `url`, `text`, `ms` or `loadState` must be supplied.
 * The target is chosen by which field is present, not by truthiness, so an
 * empty string can never slip past the count check and fall through to the
 * load-state branch with an undefined argument.
 *
 * @throws Error when zero or several targets are supplied, or when the chosen
 *   string target is empty.
 */
function buildWaitArgs(input: BrowserToolInput): { args: string[]; status: string } {
	const targets = [
		input.ref !== undefined ? "ref" : undefined,
		input.url !== undefined ? "url" : undefined,
		input.text !== undefined ? "text" : undefined,
		input.ms !== undefined ? "ms" : undefined,
		input.loadState !== undefined ? "loadState" : undefined,
	].filter((target): target is string => target !== undefined);

	if (targets.length !== 1) {
		throw new Error("browser wait requires exactly one of ref, url, text, ms, or loadState");
	}

	// An empty string target is meaningless and would produce a malformed command line.
	const requireNonEmpty = (name: string, value: string): string => {
		if (value.length === 0) {
			throw new Error(`browser wait requires a non-empty ${name}`);
		}
		return value;
	};

	if (input.ref !== undefined) {
		const ref = requireNonEmpty("ref", input.ref);
		return { args: ["wait", ref], status: `Waiting for ${ref}...` };
	}
	if (input.url !== undefined) {
		const url = requireNonEmpty("url", input.url);
		return {
			args: ["wait", "--url", url],
			status: `Waiting for URL ${url}...`,
		};
	}
	if (input.text !== undefined) {
		const text = requireNonEmpty("text", input.text);
		return {
			args: ["wait", "--text", text],
			status: `Waiting for text "${text}"...`,
		};
	}
	if (input.ms !== undefined) {
		// 0 is a valid delay, hence the explicit undefined check.
		return {
			args: ["wait", String(input.ms)],
			status: `Waiting ${input.ms}ms...`,
		};
	}
	if (input.loadState !== undefined) {
		return {
			args: ["wait", "--load", input.loadState],
			status: `Waiting for load state ${input.loadState}...`,
		};
	}

	// Unreachable: the count check above guarantees one branch was taken.
	throw new Error("browser wait requires exactly one of ref, url, text, ms, or loadState");
}
|
||||
|
||||
/**
 * Translate a tool input into the browser command's argument vector, together
 * with the status/success messages and the paths the tool reports.
 *
 * The profile and state directories are created first. Every argument vector
 * starts with `--profile <profilePath>`.
 *
 * @throws Error when a field required by the action is missing, when a wait
 *   target is invalid, or when a state to load does not exist on disk.
 */
function buildBrowserCommand(
	cwd: string,
	input: BrowserToolInput,
	options?: BrowserToolOptions,
): BrowserCommandContext {
	const profilePath = getBrowserProfilePath(cwd, options);
	const stateDir = getBrowserStateDir(cwd, options);
	ensureBrowserDirs(profilePath, stateDir);

	const { action } = input;
	// Prefix shared by every invocation.
	const withProfile = (...commandArgs: string[]): string[] => ["--profile", profilePath, ...commandArgs];

	if (action === "open") {
		const { url } = input;
		if (!url) {
			throw new Error("browser open requires url");
		}
		return {
			action,
			args: withProfile("open", url),
			statusMessage: `Opening ${url}...`,
			successMessage: `Opened ${url}`,
			profilePath,
		};
	}

	if (action === "snapshot") {
		const snapshotArgs = withProfile("snapshot");
		// Interactive is the default; "full" omits the -i flag.
		if ((input.mode ?? "interactive") === "interactive") {
			snapshotArgs.push("-i");
		}
		return {
			action,
			args: snapshotArgs,
			statusMessage: "Capturing browser snapshot...",
			successMessage: "Captured browser snapshot",
			profilePath,
		};
	}

	if (action === "click") {
		const { ref } = input;
		if (!ref) {
			throw new Error("browser click requires ref");
		}
		return {
			action,
			args: withProfile("click", ref),
			statusMessage: `Clicking ${ref}...`,
			successMessage: `Clicked ${ref}`,
			profilePath,
		};
	}

	if (action === "fill") {
		const { ref, value } = input;
		// An empty string is a legitimate value (clearing a field), so only undefined is rejected.
		if (!ref || value === undefined) {
			throw new Error("browser fill requires ref and value");
		}
		return {
			action,
			args: withProfile("fill", ref, value),
			statusMessage: `Filling ${ref}...`,
			successMessage: `Filled ${ref}`,
			profilePath,
		};
	}

	if (action === "wait") {
		const { args: waitArgs, status } = buildWaitArgs(input);
		return {
			action,
			args: withProfile(...waitArgs),
			statusMessage: status,
			successMessage: "Browser wait condition satisfied",
			profilePath,
		};
	}

	if (action === "screenshot") {
		// Without an explicit path the screenshot goes to a unique temp file.
		const screenshotPath = input.path ? resolveCommandPath(cwd, input.path) : createTempScreenshotPath();
		const screenshotArgs = input.fullPage
			? withProfile("screenshot", "--full", screenshotPath)
			: withProfile("screenshot", screenshotPath);
		return {
			action,
			args: screenshotArgs,
			statusMessage: "Taking browser screenshot...",
			successMessage: `Saved browser screenshot to ${screenshotPath}`,
			profilePath,
			screenshotPath,
		};
	}

	if (action === "state_save") {
		const { stateName } = input;
		if (!stateName) {
			throw new Error("browser state_save requires stateName");
		}
		const statePath = join(stateDir, `${sanitizeStateName(stateName)}.json`);
		return {
			action,
			args: withProfile("state", "save", statePath),
			statusMessage: `Saving browser state "${stateName}"...`,
			successMessage: `Saved browser state "${stateName}" to ${statePath}`,
			profilePath,
			statePath,
		};
	}

	if (action === "state_load") {
		const { stateName } = input;
		if (!stateName) {
			throw new Error("browser state_load requires stateName");
		}
		const statePath = join(stateDir, `${sanitizeStateName(stateName)}.json`);
		// Fail early with a clear message instead of letting the browser command report it.
		if (!existsSync(statePath)) {
			throw new Error(`Saved browser state "${stateName}" not found at ${statePath}`);
		}
		return {
			action,
			args: withProfile("state", "load", statePath),
			statusMessage: `Loading browser state "${stateName}"...`,
			successMessage: `Loaded browser state "${stateName}" from ${statePath}`,
			profilePath,
			statePath,
		};
	}

	if (action === "close") {
		return {
			action,
			args: withProfile("close"),
			statusMessage: "Closing browser...",
			successMessage: "Closed browser",
			profilePath,
		};
	}

	// Exhaustiveness guard: fails to compile if a new action is added without a branch.
	const unsupportedAction: never = action;
	throw new Error(`Unsupported browser action: ${unsupportedAction}`);
}
|
||||
|
||||
/**
 * Compose the error text for a failed browser command: the captured output
 * (if any), a blank line, then a summary naming the action and exit code.
 * A null exit code produces a generic "failed" summary.
 */
function buildBrowserErrorMessage(action: BrowserToolAction, output: string, exitCode: number | null): string {
	let summary: string;
	if (exitCode === null) {
		summary = `Browser action "${action}" failed`;
	} else {
		summary = `Browser action "${action}" exited with code ${exitCode}`;
	}

	if (output.length === 0) {
		return summary;
	}
	return [output, summary].join("\n\n");
}
|
||||
|
||||
/** Multi-line installation hint shown when the browser executable cannot be found. */
function getMissingBrowserCommandMessage(command: string): string {
	const problem = `Browser tool could not find "${command}".`;
	const setupSteps = [" npm install -g agent-browser", " agent-browser install"];

	const lines = [
		problem,
		"Install agent-browser so the first-class browser tool can run.",
		"Recommended setup:",
		...setupSteps,
		"If Chromium lives at a custom path, set AGENT_BROWSER_EXECUTABLE_PATH.",
	];
	return lines.join("\n");
}
|
||||
|
||||
/**
 * Create the browser tool bound to `cwd`.
 *
 * Each call builds a command line from the input, reports a status update,
 * runs the command through the configured `BrowserOperations`, and returns the
 * command output (or a canned success message when the command printed nothing).
 *
 * @param cwd Working directory for the command and base for relative paths.
 * @param options Optional overrides for the runner, executable, timeout and directories.
 * @throws Error from `execute` when the input is invalid, the executable is
 *   missing, the call is aborted or times out, the command exits non-zero, or
 *   a snapshot produces no output.
 */
export function createBrowserTool(cwd: string, options?: BrowserToolOptions): AgentTool<typeof browserSchema> {
	const operations = options?.operations ?? defaultBrowserOperations;
	const command = options?.command ?? DEFAULT_BROWSER_COMMAND;
	const defaultTimeoutSeconds = options?.defaultTimeoutSeconds ?? DEFAULT_BROWSER_TIMEOUT_SECONDS;

	return {
		name: "browser",
		label: "browser",
		description:
			"Use a persistent browser for websites: open pages, inspect them with snapshot, click or fill elements, wait for changes, take screenshots, and save or load named browser state.",
		parameters: browserSchema,
		execute: async (_toolCallId, input, signal, onUpdate) => {
			const commandContext = buildBrowserCommand(cwd, input, options);
			const details: BrowserToolDetails = {
				action: commandContext.action,
				command,
				args: commandContext.args,
				profilePath: commandContext.profilePath,
				screenshotPath: commandContext.screenshotPath,
				statePath: commandContext.statePath,
			};

			onUpdate?.({
				content: [{ type: "text", text: commandContext.statusMessage }],
				details,
			});

			const chunks: Buffer[] = [];
			let exitCode: number | null;

			// Only the exec call is wrapped: the sentinel messages below are a contract
			// with BrowserOperations. Errors raised further down embed command output
			// and must never be mistaken for an "aborted" / "timeout:" sentinel.
			try {
				const result = await operations.exec(command, commandContext.args, {
					cwd,
					env: getShellEnv(),
					onData: (data) => chunks.push(data),
					signal,
					timeout: defaultTimeoutSeconds,
				});
				exitCode = result.exitCode;
			} catch (error) {
				if (error instanceof Error) {
					if ("code" in error && error.code === "ENOENT") {
						throw new Error(getMissingBrowserCommandMessage(command));
					}
					if (error.message === "aborted") {
						throw new Error(`Browser action "${commandContext.action}" aborted`);
					}
					if (error.message.startsWith("timeout:")) {
						const seconds = error.message.split(":")[1];
						throw new Error(`Browser action "${commandContext.action}" timed out after ${seconds} seconds`);
					}
				}
				throw error;
			}

			const output = normalizeOutput(chunks);
			// NOTE(review): a null exit code is deliberately not treated as a failure here,
			// matching the original behavior — confirm that is intended for signal-killed processes.
			if (exitCode !== 0 && exitCode !== null) {
				throw new Error(buildBrowserErrorMessage(commandContext.action, output, exitCode));
			}

			if (commandContext.action === "snapshot") {
				// A snapshot without content is useless to the caller, so report it as an error.
				if (output.length === 0) {
					throw new Error("Browser snapshot returned no output");
				}
				return {
					content: [{ type: "text", text: output }],
					details,
				};
			}

			const text = output.length > 0 ? output : commandContext.successMessage;
			return {
				content: [{ type: "text", text }],
				details,
			};
		},
	};
}

/** Pre-built browser tool bound to the process's current working directory. */
export const browserTool = createBrowserTool(process.cwd());
|
||||
|
|
@ -8,6 +8,17 @@ export {
|
|||
bashTool,
|
||||
createBashTool,
|
||||
} from "./bash.js";
|
||||
export {
|
||||
type BrowserLoadState,
|
||||
type BrowserOperations,
|
||||
type BrowserSnapshotMode,
|
||||
type BrowserToolAction,
|
||||
type BrowserToolDetails,
|
||||
type BrowserToolInput,
|
||||
type BrowserToolOptions,
|
||||
browserTool,
|
||||
createBrowserTool,
|
||||
} from "./browser.js";
|
||||
export {
|
||||
createEditTool,
|
||||
type EditOperations,
|
||||
|
|
@ -68,6 +79,11 @@ export {
|
|||
|
||||
import type { AgentTool } from "@mariozechner/pi-agent-core";
|
||||
import { type BashToolOptions, bashTool, createBashTool } from "./bash.js";
|
||||
import {
|
||||
browserTool,
|
||||
createBrowserTool,
|
||||
type BrowserToolOptions,
|
||||
} from "./browser.js";
|
||||
import { createEditTool, editTool } from "./edit.js";
|
||||
import { createFindTool, findTool } from "./find.js";
|
||||
import { createGrepTool, grepTool } from "./grep.js";
|
||||
|
|
@ -78,9 +94,6 @@ import { createWriteTool, writeTool } from "./write.js";
|
|||
/** Tool type (AgentTool from pi-ai) */
|
||||
export type Tool = AgentTool<any>;
|
||||
|
||||
// Default tools for full access mode (using process.cwd())
|
||||
export const codingTools: Tool[] = [readTool, bashTool, editTool, writeTool];
|
||||
|
||||
// Read-only tools for exploration without modification (using process.cwd())
|
||||
export const readOnlyTools: Tool[] = [readTool, grepTool, findTool, lsTool];
|
||||
|
||||
|
|
@ -88,6 +101,7 @@ export const readOnlyTools: Tool[] = [readTool, grepTool, findTool, lsTool];
|
|||
export const allTools = {
|
||||
read: readTool,
|
||||
bash: bashTool,
|
||||
browser: browserTool,
|
||||
edit: editTool,
|
||||
write: writeTool,
|
||||
grep: grepTool,
|
||||
|
|
@ -97,11 +111,26 @@ export const allTools = {
|
|||
|
||||
export type ToolName = keyof typeof allTools;
|
||||
|
||||
export const defaultCodingToolNames: ToolName[] = [
|
||||
"read",
|
||||
"bash",
|
||||
"browser",
|
||||
"edit",
|
||||
"write",
|
||||
];
|
||||
|
||||
// Default tools for full access mode (using process.cwd())
|
||||
export const codingTools: Tool[] = defaultCodingToolNames.map(
|
||||
(toolName) => allTools[toolName],
|
||||
);
|
||||
|
||||
export interface ToolsOptions {
|
||||
/** Options for the read tool */
|
||||
read?: ReadToolOptions;
|
||||
/** Options for the bash tool */
|
||||
bash?: BashToolOptions;
|
||||
/** Options for the browser tool */
|
||||
browser?: BrowserToolOptions;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -111,6 +140,7 @@ export function createCodingTools(cwd: string, options?: ToolsOptions): Tool[] {
|
|||
return [
|
||||
createReadTool(cwd, options?.read),
|
||||
createBashTool(cwd, options?.bash),
|
||||
createBrowserTool(cwd, options?.browser),
|
||||
createEditTool(cwd),
|
||||
createWriteTool(cwd),
|
||||
];
|
||||
|
|
@ -141,6 +171,7 @@ export function createAllTools(
|
|||
return {
|
||||
read: createReadTool(cwd, options?.read),
|
||||
bash: createBashTool(cwd, options?.bash),
|
||||
browser: createBrowserTool(cwd, options?.browser),
|
||||
edit: createEditTool(cwd),
|
||||
write: createWriteTool(cwd),
|
||||
grep: createGrepTool(cwd),
|
||||
|
|
|
|||
|
|
@ -181,6 +181,7 @@ export {
|
|||
// Factory
|
||||
createAgentSession,
|
||||
createBashTool,
|
||||
createBrowserTool,
|
||||
// Tool factories (for custom cwd)
|
||||
createCodingTools,
|
||||
createEditTool,
|
||||
|
|
@ -244,7 +245,16 @@ export {
|
|||
type BashToolInput,
|
||||
type BashToolOptions,
|
||||
bashTool,
|
||||
type BrowserLoadState,
|
||||
type BrowserOperations,
|
||||
type BrowserSnapshotMode,
|
||||
type BrowserToolAction,
|
||||
type BrowserToolDetails,
|
||||
type BrowserToolInput,
|
||||
type BrowserToolOptions,
|
||||
browserTool,
|
||||
codingTools,
|
||||
defaultCodingToolNames,
|
||||
DEFAULT_MAX_BYTES,
|
||||
DEFAULT_MAX_LINES,
|
||||
type EditOperations,
|
||||
|
|
|
|||
212
packages/coding-agent/test/browser-tool.test.ts
Normal file
212
packages/coding-agent/test/browser-tool.test.ts
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
import { mkdtempSync, rmSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { parseArgs } from "../src/cli/args.js";
|
||||
import { buildSystemPrompt } from "../src/core/system-prompt.js";
|
||||
import {
|
||||
type BrowserOperations,
|
||||
type BrowserToolDetails,
|
||||
createAllTools,
|
||||
createBrowserTool,
|
||||
defaultCodingToolNames,
|
||||
} from "../src/core/tools/index.js";
|
||||
|
||||
/** Text content block as returned by a tool. */
interface TextBlock {
	type: "text";
	text: string;
}

/** Any content block; only text blocks are inspected by these tests. */
type ToolContentBlock = TextBlock | { type: string };

/** Minimal view of a tool result used by the assertions below. */
interface ToolResultLike {
	content: ToolContentBlock[];
	details?: unknown;
}

/** One recorded invocation of the mocked `exec`. */
interface BrowserExecCall {
	command: string;
	args: string[];
	cwd: string;
	env: NodeJS.ProcessEnv;
	timeout?: number;
}
|
||||
|
||||
/** Join the text of every "text" block in a tool result with newlines, ignoring other block types. */
function getTextOutput(result: ToolResultLike): string {
	const isTextBlock = (block: ToolContentBlock): block is TextBlock => block.type === "text";

	const texts: string[] = [];
	for (const block of result.content) {
		if (isTextBlock(block)) {
			texts.push(block.text);
		}
	}
	return texts.join("\n");
}
|
||||
|
||||
/**
 * Build a fake `BrowserOperations` that records every call instead of spawning
 * a process. When `output` is non-empty it is delivered once through `onData`;
 * the call then resolves with `exitCode`.
 */
function createMockBrowserOperations(
	output = "",
	exitCode = 0,
): {
	calls: BrowserExecCall[];
	operations: BrowserOperations;
} {
	const calls: BrowserExecCall[] = [];

	const exec: BrowserOperations["exec"] = async (command, args, options) => {
		const { cwd, env, timeout } = options;
		calls.push({ command, args, cwd, env, timeout });

		if (output.length > 0) {
			options.onData(Buffer.from(output, "utf-8"));
		}
		return { exitCode };
	};

	return { calls, operations: { exec } };
}
|
||||
|
||||
// Suite for the built-in browser tool: argument construction for the
// agent-browser CLI, input validation, state-file naming, and the CLI /
// system-prompt wiring. All process execution goes through mock operations.
describe("browser tool", () => {
  // Temp directories created by the current test; drained in afterEach.
  const tempDirs: string[] = [];

  afterEach(() => {
    while (tempDirs.length > 0) {
      const tempDir = tempDirs.pop();
      if (tempDir) {
        // force: true so a directory that is already gone does not fail cleanup.
        rmSync(tempDir, { recursive: true, force: true });
      }
    }
  });

  /** Creates a unique temp directory with the given prefix and registers it for cleanup. */
  function createTempDir(prefix: string): string {
    const tempDir = mkdtempSync(join(tmpdir(), prefix));
    tempDirs.push(tempDir);
    return tempDir;
  }

  it("opens pages through agent-browser with a shared profile", async () => {
    const cwd = createTempDir("coding-agent-browser-open-");
    const profileDir = join(cwd, "profile");
    const stateDir = join(cwd, "states");
    const { calls, operations } = createMockBrowserOperations();

    const browserTool = createBrowserTool(cwd, {
      operations,
      command: "agent-browser-test",
      profileDir,
      stateDir,
    });

    const result = (await browserTool.execute("browser-open", {
      action: "open",
      url: "https://example.com",
    })) as ToolResultLike;

    // Exactly one exec, using the overridden command and the configured profile.
    expect(calls).toHaveLength(1);
    expect(calls[0]).toMatchObject({
      command: "agent-browser-test",
      args: ["--profile", profileDir, "open", "https://example.com"],
      cwd,
      timeout: 90,
    });
    // The mock emits no output, so this text must come from the tool itself.
    expect(getTextOutput(result)).toBe("Opened https://example.com");

    const details = result.details as BrowserToolDetails | undefined;
    expect(details?.profilePath).toBe(profileDir);
  });

  it("uses interactive snapshots by default and returns snapshot text", async () => {
    const cwd = createTempDir("coding-agent-browser-snapshot-");
    const profileDir = join(cwd, "profile");
    const stateDir = join(cwd, "states");
    const { calls, operations } = createMockBrowserOperations("main [ref=@e1]\nbutton [ref=@e2] Sign in");

    const browserTool = createBrowserTool(cwd, {
      operations,
      profileDir,
      stateDir,
    });

    const result = (await browserTool.execute("browser-snapshot", {
      action: "snapshot",
    })) as ToolResultLike;

    // No snapshot options were passed, yet "-i" is expected in the arguments.
    expect(calls[0]?.args).toEqual(["--profile", profileDir, "snapshot", "-i"]);
    expect(getTextOutput(result)).toContain("button [ref=@e2] Sign in");
  });

  it("validates wait targets before spawning agent-browser", async () => {
    const cwd = createTempDir("coding-agent-browser-wait-");
    const profileDir = join(cwd, "profile");
    const stateDir = join(cwd, "states");
    const { calls, operations } = createMockBrowserOperations();

    const browserTool = createBrowserTool(cwd, {
      operations,
      profileDir,
      stateDir,
    });

    // No wait target at all.
    await expect(
      browserTool.execute("browser-wait-missing", {
        action: "wait",
      }),
    ).rejects.toThrow("browser wait requires exactly one of ref, url, text, ms, or loadState");

    // Two wait targets at once.
    await expect(
      browserTool.execute("browser-wait-ambiguous", {
        action: "wait",
        ref: "@e2",
        text: "Done",
      }),
    ).rejects.toThrow("browser wait requires exactly one of ref, url, text, ms, or loadState");

    // Both rejections must happen without any exec call.
    expect(calls).toHaveLength(0);
  });

  it("stores named state under the managed browser state directory", async () => {
    const cwd = createTempDir("coding-agent-browser-state-");
    const profileDir = join(cwd, "profile");
    const stateDir = join(cwd, "states");
    const { calls, operations } = createMockBrowserOperations();

    const browserTool = createBrowserTool(cwd, {
      operations,
      profileDir,
      stateDir,
    });

    const result = (await browserTool.execute("browser-state-save", {
      action: "state_save",
      stateName: "my session/prod",
    })) as ToolResultLike;

    // The space and slash in the state name are expected to become dashes,
    // giving a single .json file directly inside stateDir.
    const expectedStatePath = join(stateDir, "my-session-prod.json");
    expect(calls[0]?.args).toEqual(["--profile", profileDir, "state", "save", expectedStatePath]);

    const details = result.details as BrowserToolDetails | undefined;
    expect(details?.statePath).toBe(expectedStatePath);
    expect(getTextOutput(result)).toContain(expectedStatePath);
  });

  it("accepts browser in --tools and exposes it in default tool wiring", () => {
    const parsed = parseArgs(["--tools", "browser,read"]);
    expect(parsed.tools).toEqual(["browser", "read"]);

    expect(defaultCodingToolNames).toContain("browser");
    expect(createAllTools(process.cwd()).browser.name).toBe("browser");
  });

  it("mentions browser in the default system prompt", () => {
    const prompt = buildSystemPrompt();

    expect(prompt).toContain(
      "- browser: Open websites, inspect pages with snapshot, click/fill/wait, take screenshots, and save/load browser state",
    );
    expect(prompt).toContain(
      "Use browser for website tasks. Open the page, use snapshot to inspect interactive elements, then click, fill, wait, or screenshot as needed",
    );
  });
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue