feat: add built-in browser tool

Add a first-class browser tool backed by agent-browser and enable it by default in coding sessions.

Include CLI and system-prompt wiring plus focused coverage for the new tool.

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Harivansh Rathi 2026-03-08 12:50:02 -07:00
parent df702d95a3
commit 0720c47495
8 changed files with 797 additions and 15 deletions

View file

@ -0,0 +1,212 @@
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import { parseArgs } from "../src/cli/args.js";
import { buildSystemPrompt } from "../src/core/system-prompt.js";
import {
type BrowserOperations,
type BrowserToolDetails,
createAllTools,
createBrowserTool,
defaultCodingToolNames,
} from "../src/core/tools/index.js";
/** A tool-result content block that carries plain text. */
interface TextBlock {
/** Literal tag that `getTextOutput` checks to pick out text blocks. */
type: "text";
/** The text payload of the block. */
text: string;
}
/**
 * Any content block in a tool result: either a text block or some other
 * block that is only known by its `type` tag.
 */
type ToolContentBlock = TextBlock | { type: string };
/**
 * Minimal structural view of a tool result used by these tests; values
 * returned from `browserTool.execute(...)` are cast to this shape.
 */
interface ToolResultLike {
/** Content blocks of the result, in order. */
content: ToolContentBlock[];
/** Optional tool-specific payload; tests cast it to `BrowserToolDetails` before reading. */
details?: unknown;
}
/** Record of one `exec` invocation captured by the mock browser operations. */
interface BrowserExecCall {
/** Executable name passed to `exec`. */
command: string;
/** Argument list passed to `exec`. */
args: string[];
/** Working directory taken from the exec options. */
cwd: string;
/** Environment taken from the exec options. */
env: NodeJS.ProcessEnv;
/** Timeout taken from the exec options, when one was supplied. */
timeout?: number;
}
/**
 * Collects the text of every `"text"` content block in a tool result.
 *
 * @param result - Tool result whose content blocks are inspected.
 * @returns The text of all text blocks, in order, joined with newlines;
 *   an empty string when there are none.
 */
function getTextOutput(result: ToolResultLike): string {
  // Narrow purely on the `type` tag, as the result union cannot be discriminated structurally.
  const isTextBlock = (block: ToolResultLike["content"][number]): block is TextBlock =>
    block.type === "text";

  const pieces: string[] = [];
  for (const block of result.content) {
    if (isTextBlock(block)) {
      pieces.push(block.text);
    }
  }
  return pieces.join("\n");
}
/**
 * Builds a fake `BrowserOperations` implementation that records every
 * `exec` invocation instead of spawning a process.
 *
 * @param output - Text delivered once through `options.onData` (as a UTF-8
 *   buffer) on each call; nothing is delivered when it is empty.
 * @param exitCode - Exit code every `exec` call resolves with.
 * @returns The recorded `calls` array (appended to on each invocation) and
 *   the mock `operations` object to hand to the tool under test.
 */
function createMockBrowserOperations(
  output = "",
  exitCode = 0,
): {
  calls: BrowserExecCall[];
  operations: BrowserOperations;
} {
  const calls: BrowserExecCall[] = [];

  const exec: BrowserOperations["exec"] = async (command, args, options) => {
    const { cwd, env, timeout } = options;
    calls.push({ command, args, cwd, env, timeout });

    // Only emit data when there is something to emit, mirroring a silent process.
    if (output !== "") {
      options.onData(Buffer.from(output, "utf-8"));
    }

    return { exitCode };
  };

  return { calls, operations: { exec } };
}
describe("browser tool", () => {
  // Temp directories created by the running test; emptied again in afterEach.
  const tempDirs: string[] = [];

  afterEach(() => {
    while (tempDirs.length > 0) {
      const dir = tempDirs.pop();
      if (!dir) {
        continue;
      }
      rmSync(dir, { recursive: true, force: true });
    }
  });

  /** Creates a unique temp directory and registers it for cleanup after the test. */
  function createTempDir(prefix: string): string {
    const dir = mkdtempSync(join(tmpdir(), prefix));
    tempDirs.push(dir);
    return dir;
  }

  /**
   * Shared fixture: a browser tool rooted in a fresh temp directory and wired
   * to mock operations, with profile/state directories placed under that root.
   *
   * @param prefix - Temp directory name prefix.
   * @param setup - Optional mock output and an explicit command override; the
   *   `command` option is only passed to the tool when one is given.
   */
  function setupBrowserTool(prefix: string, setup: { output?: string; command?: string } = {}) {
    const cwd = createTempDir(prefix);
    const profileDir = join(cwd, "profile");
    const stateDir = join(cwd, "states");
    const { calls, operations } = createMockBrowserOperations(setup.output);
    const browserTool = createBrowserTool(cwd, {
      operations,
      ...(setup.command === undefined ? {} : { command: setup.command }),
      profileDir,
      stateDir,
    });
    return { cwd, profileDir, stateDir, calls, browserTool };
  }

  it("opens pages through agent-browser with a shared profile", async () => {
    const { cwd, profileDir, calls, browserTool } = setupBrowserTool("coding-agent-browser-open-", {
      command: "agent-browser-test",
    });

    const result = (await browserTool.execute("browser-open", {
      action: "open",
      url: "https://example.com",
    })) as ToolResultLike;

    expect(calls).toHaveLength(1);
    const [firstCall] = calls;
    expect(firstCall).toMatchObject({
      command: "agent-browser-test",
      args: ["--profile", profileDir, "open", "https://example.com"],
      cwd,
      timeout: 90,
    });
    expect(getTextOutput(result)).toBe("Opened https://example.com");
    expect((result.details as BrowserToolDetails | undefined)?.profilePath).toBe(profileDir);
  });

  it("uses interactive snapshots by default and returns snapshot text", async () => {
    const { profileDir, calls, browserTool } = setupBrowserTool("coding-agent-browser-snapshot-", {
      output: "main [ref=@e1]\nbutton [ref=@e2] Sign in",
    });

    const result = (await browserTool.execute("browser-snapshot", {
      action: "snapshot",
    })) as ToolResultLike;

    expect(calls[0]?.args).toEqual(["--profile", profileDir, "snapshot", "-i"]);
    expect(getTextOutput(result)).toContain("button [ref=@e2] Sign in");
  });

  it("validates wait targets before spawning agent-browser", async () => {
    const { calls, browserTool } = setupBrowserTool("coding-agent-browser-wait-");
    const waitTargetError = "browser wait requires exactly one of ref, url, text, ms, or loadState";

    // No target at all.
    await expect(
      browserTool.execute("browser-wait-missing", {
        action: "wait",
      }),
    ).rejects.toThrow(waitTargetError);

    // Two targets at once.
    await expect(
      browserTool.execute("browser-wait-ambiguous", {
        action: "wait",
        ref: "@e2",
        text: "Done",
      }),
    ).rejects.toThrow(waitTargetError);

    // Both rejections must happen before anything is executed.
    expect(calls).toHaveLength(0);
  });

  it("stores named state under the managed browser state directory", async () => {
    const { profileDir, stateDir, calls, browserTool } = setupBrowserTool("coding-agent-browser-state-");

    const result = (await browserTool.execute("browser-state-save", {
      action: "state_save",
      stateName: "my session/prod",
    })) as ToolResultLike;

    const expectedStatePath = join(stateDir, "my-session-prod.json");
    expect(calls[0]?.args).toEqual(["--profile", profileDir, "state", "save", expectedStatePath]);
    expect((result.details as BrowserToolDetails | undefined)?.statePath).toBe(expectedStatePath);
    expect(getTextOutput(result)).toContain(expectedStatePath);
  });

  it("accepts browser in --tools and exposes it in default tool wiring", () => {
    const { tools } = parseArgs(["--tools", "browser,read"]);
    expect(tools).toEqual(["browser", "read"]);
    expect(defaultCodingToolNames).toContain("browser");

    const allTools = createAllTools(process.cwd());
    expect(allTools.browser.name).toBe("browser");
  });

  it("mentions browser in the default system prompt", () => {
    const prompt = buildSystemPrompt();
    const expectedSnippets = [
      "- browser: Open websites, inspect pages with snapshot, click/fill/wait, take screenshots, and save/load browser state",
      "Use browser for website tasks. Open the page, use snapshot to inspect interactive elements, then click, fill, wait, or screenshot as needed",
    ];
    for (const snippet of expectedSnippets) {
      expect(prompt).toContain(snippet);
    }
  });
});