From a6ba0ecee0e12ecab2e7f542e72c7a7ac6e3a3f6 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 17 Mar 2026 15:21:26 -0700 Subject: [PATCH] feat: [US-033] - Fix default display dimensions to match spec (1280x720) Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 + docs/browser-feature-matrix.mdx | 148 ++ docs/docs.json | 1 + notes/specs/browser-automation-spec.md | 1442 +++++++++++++++++ scripts/ralph/.last-branch | 1 + scripts/ralph/CLAUDE.md | 104 ++ scripts/ralph/prd.json | 164 +- scripts/ralph/progress.txt | 299 ++++ scripts/ralph/ralph.sh | 135 ++ .../sandbox-agent/src/browser_runtime.rs | 4 +- 10 files changed, 2275 insertions(+), 24 deletions(-) create mode 100644 docs/browser-feature-matrix.mdx create mode 100644 notes/specs/browser-automation-spec.md create mode 100644 scripts/ralph/.last-branch create mode 100644 scripts/ralph/CLAUDE.md create mode 100755 scripts/ralph/ralph.sh diff --git a/.gitignore b/.gitignore index de4d863..4805dea 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ Cargo.lock .agents/ .claude/ .opencode/ +.ralph/ # Example temp files .tmp-upload/ diff --git a/docs/browser-feature-matrix.mdx b/docs/browser-feature-matrix.mdx new file mode 100644 index 0000000..f2c7202 --- /dev/null +++ b/docs/browser-feature-matrix.mdx @@ -0,0 +1,148 @@ +--- +title: "Feature Matrix" +description: "Compare Sandbox Agent's capabilities against other sandbox and browser automation providers." +sidebarTitle: "Feature Matrix" +icon: "table-columns" +--- + +A comparison of Sandbox Agent's features against Daytona, E2B, Cloudflare (Browser Rendering), Browserbase, and common agent-browser tools (Steel, Stagehand, Browser Use). 
+ +## Sandbox Lifecycle + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| Create sandbox | ✓ | ✓ | ✓ | ✓ | ✓ | +| Destroy/delete | ✓ | ✓ | ✓ | ✓ | ✓ | +| List sandboxes | ✓ | ✓ | ✓ | - | ✓ | +| Start/stop | ✓ | ✓ | ✓ | ✓ | - | +| Pause/resume | - | - | ✓ | - | - | +| Snapshots/templates | - | ✓ | ✓ | - | ✓ | +| Auto-stop timeout | - | ✓ | ✓ | ✓ | ✓ | +| Region selection | - | - | - | ✓ | ✓ | + +## Filesystem + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| Read file | ✓ | ✓ | ✓ | - | - | +| Write file | ✓ | ✓ | ✓ | - | - | +| List directory (recursive) | ✓ | ✓ | ✓ | - | - | +| Delete file/dir | ✓ | - | ✓ | - | - | +| Move/rename | ✓ | - | ✓ | - | - | +| Mkdir | ✓ | - | ✓ | - | - | +| File stat/metadata | ✓ | - | - | - | - | +| Batch upload (tar) | ✓ | - | - | - | - | +| File watch/events | - | - | ✓ | - | - | + +## Process Management + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| One-shot exec | ✓ | ✓ | ✓ | - | - | +| Background processes | ✓ | - | ✓ | - | - | +| Stream stdout/stderr | ✓ | - | ✓ | - | - | +| Interactive PTY (WebSocket) | ✓ | ✓ | ✓ | - | - | +| Terminal resize | ✓ | ✓ | ✓ | - | - | +| Send stdin | ✓ | - | ✓ | - | - | +| Kill/stop process | ✓ | - | ✓ | - | - | +| List processes | ✓ | - | ✓ | - | - | +| Process config (limits) | ✓ | - | - | - | - | + +## Desktop / Computer-Use + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| Virtual desktop | ✓ | - | ✓ | - | - | +| Screenshot (full) | ✓ | - | ✓ | ✓ | - | +| Screenshot (region) | ✓ | - | - | - | - | +| Mouse (move/click/drag/scroll) | ✓ | - | ✓ | - | - | +| Keyboard (type/press) | ✓ | - | ✓ | - | - | +| Window management | ✓ | - | - | - | - | +| Clipboard read/write | ✓ | - | - | - | - | +| Launch application | ✓ | - | - | 
- | - | +| Display info / DPI config | ✓ | - | - | - | - | +| Desktop recording | ✓ | - | - | - | ✓ | + +## Live Streaming + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| Live desktop stream | ✓ | - | ✓ | - | - | +| Protocol | WebRTC (Neko) | - | VNC | - | CDP screencast | +| Video codecs | VP8, VP9, H.264 | - | - | - | JPEG | +| Audio streaming | ✓ (Opus, G.722) | - | - | - | - | +| Interactive input via stream | ✓ | - | ✓ | - | Limited | +| Configurable FPS (1-60) | ✓ | - | - | - | - | +| Multi-viewer | ✓ | - | - | - | ✓ | +| Typical latency | 50-150ms | - | 100-500ms | - | 200-1000ms | + +## Browser Automation + +| Feature | Sandbox Agent | Cloudflare | Browserbase | Steel | Stagehand | +|---------|:---:|:---:|:---:|:---:|:---:| +| Start/stop browser | ✓ | ✓ | ✓ | ✓ | ✓ | +| CDP WebSocket access | ✓ | ✓ | ✓ | ✓ | - | +| Navigate / back / forward | ✓ | ✓ | ✓ | ✓ | ✓ | +| Tab management | ✓ | - | ✓ | - | - | +| Click / type / scroll (selector) | ✓ | ✓ | ✓ | ✓ | ✓ | +| Screenshot (browser-level) | ✓ | ✓ | ✓ | ✓ | ✓ | +| PDF generation | ✓ | ✓ | ✓ | ✓ | - | +| Get page HTML | ✓ | ✓ | ✓ | - | - | +| Get page as Markdown | ✓ | ✓ | - | - | - | +| Scrape elements (selectors) | ✓ | ✓ | ✓ | - | - | +| Extract all links | ✓ | ✓ | - | - | - | +| Accessibility tree snapshot | ✓ | - | - | - | ✓ | +| Execute JavaScript | ✓ | - | ✓ | ✓ | - | +| Console log capture | ✓ | - | ✓ | - | - | +| Network request capture | ✓ | - | ✓ | - | - | +| Web crawling | ✓ | ✓ | - | - | - | +| Persistent browser profiles | ✓ | - | ✓ | ✓ | - | +| Cookie management | ✓ | - | ✓ | ✓ | - | +| File upload to input | ✓ | - | ✓ | ✓ | - | +| Dialog handling | ✓ | - | ✓ | - | - | +| Live browser streaming | ✓ (WebRTC) | - | ✓ (CDP) | ✓ | - | +| Anti-detection/stealth | - | - | ✓ | ✓ | - | +| Proxy support | - | - | ✓ | ✓ | - | +| CAPTCHA solving | - | - | ✓ | - | - | +| Browser extensions | - | - | ✓ | - | - | + +## Agent Integration + 
+| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| Agent Client Protocol (ACP) | ✓ | - | - | - | - | +| MCP server config | ✓ | - | - | - | - | +| Skills config | ✓ | - | - | - | - | +| Agent install/management | ✓ | - | - | - | - | +| Session persistence | ✓ | - | - | - | - | +| Permission system | ✓ | - | - | - | - | +| Code interpreter | - | - | ✓ | - | - | + +## SDKs and Tooling + +| Feature | Sandbox Agent | Daytona | E2B | Cloudflare | Browserbase | +|---------|:---:|:---:|:---:|:---:|:---:| +| TypeScript SDK | ✓ | ✓ | ✓ | ✓ | ✓ | +| Python SDK | - | ✓ | ✓ | - | ✓ | +| React components | ✓ | - | - | - | - | +| Inspector UI | ✓ | - | - | - | - | +| Provider abstraction (7+) | ✓ | - | - | - | - | +| WebRTC client library | ✓ | - | - | - | - | +| CLI | ✓ | ✓ | ✓ | ✓ | - | + +## Streaming Technology Comparison + +For platforms that support live desktop/browser streaming, here is how the underlying technologies compare: + +| Dimension | WebRTC (Neko) | VNC (noVNC) | CDP Screencast | WebSocket + JPEG | +|-----------|:---:|:---:|:---:|:---:| +| Typical latency | 50-150ms | 100-500ms | 200-1000ms | 150-400ms | +| Frame rate | 30-60 fps | 10-30 fps | 1-15 fps | 5-20 fps | +| Video quality | High | Medium | Low-Medium | Medium | +| Audio support | Yes | No | No | No | +| Interactive input | Full | Full | Limited | Limited | +| Bandwidth (adaptive) | Yes | No | No | No | +| Used by | Sandbox Agent | E2B, Gitpod | Browserbase | Various | + +Sandbox Agent uses [Neko](https://github.com/m1k1o/neko) (WebRTC) for streaming, which provides the lowest latency and best interactivity of any approach. The same stream serves both the full desktop and browser automation modes. 
+ diff --git a/docs/docs.json b/docs/docs.json index 0c2b19a..c9ebb0f 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -103,6 +103,7 @@ { "group": "More", "pages": [ + "browser-feature-matrix", "daemon", "cors", "session-restoration", diff --git a/notes/specs/browser-automation-spec.md b/notes/specs/browser-automation-spec.md new file mode 100644 index 0000000..d9b0f20 --- /dev/null +++ b/notes/specs/browser-automation-spec.md @@ -0,0 +1,1442 @@ +# Browser Automation Spec + +Implementation-ready specification for browser automation in Sandbox Agent. Covers the HTTP API, CLI install command, TypeScript SDK surface, inspector UI tab, and Rust module structure. + +## Table of Contents + +1. [Architecture Overview](#1-architecture-overview) +2. [CLI: `install browser`](#2-cli-install-browser) +3. [HTTP API: `/v1/browser/*`](#3-http-api-v1browser) +4. [Rust Module Structure](#4-rust-module-structure) +5. [TypeScript SDK](#5-typescript-sdk) +6. [Inspector UI: Browser Tab](#6-inspector-ui-browser-tab) +7. [React Component: `BrowserViewer`](#7-react-component-browserviewer) +8. [Desktop Integration](#8-desktop-integration) +9. [Error Handling](#9-error-handling) +10. [Testing](#10-testing) + +--- + +## 1. Architecture Overview + +Browser automation reuses the existing desktop infrastructure (Xvfb + Neko) in a minimal mode where Chromium is the only application. No window manager is needed. 
+ +``` +┌─────────────────────────────────────────────────┐ +│ Sandbox │ +│ │ +│ ┌────────┐ ┌──────────────┐ ┌───────────┐ │ +│ │ Neko │←──│ Chromium │──→│ CDP Server│ │ +│ │(WebRTC)│ │ (on Xvfb) │ │ (:9222) │ │ +│ └───┬────┘ └──────────────┘ └─────┬─────┘ │ +│ │ stream │ CDP │ +└──────┼─────────────────────────────────┼─────────┘ + │ │ + WebRTC (UDP) WebSocket / REST + │ │ +┌──────┴─────────────────────────────────┴─────────┐ +│ Sandbox Agent HTTP API │ +│ │ +│ /v1/browser/* (REST convenience wrappers) │ +│ /v1/browser/cdp (WebSocket proxy to :9222) │ +│ /v1/desktop/stream/signaling (Neko WebRTC) │ +│ /v1/desktop/screenshot (shared with desktop) │ +│ /v1/desktop/recording/* (shared with desktop) │ +└───────────────────────────────────────────────────┘ +``` + +### Key decisions + +- **Minimal mode**: Chromium runs directly on Xvfb. No Openbox window manager. Chromium starts in `--kiosk` or `--start-maximized` mode, filling the entire virtual display. +- **Neko reuse**: The existing `DesktopStreamingManager` streams the Xvfb framebuffer. No changes needed; Chromium renders to the same display Neko captures. +- **CDP proxy**: Sandbox Agent proxies WebSocket connections to Chromium's CDP server on localhost:9222. This lets external Playwright/Puppeteer clients connect through the Sandbox Agent URL without exposing the raw CDP port. +- **REST convenience endpoints**: Thin wrappers around CDP calls for common operations (navigate, screenshot, content extraction). These call Chromium's CDP internally via a persistent connection. +- **Shared desktop infrastructure**: Screenshots, recordings, streaming, and mouse/keyboard input all reuse existing desktop endpoints. The browser API adds browser-specific operations on top. + +--- + +## 2. 
CLI: `install browser` + +### Command + +```bash +sandbox-agent install browser [--yes] [--print-only] [--package-manager ] +``` + +### Implementation + +New file: `server/packages/sandbox-agent/src/browser_install.rs` + +Follow the exact pattern of `desktop_install.rs`: + +```rust +#[derive(Debug, Clone)] +pub struct BrowserInstallRequest { + pub yes: bool, + pub print_only: bool, + pub package_manager: Option, // reuse from desktop_install +} + +pub fn install_browser(request: BrowserInstallRequest) -> Result<(), String> { + // 1. Platform check (Linux only) + // 2. Detect or validate package manager (reuse detect_package_manager) + // 3. Build package list + // 4. Privilege check (root or sudo) + // 5. Display packages + confirm + // 6. Run install commands +} +``` + +### Packages by distro + +**APT (Debian/Ubuntu):** +``` +chromium +chromium-sandbox +libnss3 +libatk-bridge2.0-0 +libdrm2 +libxcomposite1 +libxdamage1 +libxrandr2 +libgbm1 +libasound2 +libpangocairo-1.0-0 +libgtk-3-0 +``` + +**DNF (Fedora/RHEL):** +``` +chromium +``` + +**APK (Alpine):** +``` +chromium +nss +``` + +### CLI registration + +In `cli.rs`, add to the `InstallCommand` enum: + +```rust +#[derive(Subcommand, Debug)] +pub enum InstallCommand { + /// Install desktop runtime dependencies. + Desktop(InstallDesktopArgs), + /// Install browser automation dependencies (Chromium). + Browser(InstallBrowserArgs), +} + +#[derive(Args, Debug)] +pub struct InstallBrowserArgs { + #[arg(long, default_value_t = false)] + yes: bool, + #[arg(long, default_value_t = false)] + print_only: bool, + #[arg(long, value_enum)] + package_manager: Option, +} +``` + +### Dependency detection + +Add `detect_missing_browser_dependencies()` to check for: +- `chromium` or `chromium-browser` binary in PATH +- Desktop dependencies (Xvfb, xdotool, etc.) 
since browser mode requires them + +Return helpful install suggestion: +``` +"sandbox-agent install browser --yes" +``` + +If desktop deps are also missing, suggest: +``` +"sandbox-agent install desktop --yes && sandbox-agent install browser --yes" +``` + +Or consider having `install browser` also install desktop deps automatically (since browser mode requires Xvfb). + +--- + +## 3. HTTP API: `/v1/browser/*` + +All endpoints return `application/json` unless otherwise noted. Error responses use `application/problem+json` (same as desktop API). + +### 3.1 Lifecycle + +#### `POST /v1/browser/start` + +Start the browser runtime: Xvfb + Chromium + Neko streaming. + +**Request body:** +```typescript +{ + // Display + width?: number, // default: 1280 + height?: number, // default: 720 + dpi?: number, // default: 96 + + // Browser + url?: string, // initial URL to navigate to (default: "about:blank") + headless?: boolean, // if true, skip Neko (no streaming). default: false + + // Streaming (same as desktop) + streamVideoCodec?: string, // "vp8" | "vp9" | "h264", default: "vp8" + streamAudioCodec?: string, // "opus" | "g722", default: "opus" + streamFrameRate?: number, // 1-60, default: 30 + webrtcPortRange?: string, // default: "59050-59070" + recordingFps?: number, // default: 30 +} +``` + +**Response (200):** +```typescript +{ + state: "active" | "starting" | "inactive" | "install_required" | "failed", + display?: string, // ":99" + resolution?: { width: number, height: number, dpi?: number }, + startedAt?: string, // ISO 8601 + cdpUrl?: string, // "ws://127.0.0.1:9222/devtools/browser/..." + url?: string, // current page URL + missingDependencies: string[], + installCommand?: string, + processes: Array<{ name: string, pid?: number, running: boolean }>, + lastError?: { code: string, message: string }, +} +``` + +**Internal sequence:** +1. Check for missing dependencies (Xvfb, chromium, neko) +2. Start Xvfb on chosen display (reuse `start_xvfb_locked`) +3. 
Wait for X11 socket +4. Start Chromium: + ```bash + chromium \ + --no-sandbox \ + --disable-gpu \ + --disable-dev-shm-usage \ + --remote-debugging-port=9222 \ + --remote-debugging-address=127.0.0.1 \ + --start-maximized \ + --window-size=WIDTH,HEIGHT \ + --window-position=0,0 \ + --no-first-run \ + --no-default-browser-check \ + --disable-infobars \ + --disable-background-networking \ + --disable-sync \ + --disable-translate \ + --disable-extensions \ + --user-data-dir=/tmp/chromium-profile \ + [URL] + ``` +5. Poll CDP endpoint `http://127.0.0.1:9222/json/version` until ready (15s timeout) +6. If not headless: start Neko streaming (reuse `DesktopStreamingManager.start()`) +7. Return status + +#### `POST /v1/browser/stop` + +Stop browser, Neko, and Xvfb. + +**Response (200):** +```typescript +{ state: "inactive" } +``` + +#### `GET /v1/browser/status` + +**Response (200):** Same shape as start response. + +### 3.2 CDP Access + +#### `GET /v1/browser/cdp` + +WebSocket upgrade. Proxies the connection to Chromium's CDP server at `ws://127.0.0.1:9222/devtools/browser/{id}`. + +This allows external Playwright/Puppeteer to connect: +```typescript +const browser = await chromium.connectOverCDP("ws://sandbox-host:2468/v1/browser/cdp"); +``` + +**Implementation:** Bidirectional WebSocket relay (same pattern as the Neko signaling proxy in `router.rs:2817-2921`). + +### 3.3 Navigation + +#### `POST /v1/browser/navigate` + +```typescript +// Request +{ url: string, waitUntil?: "load" | "domcontentloaded" | "networkidle" } + +// Response (200) +{ url: string, title: string, status: number } +``` + +**CDP calls:** `Page.navigate` + `Page.lifecycleEvent` wait. + +#### `POST /v1/browser/back` + +```typescript +// Response (200) +{ url: string, title: string } +``` + +**CDP call:** `Page.navigateHistory` with delta -1. 
+ +#### `POST /v1/browser/forward` + +```typescript +// Response (200) +{ url: string, title: string } +``` + +#### `POST /v1/browser/reload` + +```typescript +// Request (optional) +{ ignoreCache?: boolean } + +// Response (200) +{ url: string, title: string } +``` + +#### `POST /v1/browser/wait` + +```typescript +// Request +{ selector?: string, timeout?: number, state?: "visible" | "hidden" | "attached" } + +// Response (200) +{ found: boolean } +``` + +**CDP calls:** `Runtime.evaluate` with MutationObserver or `DOM.querySelector` polling. + +### 3.4 Tab Management + +#### `GET /v1/browser/tabs` + +```typescript +// Response (200) +{ + tabs: Array<{ + id: string, // CDP target ID + url: string, + title: string, + active: boolean, // true for the tab currently receiving input + }> +} +``` + +**CDP call:** `Target.getTargets` filtered to `type: "page"`. + +#### `POST /v1/browser/tabs` + +Create a new tab. + +```typescript +// Request +{ url?: string } + +// Response (201) +{ id: string, url: string, title: string } +``` + +**CDP call:** `Target.createTarget`. + +#### `POST /v1/browser/tabs/{id}/activate` + +Switch to this tab (bring to foreground, receive input from Neko stream). + +```typescript +// Response (200) +{ id: string, url: string, title: string } +``` + +**CDP call:** `Target.activateTarget`. + +#### `DELETE /v1/browser/tabs/{id}` + +Close a tab. + +```typescript +// Response (200) +{ ok: true } +``` + +**CDP call:** `Target.closeTarget`. + +### 3.5 Content Extraction + +#### `GET /v1/browser/screenshot` + +Screenshot of the current browser tab. + +```typescript +// Query params +format?: "png" | "jpeg" | "webp" // default: png +quality?: number // 0-100, for jpeg/webp +fullPage?: boolean // capture entire scrollable page +selector?: string // screenshot specific element + +// Response: image binary with appropriate Content-Type +``` + +**CDP call:** `Page.captureScreenshot`. 
This is browser-level (just the viewport/page), distinct from `GET /v1/desktop/screenshot` which captures the entire Xvfb display. + +#### `GET /v1/browser/pdf` + +Generate PDF of current page. + +```typescript +// Query params +format?: "a4" | "letter" | "legal" +landscape?: boolean +printBackground?: boolean +scale?: number + +// Response: application/pdf binary +``` + +**CDP call:** `Page.printToPDF`. + +#### `GET /v1/browser/content` + +Get page HTML. + +```typescript +// Query params +selector?: string // if provided, return innerHTML of matching element + +// Response (200) +{ html: string, url: string, title: string } +``` + +**CDP call:** `Runtime.evaluate` with `document.documentElement.outerHTML` or element query. + +#### `GET /v1/browser/markdown` + +Get page content as markdown. + +```typescript +// Response (200) +{ markdown: string, url: string, title: string } +``` + +**Implementation:** Extract DOM via CDP, convert to markdown using a Rust markdown conversion library (e.g., `html2md` crate). Strip nav/footer/aside elements before conversion for cleaner output. + +#### `POST /v1/browser/scrape` + +Extract elements matching CSS selectors. + +```typescript +// Request +{ + selectors: Record<string, string>, // { "title": "h1", "price": ".price" } + url?: string // optionally navigate first +} + +// Response (200) +{ + data: Record<string, string[]>, + // e.g. { "title": ["Product Name"], "price": ["$29.99"] } + url: string, + title: string +} +``` + +**CDP call:** `Runtime.evaluate` with `document.querySelectorAll` + `textContent` extraction. + +#### `GET /v1/browser/links` + +Extract all links from the page. + +```typescript +// Response (200) +{ + links: Array<{ href: string, text: string }>, + url: string +} +``` + +#### `POST /v1/browser/execute` + +Execute JavaScript in the page context. + +```typescript +// Request +{ expression: string, awaitPromise?: boolean } + +// Response (200) +{ result: any, type: string } +``` + +**CDP call:** `Runtime.evaluate`. 
+ +#### `GET /v1/browser/snapshot` + +Get the accessibility tree of the current page. + +```typescript +// Response (200) +{ + snapshot: string, // text representation of accessibility tree + url: string, + title: string +} +``` + +**CDP call:** `Accessibility.getFullAXTree` or `DOM.getDocument` + role extraction. + +### 3.6 Interaction + +These are browser-level click/type/scroll that use CDP (target DOM elements by selector). They complement the desktop-level `xdotool` input which operates on raw X11 coordinates. + +#### `POST /v1/browser/click` + +```typescript +// Request +{ + selector: string, // CSS selector + button?: "left" | "right" | "middle", + clickCount?: number, // 1 = click, 2 = double-click + timeout?: number, // ms to wait for selector +} + +// Response (200) +{ ok: true } +``` + +**CDP calls:** `DOM.querySelector` to find node, `DOM.getBoxModel` to get coordinates, `Input.dispatchMouseEvent`. + +#### `POST /v1/browser/type` + +```typescript +// Request +{ + selector: string, + text: string, + delay?: number, // ms between keystrokes + clear?: boolean, // clear field first +} + +// Response (200) +{ ok: true } +``` + +**CDP calls:** Focus element via `DOM.focus`, then `Input.dispatchKeyEvent` per character. + +#### `POST /v1/browser/select` + +```typescript +// Request +{ selector: string, value: string } + +// Response (200) +{ ok: true } +``` + +#### `POST /v1/browser/hover` + +```typescript +// Request +{ selector: string } + +// Response (200) +{ ok: true } +``` + +#### `POST /v1/browser/scroll` + +```typescript +// Request +{ + selector?: string, // element to scroll (default: viewport) + x?: number, // horizontal scroll delta + y?: number, // vertical scroll delta +} + +// Response (200) +{ ok: true } +``` + +#### `POST /v1/browser/upload` + +Upload a file to a file input element. 
+ +```typescript +// Request +{ + selector: string, // file input selector + path: string, // path to file inside the sandbox +} + +// Response (200) +{ ok: true } +``` + +**CDP call:** `DOM.setFileInputFiles`. + +#### `POST /v1/browser/dialog` + +Handle a JavaScript dialog (alert/confirm/prompt). + +```typescript +// Request +{ + accept: boolean, + text?: string, // for prompt dialogs +} + +// Response (200) +{ ok: true } +``` + +**CDP call:** `Page.handleJavaScriptDialog`. + +### 3.7 Monitoring + +#### `GET /v1/browser/console` + +Get console log messages. + +```typescript +// Query params +level?: "log" | "warn" | "error" | "info" | "debug" +limit?: number // max messages (default: 100) + +// Response (200) +{ + messages: Array<{ + level: string, + text: string, + url?: string, + line?: number, + timestamp: string, + }> +} +``` + +**CDP:** Subscribe to `Runtime.consoleAPICalled` events, buffer in memory. + +#### `GET /v1/browser/network` + +Get captured network requests. + +```typescript +// Query params +limit?: number +urlPattern?: string // regex filter + +// Response (200) +{ + requests: Array<{ + url: string, + method: string, + status: number, + mimeType: string, + responseSize: number, + duration: number, // ms + timestamp: string, + }> +} +``` + +**CDP:** Subscribe to `Network.requestWillBeSent` + `Network.responseReceived`, buffer in memory. + +### 3.8 Crawling + +#### `POST /v1/browser/crawl` + +Crawl pages starting from a URL. 
+ +```typescript +// Request +{ + url: string, + maxPages?: number, // default: 10, max: 100 + maxDepth?: number, // default: 2 + allowedDomains?: string[], // restrict to these domains + extract?: "markdown" | "html" | "text" | "links", // what to return per page +} + +// Response (200) +{ + pages: Array<{ + url: string, + title: string, + content: string, // in the requested extract format + links: string[], // outgoing links found + status: number, + depth: number, + }>, + totalPages: number, + truncated: boolean, // true if maxPages was hit +} +``` + +**Implementation:** BFS crawl using the CDP-controlled browser. For each page: +1. Navigate via `Page.navigate` +2. Wait for load +3. Extract content (reuse markdown/html/links extraction logic) +4. Collect links, filter by domain/depth +5. Continue until maxPages or maxDepth + +### 3.9 Browser Contexts (Persistent Profiles) + +#### `GET /v1/browser/contexts` + +List saved browser contexts (persistent cookie/storage profiles). + +```typescript +// Response (200) +{ + contexts: Array<{ + id: string, + name: string, + createdAt: string, + sizeBytes: number, + }> +} +``` + +**Storage:** Each context is a Chromium `--user-data-dir` directory stored under `$STATE_DIR/browser-contexts/{id}/`. + +#### `POST /v1/browser/contexts` + +Create a named context. + +```typescript +// Request +{ name: string } + +// Response (201) +{ id: string, name: string, createdAt: string } +``` + +#### `DELETE /v1/browser/contexts/{id}` + +Delete a context and its stored data. + +#### Using a context + +Pass `contextId` in the start request: + +```typescript +POST /v1/browser/start +{ contextId: "ctx_abc123" } +``` + +This sets `--user-data-dir` to the context's directory, preserving cookies, localStorage, IndexedDB, etc. across browser sessions. 
+ +### 3.10 Cookies + +#### `GET /v1/browser/cookies` + +```typescript +// Query params +url?: string // filter by URL + +// Response (200) +{ + cookies: Array<{ + name: string, + value: string, + domain: string, + path: string, + expires: number, + httpOnly: boolean, + secure: boolean, + sameSite: string, + }> +} +``` + +**CDP call:** `Network.getCookies`. + +#### `POST /v1/browser/cookies` + +Set cookies. + +```typescript +// Request +{ + cookies: Array<{ + name: string, + value: string, + domain?: string, + path?: string, + expires?: number, + httpOnly?: boolean, + secure?: boolean, + sameSite?: "Strict" | "Lax" | "None", + }> +} + +// Response (200) +{ ok: true } +``` + +**CDP call:** `Network.setCookies`. + +#### `DELETE /v1/browser/cookies` + +Clear all cookies (or by name/domain filter). + +```typescript +// Query params +name?: string +domain?: string +``` + +--- + +## 4. Rust Module Structure + +### New files + +``` +server/packages/sandbox-agent/src/ +├── browser_types.rs # Request/response DTOs (serde + utoipa + schemars) +├── browser_errors.rs # BrowserProblem error type (mirrors DesktopProblem) +├── browser_runtime.rs # BrowserRuntime state machine (~800 lines) +├── browser_cdp.rs # CDP client: persistent WS connection to Chromium +├── browser_install.rs # install browser CLI logic +├── browser_crawl.rs # Crawl implementation +└── browser_context.rs # Persistent profile (user-data-dir) management +``` + +### Modified files + +``` +server/packages/sandbox-agent/src/ +├── cli.rs # Add Browser variant to InstallCommand +├── router.rs # Add /v1/browser/* routes +├── lib.rs # Add module declarations +└── state.rs # Add BrowserRuntime to app state (or equivalent) +``` + +### BrowserRuntime + +```rust +pub struct BrowserRuntime { + config: BrowserRuntimeConfig, + process_runtime: Arc, + desktop_streaming_manager: Arc, // shared with DesktopRuntime + desktop_recording_manager: Arc, // shared + cdp_client: Arc>>, + inner: Arc>, +} + +#[derive(Debug)] +struct 
BrowserRuntimeState { + state: BrowserState, + xvfb_process_id: Option, + chromium_process_id: Option, + display: Option, + resolution: Option, + started_at: Option, + last_error: Option, + // Monitoring buffers + console_messages: VecDeque, // bounded ring buffer, max 1000 + network_requests: VecDeque, // bounded ring buffer, max 1000 +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BrowserState { + Inactive, + InstallRequired, + Starting, + Active, + Stopping, + Failed, +} +``` + +### CdpClient + +```rust +/// Persistent WebSocket connection to Chromium's CDP server. +pub struct CdpClient { + ws: WebSocketStream>, + next_id: AtomicU64, +} + +impl CdpClient { + /// Connect to Chromium CDP at ws://127.0.0.1:9222/devtools/browser/{id} + pub async fn connect() -> Result; + + /// Send a CDP command and wait for the response. + pub async fn send(&self, method: &str, params: serde_json::Value) -> Result; + + /// Subscribe to CDP events (e.g., Runtime.consoleAPICalled). + pub async fn subscribe(&self, event: &str, callback: impl Fn(serde_json::Value)); +} +``` + +### Router registration + +Add to the router builder (follow existing patterns in `router.rs`): + +```rust +// Browser lifecycle +.route("/v1/browser/start", post(browser_start)) +.route("/v1/browser/stop", post(browser_stop)) +.route("/v1/browser/status", get(browser_status)) + +// CDP proxy +.route("/v1/browser/cdp", get(browser_cdp_ws)) + +// Navigation +.route("/v1/browser/navigate", post(browser_navigate)) +.route("/v1/browser/back", post(browser_back)) +.route("/v1/browser/forward", post(browser_forward)) +.route("/v1/browser/reload", post(browser_reload)) +.route("/v1/browser/wait", post(browser_wait)) + +// Tabs +.route("/v1/browser/tabs", get(browser_tabs_list)) +.route("/v1/browser/tabs", post(browser_tabs_create)) +.route("/v1/browser/tabs/:tab_id/activate", post(browser_tab_activate)) +.route("/v1/browser/tabs/:tab_id", delete(browser_tab_close)) + +// Content extraction 
+.route("/v1/browser/screenshot", get(browser_screenshot)) +.route("/v1/browser/pdf", get(browser_pdf)) +.route("/v1/browser/content", get(browser_content)) +.route("/v1/browser/markdown", get(browser_markdown)) +.route("/v1/browser/scrape", post(browser_scrape)) +.route("/v1/browser/links", get(browser_links)) +.route("/v1/browser/execute", post(browser_execute)) +.route("/v1/browser/snapshot", get(browser_snapshot)) + +// Interaction +.route("/v1/browser/click", post(browser_click)) +.route("/v1/browser/type", post(browser_type)) +.route("/v1/browser/select", post(browser_select)) +.route("/v1/browser/hover", post(browser_hover)) +.route("/v1/browser/scroll", post(browser_scroll)) +.route("/v1/browser/upload", post(browser_upload)) +.route("/v1/browser/dialog", post(browser_dialog)) + +// Monitoring +.route("/v1/browser/console", get(browser_console)) +.route("/v1/browser/network", get(browser_network)) + +// Crawling +.route("/v1/browser/crawl", post(browser_crawl)) + +// Contexts +.route("/v1/browser/contexts", get(browser_contexts_list)) +.route("/v1/browser/contexts", post(browser_contexts_create)) +.route("/v1/browser/contexts/:context_id", delete(browser_contexts_delete)) + +// Cookies +.route("/v1/browser/cookies", get(browser_cookies_get)) +.route("/v1/browser/cookies", post(browser_cookies_set)) +.route("/v1/browser/cookies", delete(browser_cookies_delete)) +``` + +Total: **37 new endpoints** + +--- + +## 5. 
TypeScript SDK + +### New methods on `SandboxAgent` class + +```typescript +// Lifecycle +startBrowser(request?: BrowserStartRequest): Promise +stopBrowser(): Promise +getBrowserStatus(): Promise + +// CDP +getBrowserCdpUrl(): string // returns ws://host:port/v1/browser/cdp + +// Navigation +browserNavigate(request: BrowserNavigateRequest): Promise +browserBack(): Promise +browserForward(): Promise +browserReload(request?: BrowserReloadRequest): Promise +browserWait(request: BrowserWaitRequest): Promise<{ found: boolean }> + +// Tabs +getBrowserTabs(): Promise +createBrowserTab(request?: { url?: string }): Promise +activateBrowserTab(tabId: string): Promise +closeBrowserTab(tabId: string): Promise<{ ok: boolean }> + +// Content extraction +takeBrowserScreenshot(request?: BrowserScreenshotRequest): Promise +getBrowserPdf(request?: BrowserPdfRequest): Promise +getBrowserContent(request?: { selector?: string }): Promise +getBrowserMarkdown(): Promise +scrapeBrowser(request: BrowserScrapeRequest): Promise +getBrowserLinks(): Promise +executeBrowserScript(request: BrowserExecuteRequest): Promise +getBrowserSnapshot(): Promise + +// Interaction +browserClick(request: BrowserClickRequest): Promise<{ ok: boolean }> +browserType(request: BrowserTypeRequest): Promise<{ ok: boolean }> +browserSelect(request: BrowserSelectRequest): Promise<{ ok: boolean }> +browserHover(request: { selector: string }): Promise<{ ok: boolean }> +browserScroll(request: BrowserScrollRequest): Promise<{ ok: boolean }> +browserUpload(request: BrowserUploadRequest): Promise<{ ok: boolean }> +browserDialog(request: BrowserDialogRequest): Promise<{ ok: boolean }> + +// Monitoring +getBrowserConsole(request?: BrowserConsoleQuery): Promise +getBrowserNetwork(request?: BrowserNetworkQuery): Promise + +// Crawling +crawlBrowser(request: BrowserCrawlRequest): Promise + +// Contexts +getBrowserContexts(): Promise +createBrowserContext(request: { name: string }): Promise +deleteBrowserContext(contextId: 
string): Promise + +// Cookies +getBrowserCookies(request?: { url?: string }): Promise +setBrowserCookies(request: { cookies: BrowserCookie[] }): Promise<{ ok: boolean }> +deleteBrowserCookies(request?: { name?: string, domain?: string }): Promise +``` + +### Types (in `sdks/typescript/src/types/browser.ts`) + +```typescript +export interface BrowserStartRequest { + width?: number; + height?: number; + dpi?: number; + url?: string; + headless?: boolean; + contextId?: string; + streamVideoCodec?: string; + streamAudioCodec?: string; + streamFrameRate?: number; + webrtcPortRange?: string; + recordingFps?: number; +} + +export interface BrowserStatusResponse { + state: "active" | "starting" | "inactive" | "install_required" | "failed"; + display?: string; + resolution?: { width: number; height: number; dpi?: number }; + startedAt?: string; + cdpUrl?: string; + url?: string; + missingDependencies: string[]; + installCommand?: string; + processes: Array<{ name: string; pid?: number; running: boolean }>; + lastError?: { code: string; message: string }; +} + +export interface BrowserTabInfo { + id: string; + url: string; + title: string; + active: boolean; +} + +export interface BrowserPageInfo { + url: string; + title: string; + status?: number; +} + +export interface BrowserNavigateRequest { + url: string; + waitUntil?: "load" | "domcontentloaded" | "networkidle"; +} + +export interface BrowserScreenshotRequest { + format?: "png" | "jpeg" | "webp"; + quality?: number; + fullPage?: boolean; + selector?: string; +} + +export interface BrowserClickRequest { + selector: string; + button?: "left" | "right" | "middle"; + clickCount?: number; + timeout?: number; +} + +export interface BrowserTypeRequest { + selector: string; + text: string; + delay?: number; + clear?: boolean; +} + +export interface BrowserCrawlRequest { + url: string; + maxPages?: number; + maxDepth?: number; + allowedDomains?: string[]; + extract?: "markdown" | "html" | "text" | "links"; +} + +// ... 
(remaining types follow the same pattern from the HTTP API section) +``` + +--- + +## 6. Inspector UI: Browser Tab + +### New file: `frontend/packages/inspector/src/components/debug/BrowserTab.tsx` + +The Browser tab follows the same patterns as `DesktopTab.tsx` but with browser-specific sections. + +### Tab registration + +In `DebugPanel.tsx`: + +```typescript +import BrowserTab from "./BrowserTab"; + +type DebugTab = "log" | "events" | "agents" | "desktop" | "browser" | "mcp" | "skills" | "processes" | "run-process"; + +// In the tab bar, add after the desktop tab: + + +// In the tab content area: +{debugTab === "browser" && <BrowserTab getClient={getClient} />} +``` + +Use `Globe` icon from lucide-react (to differentiate from the `Monitor` icon on the Desktop tab). + +### BrowserTab sections + +The component should have these card sections, following the same card/card-header/card-title patterns as DesktopTab: + +#### Section 1: Runtime Control + +- State pill (active/inactive/install_required/failed) +- Status grid: URL, Resolution, Started +- Inputs: Width, Height, URL, Context dropdown +- Start/Stop buttons +- Auto-refresh status every 5s when active + +#### Section 2: Live View (Neko stream) + +When browser is active and streaming: +- Reuse the `<DesktopViewer>` component from `@sandbox-agent/react` +- Same WebRTC stream, same interaction model +- Show current URL above the viewer +- Navigation bar: Back, Forward, Reload buttons + URL input + +```tsx +
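+ <button onClick={() => getClient().browserBack()}>Back</button> + <button onClick={() => getClient().browserForward()}>Forward</button> + <button onClick={() => getClient().browserReload()}>Reload</button> + <input + value={currentUrl} + onKeyDown={(e) => e.key === "Enter" && handleNavigate(currentUrl)} + onChange={(e) => setCurrentUrl(e.target.value)} + /> +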
+ +``` + +#### Section 3: Screenshot (fallback when not streaming) + +Same as desktop screenshot section: +- Format selector (PNG/JPEG/WebP) +- Quality input +- Full page checkbox (browser-specific) +- Selector input (browser-specific, optional CSS selector) +- Screenshot button + preview + +#### Section 4: Tabs + +- List of open tabs with URL and title +- Active tab highlighted +- Per-tab actions: Activate, Close +- "New Tab" button with URL input + +``` +┌─────────────────────────────────────────────────┐ +│ Tabs │ +├─────────────────────────────────────────────────┤ +│ ● https://example.com - Example Domain [X] │ +│ https://google.com - Google [X] │ +│ │ +│ [+ New Tab] URL: [________________] [Open] │ +└─────────────────────────────────────────────────┘ +``` + +#### Section 5: Console + +- Level filter pills: All, Log, Warn, Error, Info +- Scrollable message list with level-colored indicators +- Auto-refresh every 3s when active +- Clear button + +``` +┌─────────────────────────────────────────────────┐ +│ Console [Clear] [↻] │ +├─────────────────────────────────────────────────┤ +│ [All] [Log] [Warn] [Error] [Info] │ +│ │ +│ LOG Hello world │ +│ WARN Deprecation warning: ... │ +│ ERR Uncaught TypeError: ... 
│ +│ at foo.js:42 │ +└─────────────────────────────────────────────────┘ +``` + +#### Section 6: Network + +- Request list showing method, URL (truncated), status, size, duration +- URL pattern filter input +- Auto-refresh every 3s +- Click to expand shows full URL and response details + +``` +┌─────────────────────────────────────────────────┐ +│ Network [Clear] [↻] │ +├─────────────────────────────────────────────────┤ +│ Filter: [_________________________] │ +│ │ +│ GET /api/data 200 1.2KB 45ms │ +│ POST /api/submit 201 0.3KB 120ms │ +│ GET /styles.css 200 15KB 12ms │ +└─────────────────────────────────────────────────┘ +``` + +#### Section 7: Content Tools + +Extraction tools in a compact form: +- "Get HTML" button +- "Get Markdown" button +- "Get Links" button +- "Get Snapshot" (accessibility tree) button +- Output textarea showing the result + +#### Section 8: Recording + +Reuse the same recording UI from DesktopTab (since it's the same Xvfb/ffmpeg infrastructure): +- Start/Stop recording +- FPS input +- Recording list with download/delete + +#### Section 9: Contexts + +- List of saved contexts with name, created date, size +- Create new context form (name input) +- Delete button per context +- "Use" button that sets contextId for next start + +#### Section 10: Diagnostics + +Same pattern as desktop: +- Last error details +- Process list (Xvfb, Chromium, Neko) with PIDs and running state +- Runtime log path + +### State management + +```typescript +const BrowserTab = ({ getClient }: { getClient: () => SandboxAgent }) => { + // Runtime + const [status, setStatus] = useState(null); + const [loading, setLoading] = useState(false); + const [acting, setActing] = useState<"start" | "stop" | null>(null); + const [error, setError] = useState(null); + + // Config inputs + const [width, setWidth] = useState("1280"); + const [height, setHeight] = useState("720"); + const [startUrl, setStartUrl] = useState(""); + const [selectedContext, setSelectedContext] = 
useState(null); + + // Live view + const [liveViewActive, setLiveViewActive] = useState(false); + const [currentUrl, setCurrentUrl] = useState(""); + + // Screenshot + const [screenshotUrl, setScreenshotUrl] = useState(null); + const [screenshotLoading, setScreenshotLoading] = useState(false); + const [screenshotFormat, setScreenshotFormat] = useState<"png" | "jpeg" | "webp">("png"); + const [screenshotFullPage, setScreenshotFullPage] = useState(false); + + // Tabs + const [tabs, setTabs] = useState([]); + const [newTabUrl, setNewTabUrl] = useState(""); + + // Console + const [consoleMessages, setConsoleMessages] = useState([]); + const [consoleFilter, setConsoleFilter] = useState(null); + + // Network + const [networkRequests, setNetworkRequests] = useState([]); + const [networkFilter, setNetworkFilter] = useState(""); + + // Content + const [contentResult, setContentResult] = useState(null); + const [contentType, setContentType] = useState<"html" | "markdown" | "links" | "snapshot">("html"); + + // Recording (reuse desktop recording state pattern) + const [recordings, setRecordings] = useState([]); + const [recordingFps, setRecordingFps] = useState("30"); + + // Contexts + const [contexts, setContexts] = useState([]); + const [newContextName, setNewContextName] = useState(""); + + // ... (callbacks, effects, render follow DesktopTab patterns) +}; +``` + +--- + +## 7. React Component: `BrowserViewer` + +### New file: `sdks/react/src/BrowserViewer.tsx` + +A thin wrapper around `DesktopViewer` that adds a navigation bar. This is the reusable component for embedding in any React app. 
+ +```typescript +export interface BrowserViewerProps { + client: BrowserViewerClient; + className?: string; + style?: CSSProperties; + height?: number | string; + showNavigationBar?: boolean; // default: true + showStatusBar?: boolean; // default: true + onNavigate?: (url: string) => void; + onConnect?: (status: DesktopStreamReadyStatus) => void; + onDisconnect?: () => void; + onError?: (error: Error) => void; +} + +export type BrowserViewerClient = Pick; +``` + +Export from `sdks/react/src/index.ts`: + +```typescript +export { BrowserViewer } from "./BrowserViewer"; +export type { BrowserViewerProps, BrowserViewerClient } from "./BrowserViewer"; +``` + +--- + +## 8. Desktop Integration + +### Shared infrastructure + +The browser runtime shares these components with the desktop runtime: + +| Component | Desktop | Browser | Shared? | +|-----------|---------|---------|---------| +| Xvfb | Yes | Yes | Same launch logic, different defaults (browser: 1280x720) | +| Openbox | Yes | No | Browser runs Chromium directly | +| Neko streaming | Yes | Yes | Same `DesktopStreamingManager` instance | +| Recording (ffmpeg) | Yes | Yes | Same `DesktopRecordingManager` instance | +| Screenshot (ImageMagick) | Yes | Yes | Desktop-level screenshots via same `import` command | +| Mouse/keyboard (xdotool) | Yes | Yes | Same desktop input endpoints work | +| Clipboard | Yes | Yes | Same X11 clipboard | + +### Mutual exclusivity + +Desktop mode and browser mode are mutually exclusive. Only one can be active at a time (they share the X11 display). The `BrowserRuntime` should check that `DesktopRuntime` is not active before starting, and vice versa. Return a `409 Conflict` if the other mode is active. + +Alternatively, browser mode could be a "mode" of the desktop runtime (add a `mode` field to `DesktopStartRequest`). This avoids two separate runtime managers and simplifies shared state. **Recommendation: implement as a mode of the existing desktop runtime** to maximize code reuse. 
+ +### Desktop runtime mode approach + +```rust +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DesktopMode { + Desktop, // Xvfb + Openbox (current behavior) + Browser, // Xvfb + Chromium (no window manager) +} +``` + +`POST /v1/browser/start` internally calls `desktop_runtime.start()` with `mode: Browser`. The browser-specific endpoints (CDP, navigate, etc.) are only available when the mode is `Browser`. + +--- + +## 9. Error Handling + +### BrowserProblem (mirrors DesktopProblem) + +```rust +pub enum BrowserProblem { + NotActive, // 409 - browser is not running + AlreadyActive, // 409 - browser is already running + DesktopConflict, // 409 - desktop mode is active, cannot start browser + InstallRequired, // 424 - missing dependencies + StartFailed(String), // 500 - startup sequence failed + CdpError(String), // 502 - CDP communication error + Timeout(String), // 504 - operation timed out + NotFound(String), // 404 - tab/context/element not found + InvalidSelector(String), // 400 - bad CSS selector +} +``` + +All errors return `application/problem+json`: + +```json +{ + "type": "tag:sandboxagent.dev,2025:browser/not-active", + "title": "Browser Not Active", + "status": 409, + "detail": "The browser is not running. Call POST /v1/browser/start first." +} +``` + +--- + +## 10. Testing + +### Integration tests + +New file: `server/packages/sandbox-agent/tests/browser_api.rs` + +Test categories: +1. **Lifecycle**: start, status, stop +2. **Navigation**: navigate, back, forward, reload +3. **Tabs**: create, list, activate, close +4. **Screenshots**: PNG/JPEG/WebP, full page, element +5. **Content**: HTML, markdown, links, snapshot +6. **Interaction**: click, type, scroll (against a test HTML page) +7. **Monitoring**: console messages, network requests +8. **Crawling**: multi-page crawl with depth/page limits +9. **Contexts**: create, use, delete +10. 
**CDP proxy**: Playwright connects through proxy + +### Test HTML pages + +Serve static test pages from within the test via a simple HTTP server inside the sandbox. This avoids network dependencies. + +### Docker test image + +Update `docker/test-agent/Dockerfile` to include Chromium (it's already in `docker/test-common-software/Dockerfile`). + +### Run command + +```bash +cargo test -p sandbox-agent --test browser_api +``` + diff --git a/scripts/ralph/.last-branch b/scripts/ralph/.last-branch new file mode 100644 index 0000000..5408a32 --- /dev/null +++ b/scripts/ralph/.last-branch @@ -0,0 +1 @@ +ralph/browser-automation diff --git a/scripts/ralph/CLAUDE.md b/scripts/ralph/CLAUDE.md new file mode 100644 index 0000000..f95bb92 --- /dev/null +++ b/scripts/ralph/CLAUDE.md @@ -0,0 +1,104 @@ +# Ralph Agent Instructions + +You are an autonomous coding agent working on a software project. + +## Your Task + +1. Read the PRD at `prd.json` (in the same directory as this file) +2. Read the progress log at `progress.txt` (check Codebase Patterns section first) +3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main. +4. Pick the **highest priority** user story where `passes: false` +5. Implement that single user story +6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires) +7. Update CLAUDE.md files if you discover reusable patterns (see below) +8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]` +9. Update the PRD to set `passes: true` for the completed story +10. 
Append your progress to `progress.txt` + +## Progress Report Format + +APPEND to progress.txt (never replace, always append): +``` +## [Date/Time] - [Story ID] +- What was implemented +- Files changed +- **Learnings for future iterations:** + - Patterns discovered (e.g., "this codebase uses X for Y") + - Gotchas encountered (e.g., "don't forget to update Z when changing W") + - Useful context (e.g., "the evaluation panel is in component X") +--- +``` + +The learnings section is critical - it helps future iterations avoid repeating mistakes and understand the codebase better. + +## Consolidate Patterns + +If you discover a **reusable pattern** that future iterations should know, add it to the `## Codebase Patterns` section at the TOP of progress.txt (create it if it doesn't exist). This section should consolidate the most important learnings: + +``` +## Codebase Patterns +- Example: Use `sql` template for aggregations +- Example: Always use `IF NOT EXISTS` for migrations +- Example: Export types from actions.ts for UI components +``` + +Only add patterns that are **general and reusable**, not story-specific details. + +## Update CLAUDE.md Files + +Before committing, check if any edited files have learnings worth preserving in nearby CLAUDE.md files: + +1. **Identify directories with edited files** - Look at which directories you modified +2. **Check for existing CLAUDE.md** - Look for CLAUDE.md in those directories or parent directories +3. 
**Add valuable learnings** - If you discovered something future developers/agents should know: + - API patterns or conventions specific to that module + - Gotchas or non-obvious requirements + - Dependencies between files + - Testing approaches for that area + - Configuration or environment requirements + +**Examples of good CLAUDE.md additions:** +- "When modifying X, also update Y to keep them in sync" +- "This module uses pattern Z for all API calls" +- "Tests require the dev server running on PORT 3000" +- "Field names must match the template exactly" + +**Do NOT add:** +- Story-specific implementation details +- Temporary debugging notes +- Information already in progress.txt + +Only update CLAUDE.md if you have **genuinely reusable knowledge** that would help future work in that directory. + +## Quality Requirements + +- ALL commits must pass your project's quality checks (typecheck, lint, test) +- Do NOT commit broken code +- Keep changes focused and minimal +- Follow existing code patterns + +## Browser Testing (If Available) + +For any story that changes UI, verify it works in the browser if you have browser testing tools configured (e.g., via MCP): + +1. Navigate to the relevant page +2. Verify the UI changes work as expected +3. Take a screenshot if helpful for the progress log + +If no browser tools are available, note in your progress report that manual browser verification is needed. + +## Stop Condition + +After completing a user story, check if ALL stories have `passes: true`. + +If ALL stories are complete and passing, reply with: +COMPLETE + +If there are still stories with `passes: false`, end your response normally (another iteration will pick up the next story). 
+ +## Important + +- Work on ONE story per iteration +- Commit frequently +- Keep CI green +- Read the Codebase Patterns section in progress.txt before starting diff --git a/scripts/ralph/prd.json b/scripts/ralph/prd.json index cca1564..86b58cd 100644 --- a/scripts/ralph/prd.json +++ b/scripts/ralph/prd.json @@ -278,8 +278,8 @@ "Typecheck passes" ], "priority": 17, - "passes": false, - "notes": "" + "passes": true, + "notes": "GET uses Network.getCookies with optional urls param. POST uses Network.setCookies with cookie array. DELETE uses Network.clearBrowserCookies (no filter) or Network.getCookies + Network.deleteCookies (with name/domain filter)." }, { "id": "US-018", @@ -295,8 +295,8 @@ "Typecheck passes" ], "priority": 18, - "passes": false, - "notes": "" + "passes": true, + "notes": "BFS crawl uses CDP Page.navigate + Runtime.evaluate for each page. Content extraction supports 4 modes (markdown/html/text/links). URL dedup via fragment-stripped normalization. Domain filtering via url crate. Added url.workspace = true dependency." }, { "id": "US-019", @@ -308,8 +308,8 @@ "Typecheck passes" ], "priority": 19, - "passes": false, - "notes": "" + "passes": true, + "notes": "Types added to existing types.ts (not a new types/browser.ts) following the SDK's established pattern of extracting type aliases from the generated OpenAPI types. Regenerated openapi.json and openapi.ts to include browser operations." }, { "id": "US-020", @@ -323,8 +323,8 @@ "Typecheck passes" ], "priority": 20, - "passes": false, - "notes": "" + "passes": true, + "notes": "Methods follow exact same patterns as desktop counterparts. getBrowserCdpUrl() uses toWebSocketUrl() + buildUrl() with access_token query param, same as buildDesktopStreamWebSocketUrl()." }, { "id": "US-021", @@ -343,8 +343,8 @@ "Typecheck passes" ], "priority": 21, - "passes": false, - "notes": "" + "passes": true, + "notes": "Methods follow same requestJson pattern as lifecycle methods. Type imports added alphabetically. 
closeBrowserTab uses DELETE method. createBrowserTab and browserReload have optional request params." }, { "id": "US-022", @@ -362,7 +362,7 @@ "Typecheck passes" ], "priority": 22, - "passes": false, + "passes": true, "notes": "" }, { @@ -380,8 +380,8 @@ "Typecheck passes" ], "priority": 23, - "passes": false, - "notes": "" + "passes": true, + "notes": "All 7 interaction methods follow the same requestJson POST pattern with BrowserActionResponse return type. Type imports added alphabetically." }, { "id": "US-024", @@ -400,7 +400,7 @@ "Typecheck passes" ], "priority": 24, - "passes": false, + "passes": true, "notes": "" }, { @@ -416,7 +416,7 @@ "Typecheck passes" ], "priority": 25, - "passes": false, + "passes": true, "notes": "" }, { @@ -433,8 +433,8 @@ "Verify in browser using dev-browser skill" ], "priority": 26, - "passes": false, - "notes": "Follow DesktopTab.tsx patterns for card layout and state management" + "passes": true, + "notes": "Follows DesktopTab.tsx patterns for card layout and state management. BrowserViewerClient used for live view with DesktopViewer component. Navigation bar with back/forward/reload + URL input. Context dropdown populated from getBrowserContexts(). Auto-refresh every 5s when active. BrowserStartRequest doesn't have 'streaming' field - removed it." }, { "id": "US-027", @@ -448,8 +448,8 @@ "Verify in browser using dev-browser skill" ], "priority": 27, - "passes": false, - "notes": "" + "passes": true, + "notes": "Screenshot uses createScreenshotUrl blob pattern from DesktopTab. Tabs reuse desktop-window-item/desktop-window-focused CSS classes. Console auto-refreshes every 3s with level filter pills. All three sections conditionally rendered only when isActive." }, { "id": "US-028", @@ -465,8 +465,8 @@ "Verify in browser using dev-browser skill" ], "priority": 28, - "passes": false, - "notes": "" + "passes": true, + "notes": "Network section auto-refreshes every 3s with URL pattern filter. 
Content Tools has Get HTML/Markdown/Links/Snapshot buttons with output textarea. Recording reuses desktop recording API (startDesktopRecording/stopDesktopRecording/listDesktopRecordings). Contexts section with list/create/delete/Use button sets contextId for next browser start. Diagnostics shows lastError and process list from BrowserStatusResponse." }, { "id": "US-029", @@ -486,8 +486,128 @@ "Tests pass" ], "priority": 29, + "passes": true, + "notes": "Run with: cargo test -p sandbox-agent --test browser_api. Fixed CdpClient to connect to page endpoint instead of browser endpoint for Page/Runtime/DOM commands. Chromium added to test Docker image." + }, + { + "id": "US-030", + "title": "Fix crawl page load: replace sleep with readyState polling", + "description": "As a user, I need the crawl endpoint to reliably wait for pages to load instead of using a fixed 500ms sleep.", + "acceptanceCriteria": [ + "In browser_crawl.rs, replace `tokio::time::sleep(Duration::from_millis(500))` after Page.navigate with a polling loop that checks `document.readyState === 'complete'` via Runtime.evaluate", + "Polling should check every 100ms with a configurable timeout (default 10s)", + "If timeout is reached, proceed with extraction anyway (don't fail the crawl)", + "Typecheck passes" + ], + "priority": 30, + "passes": true, + "notes": "Current code at browser_crawl.rs:61 uses a hard-coded 500ms sleep which is too short for slow pages and wastes time on fast pages." 
+ }, + { + "id": "US-031", + "title": "Fix crawl navigation status: use real HTTP status instead of faked 200", + "description": "As a user, I need crawl results to report the actual HTTP status code, not always 200.", + "acceptanceCriteria": [ + "In browser_crawl.rs, enable Network domain (Network.enable) before crawling", + "Capture the actual HTTP status from Network.responseReceived for each navigated page", + "Replace the faked `nav_result.get(\"frameId\").map(|_| 200u16)` with the real status", + "If Page.navigate returns an errorText field, record the page with status None and skip link extraction", + "Typecheck passes" + ], + "priority": 31, + "passes": true, + "notes": "Enabled Network domain, subscribed to Network.responseReceived events, drain buffered events after readyState complete to get real HTTP status. Handles errorText from Page.navigate by recording None status and skipping extraction. Takes last Document response to handle redirect chains." + }, + { + "id": "US-032", + "title": "Remove dead cdp_client() method from BrowserRuntime", + "description": "As a developer, I want to remove dead code that always returns an error.", + "acceptanceCriteria": [ + "Remove the `pub async fn cdp_client()` method from BrowserRuntime in browser_runtime.rs that always returns Err('Use with_cdp() to execute CDP commands')", + "Verify no callers reference cdp_client() (only get_cdp() and with_cdp() should be used)", + "If any callers exist, migrate them to get_cdp()", + "Typecheck passes" + ], + "priority": 32, + "passes": true, + "notes": "browser_runtime.rs:553-564 always returns an error telling callers to use with_cdp(). This is dead code that confuses the API surface." 
+ }, + { + "id": "US-033", + "title": "Fix default display dimensions to match spec (1280x720)", + "description": "As a developer, I need the default browser dimensions to match the spec.", + "acceptanceCriteria": [ + "Change DEFAULT_WIDTH from 1440 to 1280 in browser_runtime.rs", + "Change DEFAULT_HEIGHT from 900 to 720 in browser_runtime.rs", + "Typecheck passes" + ], + "priority": 33, "passes": false, - "notes": "Run with: cargo test -p sandbox-agent --test browser_api" + "notes": "Spec section 3.1 says defaults are 1280x720 but browser_runtime.rs uses 1440x900." + }, + { + "id": "US-034", + "title": "Add reverse mutual exclusivity check in DesktopRuntime", + "description": "As a developer, I need DesktopRuntime to reject start when BrowserRuntime is active.", + "acceptanceCriteria": [ + "In DesktopRuntime.start(), check if BrowserRuntime is active before proceeding", + "If BrowserRuntime state is Active, return a 409 Conflict error with message explaining browser and desktop modes are mutually exclusive", + "This mirrors the existing check in BrowserRuntime.start() that checks DesktopRuntime", + "BrowserRuntime may need to be added to DesktopRuntime's constructor or accessed via shared app state", + "Typecheck passes" + ], + "priority": 34, + "passes": false, + "notes": "Currently BrowserRuntime checks DesktopRuntime before starting, but DesktopRuntime does not check BrowserRuntime. This is a one-sided guard." + }, + { + "id": "US-035", + "title": "Fix BrowserProblem misuse: use correct error variants for non-startup failures", + "description": "As a developer, I need error variants to be used correctly so error codes are meaningful.", + "acceptanceCriteria": [ + "In browser_context.rs, change delete_context's fs::remove_dir_all error from BrowserProblem::start_failed to a more appropriate variant (e.g. 
cdp_error or add a new internal_error variant)", + "In browser_context.rs, change list_contexts's fs::read_dir error from BrowserProblem::start_failed similarly", + "In browser_context.rs, change create_context's fs errors from BrowserProblem::start_failed similarly", + "Fix the no-op comment at browser_runtime.rs console event handler: 'CDP uses warning as type but we normalize to warning' (same value, comment is misleading - either remove the comment or actually normalize 'warning' to 'warn')", + "Typecheck passes" + ], + "priority": 35, + "passes": false, + "notes": "BrowserProblem::start_failed (500 status, browser/start-failed code) is used as a catch-all for filesystem errors in browser_context.rs which makes error codes meaningless for API consumers." + }, + { + "id": "US-036", + "title": "Add integration tests for console and network monitoring", + "description": "As a developer, I need tests that verify console and network monitoring actually capture events.", + "acceptanceCriteria": [ + "Add test v1_browser_console_monitoring to browser_api.rs", + "Test navigates to a page that calls console.log('test-message') and console.error('test-error')", + "Test calls GET /v1/browser/console and verifies the messages array contains entries with matching text and correct levels", + "Test calls GET /v1/browser/console?level=error and verifies only error-level messages are returned", + "Add test v1_browser_network_monitoring to browser_api.rs", + "Test navigates to a page, then calls GET /v1/browser/network and verifies at least one request entry exists with a url, method, and status", + "Tests pass" + ], + "priority": 36, + "passes": false, + "notes": "Console and network monitoring have real complexity (background tasks, ring buffers, event correlation) but zero test coverage currently." 
+ }, + { + "id": "US-037", + "title": "Add integration tests for crawling", + "description": "As a developer, I need tests that verify the crawl endpoint works with multiple pages.", + "acceptanceCriteria": [ + "Add test v1_browser_crawl to browser_api.rs", + "Write 3 test HTML pages: page-a.html links to page-b.html, page-b.html links to page-c.html, page-c.html has no links", + "Test POST /v1/browser/crawl with url=file:///tmp/page-a.html, maxDepth=2, extract=text", + "Verify response has 3 pages with correct depths (0, 1, 2)", + "Verify totalPages is 3 and truncated is false", + "Test maxPages=1 returns only 1 page and truncated is true", + "Tests pass" + ], + "priority": 37, + "passes": false, + "notes": "Crawl has real logic (BFS, domain filtering, depth limits, URL normalization) but no test coverage." } ] } diff --git a/scripts/ralph/progress.txt b/scripts/ralph/progress.txt index dd0f1c1..85220f7 100644 --- a/scripts/ralph/progress.txt +++ b/scripts/ralph/progress.txt @@ -22,6 +22,10 @@ - `BrowserRuntime::ensure_active()` is a reusable guard for any handler requiring active browser state - `BrowserRuntime::get_cdp()` returns `Arc` without holding state lock; preferred over `with_cdp()` closure for handlers that do multiple async CDP calls (avoids lifetime issues) - `CdpClient::close()` takes `&self` (not `self`); CdpClient is stored as `Option>` in BrowserRuntimeStateData +- CdpClient MUST connect to a page endpoint (`/json/list` → first page's `webSocketDebuggerUrl`), NOT the browser endpoint from `/json/version`. Page/Runtime/DOM commands only work on page-level connections. 
+- Integration tests use Docker containers via `TestApp::new(AuthConfig::disabled())` from `support/docker.rs`; `#[serial]` for sequential execution +- Test helper `write_test_file()` uses `PUT /v1/fs/file?path=...` to write HTML test fixtures into the container +- `docker/test-agent/Dockerfile` must include chromium + deps (libnss3, libatk-bridge2.0-0, libdrm2, libxcomposite1, libxdamage1, libxrandr2, libgbm1, libasound2, libpangocairo-1.0-0, libgtk-3-0) for browser integration tests - `get_page_info_via_cdp()` is a helper fn in router.rs for getting current URL and title via Runtime.evaluate - CDP `Page.getNavigationHistory` returns `{currentIndex, entries: [{id, url, title}]}` for back/forward navigation - CDP `Page.navigateToHistoryEntry` takes `{entryId}` (the id from history entries, not the index) @@ -43,6 +47,21 @@ - For internal-only fields in API types, use `#[serde(default, skip_serializing)]` to keep them out of JSON responses - Browser context management is pure filesystem CRUD; each context is a directory under `{state_dir}/browser-contexts/{id}/` with a `context.json` metadata file - Use hex-encoded /dev/urandom bytes for generating IDs (same pattern as telemetry.rs) to avoid adding new crate deps +- CDP `Network.getCookies`/`setCookies`/`deleteCookies`/`clearBrowserCookies` for cookie CRUD; sameSite values are capitalized strings ("Strict", "Lax", "None") +- For complex multi-page logic (e.g., crawl), put business logic in a separate module file and call it from the router handler; keeps router.rs manageable +- `url` crate available as workspace dependency for URL parsing/domain extraction +- TypeScript SDK types pipeline: (1) `cargo run -p sandbox-agent-openapi-gen -- --out docs/openapi.json`, (2) `npx openapi-typescript docs/openapi.json -o src/generated/openapi.ts && node ./scripts/patch-openapi-types.mjs`, (3) add type aliases in `types.ts` using `JsonResponse`/`JsonRequestBody`/`QueryParams` utilities, (4) export from `index.ts` +- TypeScript 
SDK types are extracted from generated OpenAPI types, NOT manually written interfaces; operation IDs follow `{method}_v1_{domain}_{action}` pattern +- For QueryParams types that might resolve to `never`, use defensive pattern: `QueryParams extends never ? Record : QueryParams` +- SDK method patterns: `requestJson("GET"|"POST", path)` for JSON, `requestRaw("GET", path, {query, accept})` for binary, `toWebSocketUrl(buildUrl(path, {access_token}))` for WS URLs; type imports go alphabetically in the `import { ... } from "./types.ts"` block +- SDK binary response pattern (screenshot/pdf): `requestRaw("GET", path, {query, accept: "image/*"})` → `response.arrayBuffer()` → `new Uint8Array(buffer)` +- SDK content extraction methods: `requestJson("GET", path, {query})` for JSON endpoints, `requestRaw("GET", path, {query, accept})` for binary; query param types use the defensive `extends never` pattern +- React SDK components use inline CSSProperties styles (no CSS modules or Tailwind), with base shell/status/viewport styles as const objects +- React SDK `BrowserViewerClient`/`DesktopViewerClient` use `Pick` for loose coupling; when adding new components that depend on SDK methods, the TypeScript SDK dist must be rebuilt first (`npx tsup` in sdks/typescript/) before React SDK typecheck passes +- React SDK barrel exports are alphabetically ordered; component exports first, then type exports grouped by source file +- Inspector debug tab pattern: (1) add to `DebugTab` union in DebugPanel.tsx, (2) import component, (3) add icon button in tabs section, (4) add conditional render `{debugTab === "x" && }` in content section +- Inspector tab components reuse `desktop-panel`, `desktop-state-grid`, `desktop-start-controls`, `desktop-input-group`, `card`, `card-header`, `card-meta`, `card-actions` CSS classes +- `Parameters[0]` derives request types from SDK method signatures in inspector components # Ralph Progress Log Started: Tue Mar 17 04:32:06 AM PDT 2026 @@ -308,3 +327,283 @@ 
Started: Tue Mar 17 04:32:06 AM PDT 2026 - Context types (BrowserContextInfo, BrowserContextListResponse, BrowserContextCreateRequest) were already defined in browser_types.rs from US-003 - `tempfile` crate is a workspace dev-dependency available via `test-utils` feature flag --- + +## 2026-03-17 - US-017 +- Implemented 3 browser cookie management HTTP endpoints: GET /v1/browser/cookies, POST /v1/browser/cookies, DELETE /v1/browser/cookies +- GET /v1/browser/cookies: accepts optional `url` query param; uses CDP `Network.getCookies` with optional `urls` array; maps CDP cookie fields (httpOnly, sameSite) to BrowserCookie struct +- POST /v1/browser/cookies: accepts `{cookies: [...]}` body; maps BrowserCookie fields to CDP format; uses CDP `Network.setCookies` +- DELETE /v1/browser/cookies: accepts optional `name`, `domain` query params; if no filters, uses `Network.clearBrowserCookies`; if filtered, fetches all cookies via `Network.getCookies`, matches by name/domain, deletes each via `Network.deleteCookies` +- Routes registered with combined `get().post().delete()` on single `/browser/cookies` path +- OpenAPI paths and schemas registered for all 3 handlers and all cookie types (BrowserCookie, BrowserCookieSameSite, BrowserCookiesQuery, BrowserCookiesResponse, BrowserSetCookiesRequest, BrowserDeleteCookiesQuery) +- Files changed: router.rs +- **Learnings for future iterations:** + - CDP `Network.getCookies` takes `{urls?: [string]}` and returns `{cookies: [{name, value, domain, path, expires, httpOnly, secure, sameSite, ...}]}` + - CDP `Network.setCookies` takes `{cookies: [{name, value, domain?, path?, expires?, httpOnly?, secure?, sameSite?}]}` + - CDP `Network.deleteCookies` takes `{name, domain?, path?}` to delete specific cookies + - CDP `Network.clearBrowserCookies` takes no params and clears all cookies + - CDP cookie `sameSite` values are "Strict", "Lax", "None" (capitalized strings) + - CDP cookie `expires` is 0 for session cookies; filter with `> 0.0` before 
returning + - For delete with filters, must first fetch all cookies then match and delete individually (CDP has no bulk-filter-delete) + - Axum route `.get().post().delete()` chaining works for registering multiple HTTP methods on same path +--- + +## 2026-03-17 - US-018 +- Created `browser_crawl.rs` with BFS crawl implementation using CDP +- POST /v1/browser/crawl: accepts `{url, maxPages?, maxDepth?, allowedDomains?, extract?}` +- Returns `{pages: [{url, title, content, links, status, depth}], totalPages, truncated}` +- 4 content extraction modes: markdown (strips nav/footer/aside, uses html2md), html (outerHTML), text (innerText), links (empty content, links in links field) +- BFS queue with visited set for URL deduplication (fragment-stripped normalization) +- Domain filtering via `url` crate; defaults to same-domain-only if no allowedDomains specified +- maxPages default 10, capped at 100; maxDepth default 2 +- Added `url.workspace = true` dependency to sandbox-agent Cargo.toml +- Route registered at `/browser/crawl` in v1_router, OpenAPI paths and schemas registered +- Files changed: browser_crawl.rs (new), Cargo.toml, lib.rs, router.rs +- **Learnings for future iterations:** + - `url` crate (v2.5) is a workspace dependency, just add `url.workspace = true` to package Cargo.toml + - `Url::parse()` + `host_str()` is the clean way to extract domains from URLs for filtering + - Crawl logic is kept in a separate module (browser_crawl.rs) rather than inline in router.rs since it has substantial business logic + - The crawl reuses the same CDP patterns: Page.navigate for navigation, Runtime.evaluate for content extraction, JSON.stringify for link collection + - Fragment-stripped URL normalization (`Url::set_fragment(None)`) prevents crawling the same page with different anchors + - `truncated` field signals whether there were more pages in the queue when max_pages was reached +--- + +## 2026-03-17 - US-019 +- Added 55 browser type aliases to 
`sdks/typescript/src/types.ts` following existing desktop type pattern +- Regenerated `docs/openapi.json` from Rust server (now includes all browser endpoints) +- Regenerated `sdks/typescript/src/generated/openapi.ts` via `openapi-typescript` +- Exported all browser types from `sdks/typescript/src/index.ts` barrel file +- Types cover: lifecycle (BrowserState, BrowserStartRequest, BrowserStatusResponse), navigation (BrowserNavigateRequest, BrowserPageInfo, BrowserWaitRequest/Response), tabs (BrowserTabInfo, BrowserTabListResponse, BrowserCreateTabRequest), screenshots/PDF (BrowserScreenshotQuery/Format, BrowserPdfQuery/Format), content extraction (BrowserContentQuery/Response, BrowserMarkdownResponse, BrowserLinksResponse, BrowserSnapshotResponse), scrape/execute (BrowserScrapeRequest/Response, BrowserExecuteRequest/Response), interaction (BrowserClickRequest, BrowserTypeRequest, BrowserSelectRequest, BrowserHoverRequest, BrowserScrollRequest, BrowserUploadRequest, BrowserDialogRequest, BrowserActionResponse), monitoring (BrowserConsoleQuery/Message/Response, BrowserNetworkQuery/Request/Response), crawl (BrowserCrawlRequest/Page/Response/Extract), contexts (BrowserContextInfo/ListResponse/CreateRequest), cookies (BrowserCookie/SameSite, BrowserCookiesQuery/Response, BrowserSetCookiesRequest, BrowserDeleteCookiesQuery) +- Files changed: types.ts, index.ts, generated/openapi.ts, docs/openapi.json +- **Learnings for future iterations:** + - TypeScript SDK types are NOT manually written interfaces; they're type aliases extracted from generated OpenAPI types using `JsonResponse`, `JsonRequestBody`, `QueryParams` generic utilities + - Must regenerate OpenAPI pipeline first: `cargo run -p sandbox-agent-openapi-gen -- --out docs/openapi.json` then `npx openapi-typescript ... -o src/generated/openapi.ts && node ./scripts/patch-openapi-types.mjs` + - For query param types that might resolve to `never`, use the `extends never ? 
Record : ...` pattern (see DesktopScreenshotQuery) + - biome pre-commit hook auto-formats; files may be reformatted on commit + - Operation IDs follow pattern: `{method}_v1_browser_{action}` (e.g., `post_v1_browser_start`, `get_v1_browser_status`) + - Component schemas use the exact Rust struct name (e.g., `BrowserStartRequest`, `BrowserState`) +--- + +## 2026-03-17 - US-020 +- Added 4 browser lifecycle/CDP methods to SandboxAgent class in sdks/typescript/src/client.ts: + - `startBrowser(request?)` → POST /v1/browser/start + - `stopBrowser()` → POST /v1/browser/stop + - `getBrowserStatus()` → GET /v1/browser/status + - `getBrowserCdpUrl(options?)` → builds ws:// URL for /v1/browser/cdp with access_token +- Imported `BrowserStartRequest` and `BrowserStatusResponse` types from types.ts +- Methods placed after desktop stream methods, before private getLiveConnection +- Files changed: client.ts +- **Learnings for future iterations:** + - SDK methods follow 1:1 pattern with desktop counterparts: `requestJson("METHOD", path, {body/query})` for JSON, `toWebSocketUrl(buildUrl(...))` for WS URLs + - Type imports are added alphabetically in the main `import { ... 
} from "./types.ts"` block + - `getBrowserCdpUrl()` is sync (not async) since it just constructs a URL, same as `buildDesktopStreamWebSocketUrl()` + - Reuses `ProcessTerminalWebSocketUrlOptions` type for the options param (contains `accessToken?: string`) + - biome pre-commit formats automatically; no manual formatting needed +--- + +## 2026-03-17 - US-021 +- Added 9 browser navigation and tab methods to SandboxAgent class in sdks/typescript/src/client.ts: + - `browserNavigate(request)` → POST /v1/browser/navigate → BrowserPageInfo + - `browserBack()` → POST /v1/browser/back → BrowserPageInfo + - `browserForward()` → POST /v1/browser/forward → BrowserPageInfo + - `browserReload(request?)` → POST /v1/browser/reload → BrowserPageInfo + - `browserWait(request)` → POST /v1/browser/wait → BrowserWaitResponse + - `getBrowserTabs()` → GET /v1/browser/tabs → BrowserTabListResponse + - `createBrowserTab(request?)` → POST /v1/browser/tabs → BrowserTabInfo + - `activateBrowserTab(tabId)` → POST /v1/browser/tabs/:id/activate → BrowserTabInfo + - `closeBrowserTab(tabId)` → DELETE /v1/browser/tabs/:id → BrowserActionResponse +- Added 9 type imports alphabetically: BrowserActionResponse, BrowserCreateTabRequest, BrowserNavigateRequest, BrowserPageInfo, BrowserReloadRequest, BrowserTabInfo, BrowserTabListResponse, BrowserWaitRequest, BrowserWaitResponse +- Files changed: client.ts +- **Learnings for future iterations:** + - Navigation methods (back/forward/reload) have no required request body, but reload accepts optional BrowserReloadRequest + - Tab methods use path params for tab IDs: `/browser/tabs/${tabId}/activate` and `/browser/tabs/${tabId}` + - createBrowserTab request body is optional (defaults to empty tab) + - closeBrowserTab returns BrowserActionResponse ({ok: true}), not BrowserTabInfo + - DELETE HTTP method works with requestJson same as GET/POST +--- + +## 2026-03-17 - US-022 +- Added 8 browser content extraction methods to SandboxAgent class in 
sdks/typescript/src/client.ts: + - `takeBrowserScreenshot(query?)` → GET /v1/browser/screenshot → Uint8Array (binary, requestRaw) + - `getBrowserPdf(query?)` → GET /v1/browser/pdf → Uint8Array (binary, requestRaw with accept: "application/pdf") + - `getBrowserContent(query?)` → GET /v1/browser/content → BrowserContentResponse + - `getBrowserMarkdown()` → GET /v1/browser/markdown → BrowserMarkdownResponse + - `scrapeBrowser(request)` → POST /v1/browser/scrape → BrowserScrapeResponse + - `getBrowserLinks()` → GET /v1/browser/links → BrowserLinksResponse + - `executeBrowserScript(request)` → POST /v1/browser/execute → BrowserExecuteResponse + - `getBrowserSnapshot()` → GET /v1/browser/snapshot → BrowserSnapshotResponse +- Added 11 type imports alphabetically: BrowserContentQuery, BrowserContentResponse, BrowserExecuteRequest, BrowserExecuteResponse, BrowserLinksResponse, BrowserMarkdownResponse, BrowserPdfQuery, BrowserScreenshotQuery, BrowserScrapeRequest, BrowserScrapeResponse, BrowserSnapshotResponse +- Files changed: client.ts +- **Learnings for future iterations:** + - Screenshot uses `requestRaw` with `accept: "image/*"`, PDF uses `requestRaw` with `accept: "application/pdf"` - both return `Uint8Array` via `response.arrayBuffer()` + - Content extraction GET endpoints with optional query params use `requestJson("GET", path, { query })` pattern + - Scrape and execute are POST endpoints with required request bodies + - getBrowserMarkdown, getBrowserLinks, getBrowserSnapshot have no parameters (simple GET endpoints) + - Parameter name is `query` (not `request`) for GET endpoints with query params, matching desktop screenshot pattern +--- + +## 2026-03-17 - US-023 +- Added 7 browser interaction methods to SandboxAgent class in sdks/typescript/src/client.ts: + - `browserClick(request)` → POST /v1/browser/click → BrowserActionResponse + - `browserType(request)` → POST /v1/browser/type → BrowserActionResponse + - `browserSelect(request)` → POST /v1/browser/select →
BrowserActionResponse + - `browserHover(request)` → POST /v1/browser/hover → BrowserActionResponse + - `browserScroll(request)` → POST /v1/browser/scroll → BrowserActionResponse + - `browserUpload(request)` → POST /v1/browser/upload → BrowserActionResponse + - `browserDialog(request)` → POST /v1/browser/dialog → BrowserActionResponse +- Added 7 type imports alphabetically: BrowserClickRequest, BrowserDialogRequest, BrowserHoverRequest, BrowserScrollRequest, BrowserSelectRequest, BrowserTypeRequest, BrowserUploadRequest +- Files changed: client.ts +- **Learnings for future iterations:** + - All browser interaction methods follow the exact same pattern: `requestJson("POST", path, { body: request })` returning `BrowserActionResponse` + - BrowserActionResponse is shared across all interaction endpoints (already imported from US-021) + - Methods placed after content extraction methods and before private getLiveConnection +--- + +## 2026-03-17 - US-024 +- Added 9 browser monitoring, crawl, context, and cookie methods to SandboxAgent class in sdks/typescript/src/client.ts: + - `getBrowserConsole(query?)` → GET /v1/browser/console → BrowserConsoleResponse + - `getBrowserNetwork(query?)` → GET /v1/browser/network → BrowserNetworkResponse + - `crawlBrowser(request)` → POST /v1/browser/crawl → BrowserCrawlResponse + - `getBrowserContexts()` → GET /v1/browser/contexts → BrowserContextListResponse + - `createBrowserContext(request)` → POST /v1/browser/contexts → BrowserContextInfo + - `deleteBrowserContext(contextId)` → DELETE /v1/browser/contexts/:id → BrowserActionResponse + - `getBrowserCookies(query?)` → GET /v1/browser/cookies → BrowserCookiesResponse + - `setBrowserCookies(request)` → POST /v1/browser/cookies → BrowserActionResponse + - `deleteBrowserCookies(query?)` → DELETE /v1/browser/cookies → BrowserActionResponse +- Added 13 type imports alphabetically: BrowserConsoleQuery, BrowserConsoleResponse, BrowserContextCreateRequest, BrowserContextInfo,
BrowserContextListResponse, BrowserCookiesQuery, BrowserCookiesResponse, BrowserCrawlRequest, BrowserCrawlResponse, BrowserDeleteCookiesQuery, BrowserNetworkQuery, BrowserNetworkResponse, BrowserSetCookiesRequest +- Files changed: client.ts +- **Learnings for future iterations:** + - Monitoring endpoints (console/network) use GET with optional query params, same pattern as content extraction + - Context CRUD: GET for list, POST for create (returns BrowserContextInfo, not BrowserContextListResponse), DELETE with path param for delete + - Cookie methods mirror the Rust HTTP API exactly: GET/POST/DELETE on same /cookies path + - deleteBrowserCookies uses query params (not body) for filter criteria, matching the Rust DELETE handler + - createBrowserContext returns BrowserContextInfo (single context), not BrowserContextListResponse +--- + +## 2026-03-17 - US-025 +- Created `sdks/react/src/BrowserViewer.tsx` with BrowserViewer component that wraps DesktopViewer with a browser navigation bar +- BrowserViewerClient type uses `Pick` +- BrowserViewerProps: client, className, style, height (default 480), showNavigationBar (default true), showStatusBar (default true), onNavigate, onConnect, onDisconnect, onError +- Navigation bar has back/forward/reload buttons and URL input with Enter-to-navigate +- URL auto-prefixes https:// if no protocol specified +- Syncs URL display from getBrowserStatus() on stream connect +- Passes DesktopViewer props with shell styling overridden (no double border/shadow) +- Exported BrowserViewer + BrowserViewerClient + BrowserViewerProps from index.ts +- Files changed: BrowserViewer.tsx (new), index.ts +- **Learnings for future iterations:** + - React SDK references `sandbox-agent` via workspace symlink but uses compiled dist types; must rebuild TypeScript SDK (`npx tsup` in sdks/typescript/) after adding new methods before React typecheck works + - biome pre-commit reformats: `Pick<>` union types get collapsed to single line, style objects stay 
as-is + - DesktopViewer accepts style prop which can override its shell styling (border, borderRadius, background, boxShadow) - useful for embedding inside a wrapper component + - BrowserViewer composes DesktopViewer rather than duplicating WebRTC logic; the stream is the same (Neko on Xvfb display) +--- + +## 2026-03-17 - US-026 +- Created `BrowserTab.tsx` in `frontend/packages/inspector/src/components/debug/` with two sections: + - Section 1 - Runtime Control: state pill (active/inactive/install_required/failed), status grid (URL, Resolution, Started), config inputs (Width, Height, URL, Context dropdown), Start/Stop buttons, auto-refresh every 5s when active + - Section 2 - Live View: navigation bar (Back, Forward, Reload + URL input), DesktopViewer component for WebRTC stream, current URL display +- Updated `DebugPanel.tsx`: added `"browser"` to DebugTab type, imported BrowserTab, added Globe icon tab button after Desktop, added render condition +- Typecheck passes +- Files changed: BrowserTab.tsx (new), DebugPanel.tsx +- **Learnings for future iterations:** + - Inspector tab pattern: add to DebugTab union type, import component, add button with icon in tabs section, add conditional render in content section + - `BrowserStartRequest` does NOT have a `streaming` field (unlike what might be expected); just omit it + - `BrowserViewerClient` from `@sandbox-agent/react` uses `Pick` and requires `connectDesktopStream`, `browserNavigate`, `browserBack`, `browserForward`, `browserReload`, `getBrowserStatus` + - Reuse `desktop-panel`, `desktop-state-grid`, `desktop-start-controls`, `desktop-input-group` CSS classes from DesktopTab for consistent layout + - biome pre-commit hook reformats: ternary chains get collapsed, style objects adjusted + - `Parameters[0]` is the pattern for deriving request types from SDK method signatures + - Browser contexts are loaded via `getBrowserContexts()` and shown in a dropdown; the contextId is passed to `startBrowser()` + - Manual 
browser verification needed (no browser testing tools available in this environment) +--- + +## 2026-03-17 - US-027 +- Implemented Screenshot, Tabs, and Console sections in BrowserTab.tsx +- Files changed: + - frontend/packages/inspector/src/components/debug/BrowserTab.tsx +- **What was implemented:** + - Section 3 - Screenshot: format selector (PNG/JPEG/WebP), quality input (hidden for PNG), fullPage checkbox, CSS selector input, capture button with loading state, preview image with blob URL management + - Section 4 - Tabs: list of open tabs with URL/title, active tab highlighted with green pill, per-tab Activate/Close buttons, New Tab button with URL input (Enter key support) + - Section 5 - Console: level filter pills (All/Log/Warn/Error/Info), scrollable message list with level-colored dot indicators and timestamps, auto-refresh every 3s when active +- **Learnings for future iterations:** + - `createScreenshotUrl` helper converts Uint8Array to blob URL; must be paired with `revokeScreenshotUrl` for cleanup + - `desktop-window-item` and `desktop-window-focused` CSS classes work well for any list item with active state highlighting (not just windows) + - `desktop-screenshot-controls` and `desktop-screenshot-frame`/`desktop-screenshot-image` CSS classes are reusable across browser and desktop screenshot sections + - Console auto-refresh at 3s interval is distinct from status auto-refresh at 5s; both use the same useEffect + setInterval pattern with cleanup + - `getBrowserConsole({ level })` accepts a level filter param; passing empty object gets all levels + - Tabs and console are loaded eagerly when browser becomes active via a `status?.state === "active"` useEffect dependency + - Manual browser verification needed (no browser testing tools available in this environment) +--- + +## 2026-03-17 - US-028 +- Added 5 new sections to BrowserTab.tsx: Network, Content Tools, Recording, Contexts, Diagnostics +- Files changed: 
frontend/packages/inspector/src/components/debug/BrowserTab.tsx +- **What was implemented:** + - Section 6 - Network: request list with method/URL/status/size/duration, URL pattern filter input, auto-refresh every 3s + - Section 7 - Content Tools: Get HTML, Get Markdown, Get Links, Get Snapshot buttons with readonly output textarea + - Section 8 - Recording: reuses desktop recording API (startDesktopRecording/stopDesktopRecording/listDesktopRecordings/downloadDesktopRecording/deleteDesktopRecording), FPS input, start/stop buttons, recording list with download/delete, poll while recording active + - Section 9 - Contexts: list browser contexts with name/id/size/date, create form, delete button, Use button to set contextId, refresh button + - Section 10 - Diagnostics: lastError details (code + message), process list with name/pid/running state/logPath +- **Learnings for future iterations:** + - Recording is a shared desktop-level feature (Xvfb recording), not browser-specific; browser and desktop tabs share the same recording API + - `downloadDesktopRecording` returns `Uint8Array` which needs the same `new Uint8Array(bytes.byteLength); payload.set(bytes)` workaround for Blob creation (TypeScript ArrayBufferLike vs ArrayBuffer type mismatch) + - Network requests use `BrowserNetworkRequest` type with `responseSize` and `duration` fields (both nullable) + - Content tools reuse existing SDK methods: getBrowserContent, getBrowserMarkdown, getBrowserLinks, getBrowserSnapshot + - Context management is available even when browser is not active (filesystem-based), so the Contexts section is always shown + - Diagnostics section conditionally renders only when there's data (lastError or processes) + - Manual browser verification needed (no browser testing tools available in this environment) +--- + +## 2026-03-17 - US-029 +- Implemented browser API integration tests +- Files changed: + - `docker/test-agent/Dockerfile` - Added chromium and browser dependency packages + - 
`server/packages/sandbox-agent/tests/browser_api.rs` - New integration test file with 7 test functions + - `server/packages/sandbox-agent/src/browser_cdp.rs` - Fixed CdpClient to connect to page endpoint instead of browser endpoint +- Test coverage: + - `v1_browser_status_reports_install_required_when_chromium_missing` - Missing deps detection + - `v1_browser_lifecycle_and_navigation` - Start, status, navigate, back, forward, reload, stop + - `v1_browser_tabs_management` - List, create, activate, close tabs + - `v1_browser_screenshots` - PNG, JPEG, WebP screenshot capture + - `v1_browser_content_extraction` - HTML, markdown, links, accessibility snapshot + - `v1_browser_interaction` - Click button, type text, verify state via execute + - `v1_browser_contexts_management` - Create, list, delete persistent browser profiles +- **Learnings for future iterations:** + - CdpClient must connect to a page-level endpoint (`/json/list` → first page), not the browser-level endpoint (`/json/version`). Browser endpoints only support Target/Browser domains; Page/Runtime/DOM commands need page sessions. + - The CDP proxy endpoint (`/v1/browser/cdp`) correctly uses the browser-level URL since external tools (Playwright/Puppeteer) handle session management themselves. + - Test files can be written into the container via `PUT /v1/fs/file?path=...` and then navigated to via `file:///` URLs. + - Docker image rebuild is triggered by `OnceLock` in the test harness; changing the Dockerfile or server binary invalidates the cached image tag. + - `reqwest::Client.query(&[("path", path)])` properly URL-encodes query parameters (no need for `urlencoding` crate). 
+--- + +## 2026-03-17 - US-030 +- Replaced fixed 500ms `tokio::time::sleep` in `browser_crawl.rs` with a `document.readyState` polling loop +- Polls every 100ms via `Runtime.evaluate`, times out after 10s, proceeds with extraction on timeout +- Files changed: `server/packages/sandbox-agent/src/browser_crawl.rs` +- **Learnings for future iterations:** + - CDP `Runtime.evaluate` with `document.readyState` is reliable for detecting page load completion + - Using `std::time::Instant` for timeout tracking avoids drift issues compared to counting iterations + - Graceful timeout (proceed anyway) is better than failing the crawl when a page is slow +--- + +## 2026-03-17 - US-031 +- Replaced faked `200` status with real HTTP status from `Network.responseReceived` CDP events +- Enabled `Network.enable` domain before crawl loop +- Subscribe to `Network.responseReceived` once, drain buffered events after readyState polling +- Added `drain_navigation_status()` helper that takes last Document response for a frame (handles redirects) +- Added `errorText` check on `Page.navigate` result: if navigation fails, record page with `None` status and skip extraction +- Files changed: `server/packages/sandbox-agent/src/browser_crawl.rs` +- **Learnings for future iterations:** + - `Network.responseReceived` events have `type` field; use `"Document"` to filter for the main navigation response + - For redirect chains, the last Document `Network.responseReceived` event has the final status code + - `Page.navigate` returns `errorText` (non-empty string) when navigation fails (DNS error, connection refused, etc.) 
+ - `mpsc::UnboundedReceiver::try_recv()` is useful for non-blocking drain of buffered events + - `file://` URLs don't produce Network events, so status will be `None` - this is correct behavior +--- + +## 2026-03-17 - US-032 +- Removed dead `pub async fn cdp_client()` method from BrowserRuntime (browser_runtime.rs:552-564) +- Method always returned `Err(BrowserProblem::cdp_error("Use with_cdp() to execute CDP commands"))` - no callers existed +- Grep confirmed zero references to `cdp_client()` method; only the `cdp_client` field on BrowserRuntimeState is used +- Files changed: `server/packages/sandbox-agent/src/browser_runtime.rs` +- **Learnings for future iterations:** + - When removing methods, grep for the method name across the entire src directory to confirm no callers + - The `cdp_client` field on BrowserRuntimeState and the `cdp_client()` method on BrowserRuntime are different things - field is actively used +--- diff --git a/scripts/ralph/ralph.sh b/scripts/ralph/ralph.sh new file mode 100755 index 0000000..d510f20 --- /dev/null +++ b/scripts/ralph/ralph.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Ralph Wiggum - Long-running AI agent loop +# Usage: ./ralph.sh [--tool amp|claude] [max_iterations] + +set -e + +# Parse arguments +TOOL="amp" # Default to amp for backwards compatibility +MAX_ITERATIONS=10 + +while [[ $# -gt 0 ]]; do + case $1 in + --tool) + TOOL="$2" + shift 2 + ;; + --tool=*) + TOOL="${1#*=}" + shift + ;; + *) + # Assume it's max_iterations if it's a number + if [[ "$1" =~ ^[0-9]+$ ]]; then + MAX_ITERATIONS="$1" + fi + shift + ;; + esac +done + +# Validate tool choice +if [[ "$TOOL" != "amp" && "$TOOL" != "claude" ]]; then + echo "Error: Invalid tool '$TOOL'. Must be 'amp' or 'claude'." 
+ exit 1 +fi +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PRD_FILE="$SCRIPT_DIR/prd.json" +PROGRESS_FILE="$SCRIPT_DIR/progress.txt" +ARCHIVE_DIR="$SCRIPT_DIR/archive" +LAST_BRANCH_FILE="$SCRIPT_DIR/.last-branch" + +# Archive previous run if branch changed +if [ -f "$PRD_FILE" ] && [ -f "$LAST_BRANCH_FILE" ]; then + CURRENT_BRANCH=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") + LAST_BRANCH=$(cat "$LAST_BRANCH_FILE" 2>/dev/null || echo "") + + if [ -n "$CURRENT_BRANCH" ] && [ -n "$LAST_BRANCH" ] && [ "$CURRENT_BRANCH" != "$LAST_BRANCH" ]; then + # Archive the previous run + DATE=$(date +%Y-%m-%d) + # Strip "ralph/" prefix from branch name for folder + FOLDER_NAME=$(echo "$LAST_BRANCH" | sed 's|^ralph/||') + ARCHIVE_FOLDER="$ARCHIVE_DIR/$DATE-$FOLDER_NAME" + + echo "Archiving previous run: $LAST_BRANCH" + mkdir -p "$ARCHIVE_FOLDER" + [ -f "$PRD_FILE" ] && cp "$PRD_FILE" "$ARCHIVE_FOLDER/" + [ -f "$PROGRESS_FILE" ] && cp "$PROGRESS_FILE" "$ARCHIVE_FOLDER/" + echo " Archived to: $ARCHIVE_FOLDER" + + # Reset progress file for new run + echo "# Ralph Progress Log" > "$PROGRESS_FILE" + echo "Started: $(date)" >> "$PROGRESS_FILE" + echo "---" >> "$PROGRESS_FILE" + fi +fi + +# Track current branch +if [ -f "$PRD_FILE" ]; then + CURRENT_BRANCH=$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || echo "") + if [ -n "$CURRENT_BRANCH" ]; then + echo "$CURRENT_BRANCH" > "$LAST_BRANCH_FILE" + fi +fi + +# Initialize progress file if it doesn't exist +if [ ! 
-f "$PROGRESS_FILE" ]; then + echo "# Ralph Progress Log" > "$PROGRESS_FILE" + echo "Started: $(date)" >> "$PROGRESS_FILE" + echo "---" >> "$PROGRESS_FILE" +fi + +RUN_START=$(date '+%Y-%m-%d %H:%M:%S') +echo "Starting Ralph - Tool: $TOOL - Max iterations: $MAX_ITERATIONS" +echo "Run started: $RUN_START" + +for i in $(seq 1 $MAX_ITERATIONS); do + ITER_START=$(date '+%Y-%m-%d %H:%M:%S') + echo "" + echo "===============================================================" + echo " Ralph Iteration $i of $MAX_ITERATIONS ($TOOL)" + echo " Started: $ITER_START" + echo "===============================================================" + + # Run the selected tool with the ralph prompt + if [[ "$TOOL" == "amp" ]]; then + OUTPUT=$(cat "$SCRIPT_DIR/prompt.md" | amp --dangerously-allow-all 2>&1 | tee /dev/stderr) || true + else + # Claude Code: use --dangerously-skip-permissions for autonomous operation, --print for output + OUTPUT=$(claude --dangerously-skip-permissions --print < "$SCRIPT_DIR/CLAUDE.md" 2>&1 | tee /dev/stderr) || true + fi + + ITER_END=$(date '+%Y-%m-%d %H:%M:%S') + ITER_DURATION=$(($(date -d "$ITER_END" +%s) - $(date -d "$ITER_START" +%s))) + ITER_MINS=$((ITER_DURATION / 60)) + ITER_SECS=$((ITER_DURATION % 60)) + + # Check for completion signal + if echo "$OUTPUT" | grep -q "COMPLETE"; then + RUN_END=$(date '+%Y-%m-%d %H:%M:%S') + RUN_DURATION=$(($(date -d "$RUN_END" +%s) - $(date -d "$RUN_START" +%s))) + RUN_MINS=$((RUN_DURATION / 60)) + RUN_SECS=$((RUN_DURATION % 60)) + echo "" + echo "Ralph completed all tasks!" + echo "Completed at iteration $i of $MAX_ITERATIONS" + echo "Iteration: ${ITER_MINS}m ${ITER_SECS}s" + echo "Run started: $RUN_START" + echo "Run finished: $RUN_END (total: ${RUN_MINS}m ${RUN_SECS}s)" + exit 0 + fi + + echo "Iteration $i complete. 
Finished: $ITER_END (${ITER_MINS}m ${ITER_SECS}s)" + sleep 2 +done + +RUN_END=$(date '+%Y-%m-%d %H:%M:%S') +RUN_DURATION=$(($(date -d "$RUN_END" +%s) - $(date -d "$RUN_START" +%s))) +RUN_MINS=$((RUN_DURATION / 60)) +RUN_SECS=$((RUN_DURATION % 60)) +echo "" +echo "Ralph reached max iterations ($MAX_ITERATIONS) without completing all tasks." +echo "Run started: $RUN_START" +echo "Run finished: $RUN_END (total: ${RUN_MINS}m ${RUN_SECS}s)" +echo "Check $PROGRESS_FILE for status." +exit 1 diff --git a/server/packages/sandbox-agent/src/browser_runtime.rs b/server/packages/sandbox-agent/src/browser_runtime.rs index b555225..ea8233d 100644 --- a/server/packages/sandbox-agent/src/browser_runtime.rs +++ b/server/packages/sandbox-agent/src/browser_runtime.rs @@ -24,8 +24,8 @@ use crate::process_runtime::{ ProcessOwner, ProcessRuntime, ProcessStartSpec, ProcessStatus, RestartPolicy, }; -const DEFAULT_WIDTH: u32 = 1440; -const DEFAULT_HEIGHT: u32 = 900; +const DEFAULT_WIDTH: u32 = 1280; +const DEFAULT_HEIGHT: u32 = 720; const DEFAULT_DPI: u32 = 96; const DEFAULT_DISPLAY_NUM: i32 = 98; const MAX_DISPLAY_PROBE: i32 = 10;