mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 07:04:48 +00:00
feat: [US-018] - Add browser crawl endpoint
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8223383858
commit
806acdf199
4 changed files with 297 additions and 0 deletions
|
|
@ -43,6 +43,7 @@ tar.workspace = true
|
|||
zip.workspace = true
|
||||
tokio-tungstenite = "0.24"
|
||||
html2md = "0.2"
|
||||
url.workspace = true
|
||||
tempfile = { workspace = true, optional = true }
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
|
|
|
|||
264
server/packages/sandbox-agent/src/browser_crawl.rs
Normal file
264
server/packages/sandbox-agent/src/browser_crawl.rs
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
use std::collections::{HashSet, VecDeque};
|
||||
|
||||
use url::Url;
|
||||
|
||||
use crate::browser_cdp::CdpClient;
|
||||
use crate::browser_errors::BrowserProblem;
|
||||
use crate::browser_types::{
|
||||
BrowserCrawlExtract, BrowserCrawlPage, BrowserCrawlRequest, BrowserCrawlResponse,
|
||||
};
|
||||
|
||||
/// Perform a BFS crawl starting from the given URL.
///
/// Navigates to each page via CDP, extracts content according to the requested
/// format, collects links, and follows them breadth-first within the configured
/// domain and depth limits.
///
/// # Errors
/// Returns a `BrowserProblem` if the start URL is unparseable or any CDP
/// command fails; a failure on one page aborts the whole crawl.
pub async fn crawl_pages(
    cdp: &CdpClient,
    request: &BrowserCrawlRequest,
) -> Result<BrowserCrawlResponse, BrowserProblem> {
    // Defaults: 10 pages / depth 2; page budget is hard-capped at 100.
    let max_pages = request.max_pages.unwrap_or(10).min(100) as usize;
    let max_depth = request.max_depth.unwrap_or(2);
    let extract = request.extract.unwrap_or(BrowserCrawlExtract::Markdown);

    // Parse the starting URL to determine the default allowed domain.
    let start_url = Url::parse(&request.url)
        .map_err(|e| BrowserProblem::cdp_error(format!("Invalid start URL: {e}")))?;

    let allowed_domains: HashSet<String> = if let Some(ref domains) = request.allowed_domains {
        domains.iter().cloned().collect()
    } else {
        // Default: only crawl same domain as start URL.
        let mut set = HashSet::new();
        if let Some(host) = start_url.host_str() {
            set.insert(host.to_string());
        }
        set
    };

    // BFS state: `visited` holds fragment-stripped URLs for dedup;
    // `queue` pairs each pending URL with its crawl depth.
    let mut visited: HashSet<String> = HashSet::new();
    let mut queue: VecDeque<(String, u32)> = VecDeque::new();
    let mut pages: Vec<BrowserCrawlPage> = Vec::new();

    queue.push_back((request.url.clone(), 0));
    visited.insert(normalize_url(&request.url));

    cdp.send("Page.enable", None).await?;

    while let Some((url, depth)) = queue.pop_front() {
        if pages.len() >= max_pages {
            break;
        }

        // Navigate to the page.
        let nav_result = cdp
            .send("Page.navigate", Some(serde_json::json!({ "url": url })))
            .await?;

        // NOTE(review): status is synthesized — 200 whenever navigation returned
        // a frameId; the real HTTP status is never observed. Confirm this matches
        // the intended contract for `BrowserCrawlPage.status`.
        let status = nav_result.get("frameId").map(|_| 200u16);

        // Wait for load. Fixed delay rather than waiting for the Page load
        // event, so slow pages may be captured before fully rendering.
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;

        // Get page info (post-navigation URL and document title).
        let (page_url, title) = get_page_info(cdp).await?;

        // Extract content based on requested mode.
        let content = extract_content(cdp, extract).await?;

        // Collect links for further crawling.
        let links = extract_links(cdp).await?;

        pages.push(BrowserCrawlPage {
            url: page_url,
            title,
            content,
            links: links.clone(),
            status,
            depth,
        });

        // Enqueue discovered links if we haven't reached max depth.
        if depth < max_depth {
            for link in &links {
                let normalized = normalize_url(link);
                if visited.contains(&normalized) {
                    continue;
                }
                if let Ok(parsed) = Url::parse(link) {
                    // Follow only http(s) links; skip mailto:, javascript:, etc.
                    if parsed.scheme() != "http" && parsed.scheme() != "https" {
                        continue;
                    }
                    if let Some(host) = parsed.host_str() {
                        if !allowed_domains.is_empty() && !allowed_domains.contains(host) {
                            continue;
                        }
                    }
                    // Mark visited at enqueue time so the same URL is never queued twice.
                    visited.insert(normalized);
                    queue.push_back((link.clone(), depth + 1));
                }
            }
        }
    }

    let total_pages = pages.len() as u32;
    // truncated = the page budget ran out while links were still pending.
    let truncated = !queue.is_empty();

    Ok(BrowserCrawlResponse {
        pages,
        total_pages,
        truncated,
    })
}
|
||||
|
||||
/// Normalize a URL by removing the fragment for deduplication.
|
||||
fn normalize_url(url: &str) -> String {
|
||||
if let Ok(mut parsed) = Url::parse(url) {
|
||||
parsed.set_fragment(None);
|
||||
parsed.to_string()
|
||||
} else {
|
||||
url.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the current page URL and title via CDP Runtime.evaluate.
|
||||
async fn get_page_info(cdp: &CdpClient) -> Result<(String, String), BrowserProblem> {
|
||||
let url_result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": "document.location.href",
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
let url = url_result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
let title_result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": "document.title",
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
let title = title_result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
Ok((url, title))
|
||||
}
|
||||
|
||||
/// Extract page content according to the requested format.
|
||||
async fn extract_content(
|
||||
cdp: &CdpClient,
|
||||
extract: BrowserCrawlExtract,
|
||||
) -> Result<String, BrowserProblem> {
|
||||
match extract {
|
||||
BrowserCrawlExtract::Html => {
|
||||
let result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": "document.documentElement.outerHTML",
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
Ok(result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string())
|
||||
}
|
||||
BrowserCrawlExtract::Text => {
|
||||
let result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": "document.body.innerText",
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
Ok(result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string())
|
||||
}
|
||||
BrowserCrawlExtract::Markdown => {
|
||||
let expression = r#"
|
||||
(function() {
|
||||
var clone = document.body.cloneNode(true);
|
||||
var selectors = ['nav', 'footer', 'aside', 'header', '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]'];
|
||||
selectors.forEach(function(sel) {
|
||||
clone.querySelectorAll(sel).forEach(function(el) { el.remove(); });
|
||||
});
|
||||
return clone.innerHTML;
|
||||
})()
|
||||
"#;
|
||||
let result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": expression,
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
let html = result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("");
|
||||
Ok(html2md::parse_html(html))
|
||||
}
|
||||
BrowserCrawlExtract::Links => {
|
||||
// For "links" extraction, content is empty; links are in the links field.
|
||||
Ok(String::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all http/https links from the current page.
|
||||
async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
|
||||
let expression = r#"
|
||||
(function() {
|
||||
var links = [];
|
||||
document.querySelectorAll('a[href]').forEach(function(a) {
|
||||
if (a.href && a.href.startsWith('http')) {
|
||||
links.push(a.href);
|
||||
}
|
||||
});
|
||||
return JSON.stringify(links);
|
||||
})()
|
||||
"#;
|
||||
let result = cdp
|
||||
.send(
|
||||
"Runtime.evaluate",
|
||||
Some(serde_json::json!({
|
||||
"expression": expression,
|
||||
"returnByValue": true
|
||||
})),
|
||||
)
|
||||
.await?;
|
||||
let json_str = result
|
||||
.get("result")
|
||||
.and_then(|r| r.get("value"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("[]");
|
||||
let links: Vec<String> = serde_json::from_str(json_str).unwrap_or_default();
|
||||
Ok(links)
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
mod acp_proxy_runtime;
|
||||
mod browser_cdp;
|
||||
mod browser_context;
|
||||
mod browser_crawl;
|
||||
mod browser_errors;
|
||||
mod browser_install;
|
||||
mod browser_runtime;
|
||||
|
|
|
|||
|
|
@ -307,6 +307,7 @@ pub fn build_router_with_state(shared: Arc<AppState>) -> (Router, Arc<AppState>)
|
|||
.route("/browser/dialog", post(post_v1_browser_dialog))
|
||||
.route("/browser/console", get(get_v1_browser_console))
|
||||
.route("/browser/network", get(get_v1_browser_network))
|
||||
.route("/browser/crawl", post(post_v1_browser_crawl))
|
||||
.route(
|
||||
"/browser/contexts",
|
||||
get(get_v1_browser_contexts).post(post_v1_browser_contexts),
|
||||
|
|
@ -544,6 +545,7 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
|
|||
get_v1_browser_cookies,
|
||||
post_v1_browser_cookies,
|
||||
delete_v1_browser_cookies,
|
||||
post_v1_browser_crawl,
|
||||
get_v1_agents,
|
||||
get_v1_agent,
|
||||
post_v1_agent_install,
|
||||
|
|
@ -664,6 +666,10 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
|
|||
BrowserCookiesResponse,
|
||||
BrowserSetCookiesRequest,
|
||||
BrowserDeleteCookiesQuery,
|
||||
BrowserCrawlRequest,
|
||||
BrowserCrawlExtract,
|
||||
BrowserCrawlPage,
|
||||
BrowserCrawlResponse,
|
||||
DesktopClipboardResponse,
|
||||
DesktopClipboardQuery,
|
||||
DesktopClipboardWriteRequest,
|
||||
|
|
@ -2975,6 +2981,31 @@ async fn delete_v1_browser_cookies(
|
|||
Ok(Json(BrowserActionResponse { ok: true }))
|
||||
}
|
||||
|
||||
/// Crawl multiple pages starting from a URL.
|
||||
///
|
||||
/// Performs a breadth-first crawl: navigates to each page, extracts content in
|
||||
/// the requested format, collects links, and follows them within the configured
|
||||
/// domain and depth limits.
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/v1/browser/crawl",
|
||||
tag = "v1",
|
||||
request_body = BrowserCrawlRequest,
|
||||
responses(
|
||||
(status = 200, description = "Crawl results", body = BrowserCrawlResponse),
|
||||
(status = 409, description = "Browser runtime is not active", body = ProblemDetails),
|
||||
(status = 502, description = "CDP command failed", body = ProblemDetails)
|
||||
)
|
||||
)]
|
||||
async fn post_v1_browser_crawl(
|
||||
State(state): State<Arc<AppState>>,
|
||||
Json(body): Json<BrowserCrawlRequest>,
|
||||
) -> Result<Json<BrowserCrawlResponse>, ApiError> {
|
||||
let cdp = state.browser_runtime().get_cdp().await?;
|
||||
let response = crate::browser_crawl::crawl_pages(&cdp, &body).await?;
|
||||
Ok(Json(response))
|
||||
}
|
||||
|
||||
/// Helper: get the current page URL and title via CDP Runtime.evaluate.
|
||||
async fn get_page_info_via_cdp(
|
||||
cdp: &crate::browser_cdp::CdpClient,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue