feat: [US-011] - Add browser content extraction endpoints (HTML, markdown, links, snapshot)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 04:03:31 +00:00 · 2026-03-17 05:32:21 -07:00 · 2026-03-17 05:32:21 -07:00 · 1ae732d5b6
commit 1ae732d5b6
parent 45258c32b0
2 changed files with 246 additions and 0 deletions
--- a/server/packages/sandbox-agent/Cargo.toml
+++ b/server/packages/sandbox-agent/Cargo.toml
@ -42,6 +42,7 @@ toml_edit.workspace = true
 tar.workspace = true
 zip.workspace = true
 tokio-tungstenite = "0.24"
+html2md = "0.2"
 tempfile = { workspace = true, optional = true }

 [target.'cfg(unix)'.dependencies]
--- a/server/packages/sandbox-agent/src/router.rs
+++ b/server/packages/sandbox-agent/src/router.rs
@ -292,6 +292,10 @@ pub fn build_router_with_state(shared: Arc<AppState>) -> (Router, Arc<AppState>)
        .route("/browser/tabs/:tab_id", delete(delete_v1_browser_tab))
        .route("/browser/screenshot", get(get_v1_browser_screenshot))
        .route("/browser/pdf", get(get_v1_browser_pdf))
+        .route("/browser/content", get(get_v1_browser_content))
+        .route("/browser/markdown", get(get_v1_browser_markdown))
+        .route("/browser/links", get(get_v1_browser_links))
+        .route("/browser/snapshot", get(get_v1_browser_snapshot))
        .route("/agents", get(get_v1_agents))
        .route("/agents/:agent", get(get_v1_agent))
        .route("/agents/:agent/install", post(post_v1_agent_install))
@ -494,6 +498,10 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
        delete_v1_browser_tab,
        get_v1_browser_screenshot,
        get_v1_browser_pdf,
+        get_v1_browser_content,
+        get_v1_browser_markdown,
+        get_v1_browser_links,
+        get_v1_browser_snapshot,
        get_v1_agents,
        get_v1_agent,
        post_v1_agent_install,
@ -581,6 +589,12 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
            BrowserScreenshotFormat,
            BrowserPdfQuery,
            BrowserPdfFormat,
+            BrowserContentQuery,
+            BrowserContentResponse,
+            BrowserMarkdownResponse,
+            BrowserLinkInfo,
+            BrowserLinksResponse,
+            BrowserSnapshotResponse,
            DesktopClipboardResponse,
            DesktopClipboardQuery,
            DesktopClipboardWriteRequest,
@ -1663,6 +1677,237 @@ async fn get_v1_browser_pdf(
        .into_response())
 }

+/// Get the HTML content of the current browser page.
+///
+/// Without a `selector` query parameter the handler evaluates
+/// `document.documentElement.outerHTML` in the page. With a selector it
+/// evaluates `document.querySelector(<selector>)` and returns that element's
+/// `outerHTML`. The current URL and title are returned alongside the HTML.
+///
+/// # Errors
+///
+/// Propagates errors from acquiring the CDP client, from
+/// `get_page_info_via_cdp`, and from the `Runtime.evaluate` call. Returns a
+/// not-found problem when a selector was supplied and the evaluation did not
+/// produce a non-empty string value.
+#[utoipa::path(
+    get,
+    path = "/v1/browser/content",
+    tag = "v1",
+    params(BrowserContentQuery),
+    responses(
+        (status = 200, description = "Page HTML content", body = BrowserContentResponse),
+        (status = 404, description = "Element not found", body = ProblemDetails),
+        (status = 409, description = "Browser runtime is not active", body = ProblemDetails),
+        (status = 502, description = "CDP command failed", body = ProblemDetails)
+    )
+)]
+async fn get_v1_browser_content(
+    State(state): State<Arc<AppState>>,
+    Query(query): Query<BrowserContentQuery>,
+) -> Result<Json<BrowserContentResponse>, ApiError> {
+    let cdp = state.browser_runtime().get_cdp().await?;
+    let (url, title) = get_page_info_via_cdp(&cdp).await?;
+
+    let selector = query.selector.as_deref();
+
+    let expression = match selector {
+        Some(selector) => {
+            // Embed the selector as a JSON string literal rather than hand-escaping
+            // it: JSON escapes quotes, backslashes and control characters (including
+            // newlines), so arbitrary selector text cannot terminate the literal or
+            // break the generated script.
+            let literal = serde_json::Value::from(selector).to_string();
+            format!(
+                "(function() {{ var el = document.querySelector({}); return el ? el.outerHTML : null; }})()",
+                literal
+            )
+        }
+        None => "document.documentElement.outerHTML".to_string(),
+    };
+
+    let result = cdp
+        .send(
+            "Runtime.evaluate",
+            Some(serde_json::json!({
+                "expression": expression,
+                "returnByValue": true
+            })),
+        )
+        .await?;
+
+    // A missing or non-string `result.value` is treated as empty HTML.
+    let html = result
+        .get("result")
+        .and_then(|r| r.get("value"))
+        .and_then(|v| v.as_str())
+        .unwrap_or("")
+        .to_string();
+
+    // Only a selector lookup can legitimately come back empty (the script
+    // returns `null` when nothing matches); report that as not-found.
+    if let Some(selector) = selector {
+        if html.is_empty() {
+            return Err(
+                BrowserProblem::not_found(&format!("Element not found: {}", selector)).into(),
+            );
+        }
+    }
+
+    Ok(Json(BrowserContentResponse { html, url, title }))
+}
+
+/// Get the page content as Markdown.
+///
+/// Evaluates a script in the page that deep-clones `document.body`, removes
+/// `nav`, `footer`, `aside` and `header` elements as well as elements with
+/// `role="navigation"`, `role="banner"` or `role="contentinfo"` from the
+/// clone, and returns the clone's `innerHTML`. That HTML is then converted to
+/// Markdown with `html2md::parse_html`. The current URL and title are
+/// returned alongside the Markdown.
+#[utoipa::path(
+    get,
+    path = "/v1/browser/markdown",
+    tag = "v1",
+    responses(
+        (status = 200, description = "Page content as Markdown", body = BrowserMarkdownResponse),
+        (status = 409, description = "Browser runtime is not active", body = ProblemDetails),
+        (status = 502, description = "CDP command failed", body = ProblemDetails)
+    )
+)]
+async fn get_v1_browser_markdown(
+    State(state): State<Arc<AppState>>,
+) -> Result<Json<BrowserMarkdownResponse>, ApiError> {
+    let cdp = state.browser_runtime().get_cdp().await?;
+    let (url, title) = get_page_info_via_cdp(&cdp).await?;
+
+    // In-page script: works on a clone of <body>, so the removals below do not
+    // touch the live document.
+    let expression = r#"
+        (function() {
+            var clone = document.body.cloneNode(true);
+            var selectors = ['nav', 'footer', 'aside', 'header', '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]'];
+            selectors.forEach(function(sel) {
+                clone.querySelectorAll(sel).forEach(function(el) { el.remove(); });
+            });
+            return clone.innerHTML;
+        })()
+    "#;
+
+    let params = serde_json::json!({
+        "expression": expression,
+        "returnByValue": true
+    });
+    let evaluated = cdp.send("Runtime.evaluate", Some(params)).await?;
+
+    // A missing or non-string `result.value` converts as empty HTML.
+    let body_html = match evaluated.get("result").and_then(|r| r.get("value")) {
+        Some(value) => value.as_str().unwrap_or(""),
+        None => "",
+    };
+
+    Ok(Json(BrowserMarkdownResponse {
+        markdown: html2md::parse_html(body_html),
+        url,
+        title,
+    }))
+}
+
+/// Get all links on the current page.
+///
+/// Evaluates a script in the page that collects every `a[href]` element into
+/// `{ href, text }` objects (`text` is the trimmed `textContent`) and returns
+/// them as a JSON string, which is then deserialized into
+/// `BrowserLinkInfo` values. The current page URL is returned alongside the
+/// links.
+///
+/// If the evaluation yields no string, or the string does not deserialize,
+/// the response carries an empty link list rather than an error.
+#[utoipa::path(
+    get,
+    path = "/v1/browser/links",
+    tag = "v1",
+    responses(
+        (status = 200, description = "Links on the page", body = BrowserLinksResponse),
+        (status = 409, description = "Browser runtime is not active", body = ProblemDetails),
+        (status = 502, description = "CDP command failed", body = ProblemDetails)
+    )
+)]
+async fn get_v1_browser_links(
+    State(state): State<Arc<AppState>>,
+) -> Result<Json<BrowserLinksResponse>, ApiError> {
+    let cdp = state.browser_runtime().get_cdp().await?;
+    let (url, _title) = get_page_info_via_cdp(&cdp).await?;
+
+    // In-page script: serializes the anchors itself so the value crosses CDP
+    // as a single string.
+    let expression = r#"
+        (function() {
+            var links = [];
+            document.querySelectorAll('a[href]').forEach(function(a) {
+                links.push({ href: a.href, text: (a.textContent || '').trim() });
+            });
+            return JSON.stringify(links);
+        })()
+    "#;
+
+    let params = serde_json::json!({
+        "expression": expression,
+        "returnByValue": true
+    });
+    let evaluated = cdp.send("Runtime.evaluate", Some(params)).await?;
+
+    let serialized = evaluated
+        .get("result")
+        .and_then(|r| r.get("value"))
+        .and_then(|v| v.as_str());
+
+    // Best-effort decode: no payload or an undecodable payload both mean "no links".
+    let links: Vec<BrowserLinkInfo> = match serialized {
+        Some(text) => serde_json::from_str(text).unwrap_or_default(),
+        None => Vec::new(),
+    };
+
+    Ok(Json(BrowserLinksResponse { links, url }))
+}
+
+/// Get an accessibility tree snapshot of the current page.
+///
+/// Calls CDP `Accessibility.getFullAXTree` and renders the returned `nodes`
+/// array as plain text, one node per line in the order received. Each line is
+/// `role: name`, or just `role` when the node has no name. Nodes whose role is
+/// `none` or `GenericContainer`, and nodes with neither role nor name, are
+/// omitted. The output is flat: no indentation or parent/child structure is
+/// emitted. The current URL and title are returned alongside the snapshot.
+#[utoipa::path(
+    get,
+    path = "/v1/browser/snapshot",
+    tag = "v1",
+    responses(
+        (status = 200, description = "Accessibility tree snapshot", body = BrowserSnapshotResponse),
+        (status = 409, description = "Browser runtime is not active", body = ProblemDetails),
+        (status = 502, description = "CDP command failed", body = ProblemDetails)
+    )
+)]
+async fn get_v1_browser_snapshot(
+    State(state): State<Arc<AppState>>,
+) -> Result<Json<BrowserSnapshotResponse>, ApiError> {
+    let cdp = state.browser_runtime().get_cdp().await?;
+    let (url, title) = get_page_info_via_cdp(&cdp).await?;
+
+    let result = cdp.send("Accessibility.getFullAXTree", None).await?;
+
+    // Borrow the node list instead of cloning it: the tree can be large and is
+    // only read here. A missing or non-array `nodes` is treated as empty.
+    let nodes = result
+        .get("nodes")
+        .and_then(|v| v.as_array())
+        .map(|list| list.as_slice())
+        .unwrap_or(&[]);
+
+    let mut snapshot = String::new();
+    for node in nodes {
+        let role = node
+            .get("role")
+            .and_then(|r| r.get("value"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+        let name = node
+            .get("name")
+            .and_then(|n| n.get("value"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("");
+
+        // Skip structural noise and nodes that carry no information at all.
+        if matches!(role, "none" | "GenericContainer") || (role.is_empty() && name.is_empty()) {
+            continue;
+        }
+
+        if !snapshot.is_empty() {
+            snapshot.push('\n');
+        }
+        // Append directly to the buffer; no per-node temporary String.
+        snapshot.push_str(role);
+        if !name.is_empty() {
+            snapshot.push_str(": ");
+            snapshot.push_str(name);
+        }
+    }
+
+    Ok(Json(BrowserSnapshotResponse {
+        snapshot,
+        url,
+        title,
+    }))
+}
+
 /// Helper: get the current page URL and title via CDP Runtime.evaluate.
 async fn get_page_info_via_cdp(
    cdp: &crate::browser_cdp::CdpClient,