mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 07:04:48 +00:00
feat: [US-018] - Add browser crawl endpoint
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8223383858
commit
806acdf199
4 changed files with 297 additions and 0 deletions
|
|
@ -43,6 +43,7 @@ tar.workspace = true
|
||||||
zip.workspace = true
|
zip.workspace = true
|
||||||
tokio-tungstenite = "0.24"
|
tokio-tungstenite = "0.24"
|
||||||
html2md = "0.2"
|
html2md = "0.2"
|
||||||
|
url.workspace = true
|
||||||
tempfile = { workspace = true, optional = true }
|
tempfile = { workspace = true, optional = true }
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
|
|
|
||||||
264
server/packages/sandbox-agent/src/browser_crawl.rs
Normal file
264
server/packages/sandbox-agent/src/browser_crawl.rs
Normal file
|
|
@ -0,0 +1,264 @@
|
||||||
|
use std::collections::{HashSet, VecDeque};
|
||||||
|
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::browser_cdp::CdpClient;
|
||||||
|
use crate::browser_errors::BrowserProblem;
|
||||||
|
use crate::browser_types::{
|
||||||
|
BrowserCrawlExtract, BrowserCrawlPage, BrowserCrawlRequest, BrowserCrawlResponse,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Perform a BFS crawl starting from the given URL.
|
||||||
|
///
|
||||||
|
/// Navigates to each page via CDP, extracts content according to the requested
|
||||||
|
/// format, collects links, and follows them breadth-first within the configured
|
||||||
|
/// domain and depth limits.
|
||||||
|
pub async fn crawl_pages(
|
||||||
|
cdp: &CdpClient,
|
||||||
|
request: &BrowserCrawlRequest,
|
||||||
|
) -> Result<BrowserCrawlResponse, BrowserProblem> {
|
||||||
|
let max_pages = request.max_pages.unwrap_or(10).min(100) as usize;
|
||||||
|
let max_depth = request.max_depth.unwrap_or(2);
|
||||||
|
let extract = request.extract.unwrap_or(BrowserCrawlExtract::Markdown);
|
||||||
|
|
||||||
|
// Parse the starting URL to determine the default allowed domain.
|
||||||
|
let start_url = Url::parse(&request.url)
|
||||||
|
.map_err(|e| BrowserProblem::cdp_error(format!("Invalid start URL: {e}")))?;
|
||||||
|
|
||||||
|
let allowed_domains: HashSet<String> = if let Some(ref domains) = request.allowed_domains {
|
||||||
|
domains.iter().cloned().collect()
|
||||||
|
} else {
|
||||||
|
// Default: only crawl same domain as start URL.
|
||||||
|
let mut set = HashSet::new();
|
||||||
|
if let Some(host) = start_url.host_str() {
|
||||||
|
set.insert(host.to_string());
|
||||||
|
}
|
||||||
|
set
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut visited: HashSet<String> = HashSet::new();
|
||||||
|
let mut queue: VecDeque<(String, u32)> = VecDeque::new();
|
||||||
|
let mut pages: Vec<BrowserCrawlPage> = Vec::new();
|
||||||
|
|
||||||
|
queue.push_back((request.url.clone(), 0));
|
||||||
|
visited.insert(normalize_url(&request.url));
|
||||||
|
|
||||||
|
cdp.send("Page.enable", None).await?;
|
||||||
|
|
||||||
|
while let Some((url, depth)) = queue.pop_front() {
|
||||||
|
if pages.len() >= max_pages {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Navigate to the page.
|
||||||
|
let nav_result = cdp
|
||||||
|
.send("Page.navigate", Some(serde_json::json!({ "url": url })))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let status = nav_result.get("frameId").map(|_| 200u16);
|
||||||
|
|
||||||
|
// Wait for load.
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
||||||
|
|
||||||
|
// Get page info.
|
||||||
|
let (page_url, title) = get_page_info(cdp).await?;
|
||||||
|
|
||||||
|
// Extract content based on requested mode.
|
||||||
|
let content = extract_content(cdp, extract).await?;
|
||||||
|
|
||||||
|
// Collect links for further crawling.
|
||||||
|
let links = extract_links(cdp).await?;
|
||||||
|
|
||||||
|
pages.push(BrowserCrawlPage {
|
||||||
|
url: page_url,
|
||||||
|
title,
|
||||||
|
content,
|
||||||
|
links: links.clone(),
|
||||||
|
status,
|
||||||
|
depth,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Enqueue discovered links if we haven't reached max depth.
|
||||||
|
if depth < max_depth {
|
||||||
|
for link in &links {
|
||||||
|
let normalized = normalize_url(link);
|
||||||
|
if visited.contains(&normalized) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Ok(parsed) = Url::parse(link) {
|
||||||
|
if parsed.scheme() != "http" && parsed.scheme() != "https" {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if let Some(host) = parsed.host_str() {
|
||||||
|
if !allowed_domains.is_empty() && !allowed_domains.contains(host) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
visited.insert(normalized);
|
||||||
|
queue.push_back((link.clone(), depth + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let total_pages = pages.len() as u32;
|
||||||
|
let truncated = !queue.is_empty();
|
||||||
|
|
||||||
|
Ok(BrowserCrawlResponse {
|
||||||
|
pages,
|
||||||
|
total_pages,
|
||||||
|
truncated,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize a URL by removing the fragment for deduplication.
|
||||||
|
fn normalize_url(url: &str) -> String {
|
||||||
|
if let Ok(mut parsed) = Url::parse(url) {
|
||||||
|
parsed.set_fragment(None);
|
||||||
|
parsed.to_string()
|
||||||
|
} else {
|
||||||
|
url.to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the current page URL and title via CDP Runtime.evaluate.
|
||||||
|
async fn get_page_info(cdp: &CdpClient) -> Result<(String, String), BrowserProblem> {
|
||||||
|
let url_result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": "document.location.href",
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let url = url_result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let title_result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": "document.title",
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let title = title_result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
Ok((url, title))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract page content according to the requested format.
|
||||||
|
async fn extract_content(
|
||||||
|
cdp: &CdpClient,
|
||||||
|
extract: BrowserCrawlExtract,
|
||||||
|
) -> Result<String, BrowserProblem> {
|
||||||
|
match extract {
|
||||||
|
BrowserCrawlExtract::Html => {
|
||||||
|
let result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": "document.documentElement.outerHTML",
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string())
|
||||||
|
}
|
||||||
|
BrowserCrawlExtract::Text => {
|
||||||
|
let result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": "document.body.innerText",
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string())
|
||||||
|
}
|
||||||
|
BrowserCrawlExtract::Markdown => {
|
||||||
|
let expression = r#"
|
||||||
|
(function() {
|
||||||
|
var clone = document.body.cloneNode(true);
|
||||||
|
var selectors = ['nav', 'footer', 'aside', 'header', '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]'];
|
||||||
|
selectors.forEach(function(sel) {
|
||||||
|
clone.querySelectorAll(sel).forEach(function(el) { el.remove(); });
|
||||||
|
});
|
||||||
|
return clone.innerHTML;
|
||||||
|
})()
|
||||||
|
"#;
|
||||||
|
let result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": expression,
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let html = result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("");
|
||||||
|
Ok(html2md::parse_html(html))
|
||||||
|
}
|
||||||
|
BrowserCrawlExtract::Links => {
|
||||||
|
// For "links" extraction, content is empty; links are in the links field.
|
||||||
|
Ok(String::new())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract all http/https links from the current page.
|
||||||
|
async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
|
||||||
|
let expression = r#"
|
||||||
|
(function() {
|
||||||
|
var links = [];
|
||||||
|
document.querySelectorAll('a[href]').forEach(function(a) {
|
||||||
|
if (a.href && a.href.startsWith('http')) {
|
||||||
|
links.push(a.href);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return JSON.stringify(links);
|
||||||
|
})()
|
||||||
|
"#;
|
||||||
|
let result = cdp
|
||||||
|
.send(
|
||||||
|
"Runtime.evaluate",
|
||||||
|
Some(serde_json::json!({
|
||||||
|
"expression": expression,
|
||||||
|
"returnByValue": true
|
||||||
|
})),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let json_str = result
|
||||||
|
.get("result")
|
||||||
|
.and_then(|r| r.get("value"))
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("[]");
|
||||||
|
let links: Vec<String> = serde_json::from_str(json_str).unwrap_or_default();
|
||||||
|
Ok(links)
|
||||||
|
}
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
mod acp_proxy_runtime;
|
mod acp_proxy_runtime;
|
||||||
mod browser_cdp;
|
mod browser_cdp;
|
||||||
mod browser_context;
|
mod browser_context;
|
||||||
|
mod browser_crawl;
|
||||||
mod browser_errors;
|
mod browser_errors;
|
||||||
mod browser_install;
|
mod browser_install;
|
||||||
mod browser_runtime;
|
mod browser_runtime;
|
||||||
|
|
|
||||||
|
|
@ -307,6 +307,7 @@ pub fn build_router_with_state(shared: Arc<AppState>) -> (Router, Arc<AppState>)
|
||||||
.route("/browser/dialog", post(post_v1_browser_dialog))
|
.route("/browser/dialog", post(post_v1_browser_dialog))
|
||||||
.route("/browser/console", get(get_v1_browser_console))
|
.route("/browser/console", get(get_v1_browser_console))
|
||||||
.route("/browser/network", get(get_v1_browser_network))
|
.route("/browser/network", get(get_v1_browser_network))
|
||||||
|
.route("/browser/crawl", post(post_v1_browser_crawl))
|
||||||
.route(
|
.route(
|
||||||
"/browser/contexts",
|
"/browser/contexts",
|
||||||
get(get_v1_browser_contexts).post(post_v1_browser_contexts),
|
get(get_v1_browser_contexts).post(post_v1_browser_contexts),
|
||||||
|
|
@ -544,6 +545,7 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
|
||||||
get_v1_browser_cookies,
|
get_v1_browser_cookies,
|
||||||
post_v1_browser_cookies,
|
post_v1_browser_cookies,
|
||||||
delete_v1_browser_cookies,
|
delete_v1_browser_cookies,
|
||||||
|
post_v1_browser_crawl,
|
||||||
get_v1_agents,
|
get_v1_agents,
|
||||||
get_v1_agent,
|
get_v1_agent,
|
||||||
post_v1_agent_install,
|
post_v1_agent_install,
|
||||||
|
|
@ -664,6 +666,10 @@ pub async fn shutdown_servers(state: &Arc<AppState>) {
|
||||||
BrowserCookiesResponse,
|
BrowserCookiesResponse,
|
||||||
BrowserSetCookiesRequest,
|
BrowserSetCookiesRequest,
|
||||||
BrowserDeleteCookiesQuery,
|
BrowserDeleteCookiesQuery,
|
||||||
|
BrowserCrawlRequest,
|
||||||
|
BrowserCrawlExtract,
|
||||||
|
BrowserCrawlPage,
|
||||||
|
BrowserCrawlResponse,
|
||||||
DesktopClipboardResponse,
|
DesktopClipboardResponse,
|
||||||
DesktopClipboardQuery,
|
DesktopClipboardQuery,
|
||||||
DesktopClipboardWriteRequest,
|
DesktopClipboardWriteRequest,
|
||||||
|
|
@ -2975,6 +2981,31 @@ async fn delete_v1_browser_cookies(
|
||||||
Ok(Json(BrowserActionResponse { ok: true }))
|
Ok(Json(BrowserActionResponse { ok: true }))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Crawl multiple pages starting from a URL.
///
/// Performs a breadth-first crawl: navigates to each page, extracts content in
/// the requested format, collects links, and follows them within the configured
/// domain and depth limits.
#[utoipa::path(
    post,
    path = "/v1/browser/crawl",
    tag = "v1",
    request_body = BrowserCrawlRequest,
    responses(
        (status = 200, description = "Crawl results", body = BrowserCrawlResponse),
        (status = 409, description = "Browser runtime is not active", body = ProblemDetails),
        (status = 502, description = "CDP command failed", body = ProblemDetails)
    )
)]
async fn post_v1_browser_crawl(
    State(state): State<Arc<AppState>>,
    Json(body): Json<BrowserCrawlRequest>,
) -> Result<Json<BrowserCrawlResponse>, ApiError> {
    // Acquire the CDP client; per the responses above, failure here
    // presumably maps to the 409 "runtime not active" case — the conversion
    // happens in the `?` / ApiError From impl, not visible in this file.
    let cdp = state.browser_runtime().get_cdp().await?;
    // Crawl errors (BrowserProblem) convert to ApiError via `?` as well.
    let response = crate::browser_crawl::crawl_pages(&cdp, &body).await?;
    Ok(Json(response))
}
|
||||||
|
|
||||||
/// Helper: get the current page URL and title via CDP Runtime.evaluate.
|
/// Helper: get the current page URL and title via CDP Runtime.evaluate.
|
||||||
async fn get_page_info_via_cdp(
|
async fn get_page_info_via_cdp(
|
||||||
cdp: &crate::browser_cdp::CdpClient,
|
cdp: &crate::browser_cdp::CdpClient,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue