diff --git a/server/packages/sandbox-agent/src/browser_crawl.rs b/server/packages/sandbox-agent/src/browser_crawl.rs
index 3e867b5..d55773e 100644
--- a/server/packages/sandbox-agent/src/browser_crawl.rs
+++ b/server/packages/sandbox-agent/src/browser_crawl.rs
@@ -51,6 +51,8 @@ pub async fn crawl_pages(
 
     while let Some((url, depth)) = queue.pop_front() {
         if pages.len() >= max_pages {
+            // Push back so truncated detection sees remaining work.
+            queue.push_front((url, depth));
             break;
         }
 
@@ -147,7 +149,10 @@ pub async fn crawl_pages(
             continue;
         }
         if let Ok(parsed) = Url::parse(link) {
-            if parsed.scheme() != "http" && parsed.scheme() != "https" {
+            if parsed.scheme() != "http"
+                && parsed.scheme() != "https"
+                && parsed.scheme() != "file"
+            {
                 continue;
             }
             if let Some(host) = parsed.host_str() {
@@ -326,7 +331,7 @@ async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
         (function() {
             var links = [];
             document.querySelectorAll('a[href]').forEach(function(a) {
-                if (a.href && a.href.startsWith('http')) {
+                if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) {
                     links.push(a.href);
                 }
             });
diff --git a/server/packages/sandbox-agent/tests/browser_api.rs b/server/packages/sandbox-agent/tests/browser_api.rs
index d50b306..e40e3ed 100644
--- a/server/packages/sandbox-agent/tests/browser_api.rs
+++ b/server/packages/sandbox-agent/tests/browser_api.rs
@@ -972,3 +972,150 @@ async fn v1_browser_network_monitoring() {
     send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
     assert_eq!(status, StatusCode::OK);
 }
+
+const TEST_HTML_CRAWL_A: &str = r#"<html>
+<head><title>Page A</title></head>
+<body>
+<h1>Page A</h1>
+<p>This is page A content.</p>
+<a href="page-b.html">Go to Page B</a>
+</body>
+</html>"#;
+
+const TEST_HTML_CRAWL_B: &str = r#"<html>
+<head><title>Page B</title></head>
+<body>
+<h1>Page B</h1>
+<p>This is page B content.</p>
+<a href="page-c.html">Go to Page C</a>
+</body>
+</html>"#;
+
+const TEST_HTML_CRAWL_C: &str = r#"<html>
+<head><title>Page C</title></head>
+<body>
+<h1>Page C</h1>
+<p>This is page C content. No more links.</p>
+</body>
+</html>"#;
+
+#[tokio::test]
+#[serial]
+async fn v1_browser_crawl() {
+    let test_app = TestApp::new(AuthConfig::disabled());
+
+    // Start browser
+    let (status, _, body) = send_request(
+        &test_app.app,
+        Method::POST,
+        "/v1/browser/start",
+        Some(json!({ "headless": true })),
+        &[],
+    )
+    .await;
+    assert_eq!(
+        status,
+        StatusCode::OK,
+        "start: {}",
+        String::from_utf8_lossy(&body)
+    );
+
+    // Write the 3 linked test HTML pages
+    write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await;
+    write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await;
+    write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await;
+
+    // Crawl starting from page-a with maxDepth=2, extract=text
+    let (status, _, body) = send_request(
+        &test_app.app,
+        Method::POST,
+        "/v1/browser/crawl",
+        Some(json!({
+            "url": "file:///tmp/page-a.html",
+            "maxDepth": 2,
+            "extract": "text"
+        })),
+        &[],
+    )
+    .await;
+    assert_eq!(
+        status,
+        StatusCode::OK,
+        "crawl: {}",
+        String::from_utf8_lossy(&body)
+    );
+    let parsed = parse_json(&body);
+    let pages = parsed["pages"].as_array().expect("pages array");
+
+    // Should have 3 pages: page-a (depth 0), page-b (depth 1), page-c (depth 2)
+    assert_eq!(
+        pages.len(),
+        3,
+        "expected 3 crawled pages, got {}: {parsed}",
+        pages.len()
+    );
+
+    // Verify depths
+    assert_eq!(pages[0]["depth"], 0, "page-a should be depth 0");
+    assert_eq!(pages[1]["depth"], 1, "page-b should be depth 1");
+    assert_eq!(pages[2]["depth"], 2, "page-c should be depth 2");
+
+    // Verify page content (text extraction)
+    assert!(
+        pages[0]["content"]
+            .as_str()
+            .unwrap_or("")
+            .contains("Page A"),
+        "page-a content should contain 'Page A'"
+    );
+    assert!(
+        pages[1]["content"]
+            .as_str()
+            .unwrap_or("")
+            .contains("Page B"),
+        "page-b content should contain 'Page B'"
+    );
+    assert!(
+        pages[2]["content"]
+            .as_str()
+            .unwrap_or("")
+            .contains("Page C"),
+        "page-c content should contain 'Page C'"
+    );
+
+    // Verify totalPages and truncated
+    assert_eq!(parsed["totalPages"], 3);
+    assert_eq!(parsed["truncated"], false);
+
+    // Test maxPages=1 returns only 1 page and truncated is true
+    let (status, _, body) = send_request(
+        &test_app.app,
+        Method::POST,
+        "/v1/browser/crawl",
+        Some(json!({
+            "url": "file:///tmp/page-a.html",
+            "maxPages": 1,
+            "maxDepth": 2,
+            "extract": "text"
+        })),
+        &[],
+    )
+    .await;
+    assert_eq!(status, StatusCode::OK);
+    let parsed = parse_json(&body);
+    let pages = parsed["pages"].as_array().expect("pages array");
+    assert_eq!(pages.len(), 1, "maxPages=1 should return only 1 page");
+    assert_eq!(parsed["totalPages"], 1);
+    assert_eq!(
+        parsed["truncated"], true,
+        "should be truncated when more pages exist"
+    );
+
+    // Stop browser
+    let (status, _, _) =
+        send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
+    assert_eq!(status, StatusCode::OK);
+}