mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 06:04:43 +00:00
feat: [US-037] - Add integration tests for crawling
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8ace9cd9f1
commit
adca4425bb
2 changed files with 154 additions and 2 deletions
|
|
@ -51,6 +51,8 @@ pub async fn crawl_pages(
|
|||
|
||||
while let Some((url, depth)) = queue.pop_front() {
|
||||
if pages.len() >= max_pages {
|
||||
// Push back so truncated detection sees remaining work.
|
||||
queue.push_front((url, depth));
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
@ -147,7 +149,10 @@ pub async fn crawl_pages(
|
|||
continue;
|
||||
}
|
||||
if let Ok(parsed) = Url::parse(link) {
|
||||
if parsed.scheme() != "http" && parsed.scheme() != "https" {
|
||||
if parsed.scheme() != "http"
|
||||
&& parsed.scheme() != "https"
|
||||
&& parsed.scheme() != "file"
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if let Some(host) = parsed.host_str() {
|
||||
|
|
@ -326,7 +331,7 @@ async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
|
|||
(function() {
|
||||
var links = [];
|
||||
document.querySelectorAll('a[href]').forEach(function(a) {
|
||||
if (a.href && a.href.startsWith('http')) {
|
||||
if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) {
|
||||
links.push(a.href);
|
||||
}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -972,3 +972,150 @@ async fn v1_browser_network_monitoring() {
|
|||
send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
|
||||
assert_eq!(status, StatusCode::OK);
|
||||
}
|
||||
|
||||
const TEST_HTML_CRAWL_A: &str = r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Page A</title></head>
|
||||
<body>
|
||||
<h1>Page A</h1>
|
||||
<p>This is page A content.</p>
|
||||
<a href="page-b.html">Go to Page B</a>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
const TEST_HTML_CRAWL_B: &str = r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Page B</title></head>
|
||||
<body>
|
||||
<h1>Page B</h1>
|
||||
<p>This is page B content.</p>
|
||||
<a href="page-c.html">Go to Page C</a>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
const TEST_HTML_CRAWL_C: &str = r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Page C</title></head>
|
||||
<body>
|
||||
<h1>Page C</h1>
|
||||
<p>This is page C content. No more links.</p>
|
||||
</body>
|
||||
</html>"#;
|
||||
|
||||
#[tokio::test]
|
||||
#[serial]
|
||||
async fn v1_browser_crawl() {
|
||||
let test_app = TestApp::new(AuthConfig::disabled());
|
||||
|
||||
// Start browser
|
||||
let (status, _, body) = send_request(
|
||||
&test_app.app,
|
||||
Method::POST,
|
||||
"/v1/browser/start",
|
||||
Some(json!({ "headless": true })),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
assert_eq!(
|
||||
status,
|
||||
StatusCode::OK,
|
||||
"start: {}",
|
||||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
|
||||
// Write the 3 linked test HTML pages
|
||||
write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await;
|
||||
write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await;
|
||||
write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await;
|
||||
|
||||
// Crawl starting from page-a with maxDepth=2, extract=text
|
||||
let (status, _, body) = send_request(
|
||||
&test_app.app,
|
||||
Method::POST,
|
||||
"/v1/browser/crawl",
|
||||
Some(json!({
|
||||
"url": "file:///tmp/page-a.html",
|
||||
"maxDepth": 2,
|
||||
"extract": "text"
|
||||
})),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
assert_eq!(
|
||||
status,
|
||||
StatusCode::OK,
|
||||
"crawl: {}",
|
||||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
let parsed = parse_json(&body);
|
||||
let pages = parsed["pages"].as_array().expect("pages array");
|
||||
|
||||
// Should have 3 pages: page-a (depth 0), page-b (depth 1), page-c (depth 2)
|
||||
assert_eq!(
|
||||
pages.len(),
|
||||
3,
|
||||
"expected 3 crawled pages, got {}: {parsed}",
|
||||
pages.len()
|
||||
);
|
||||
|
||||
// Verify depths
|
||||
assert_eq!(pages[0]["depth"], 0, "page-a should be depth 0");
|
||||
assert_eq!(pages[1]["depth"], 1, "page-b should be depth 1");
|
||||
assert_eq!(pages[2]["depth"], 2, "page-c should be depth 2");
|
||||
|
||||
// Verify page content (text extraction)
|
||||
assert!(
|
||||
pages[0]["content"]
|
||||
.as_str()
|
||||
.unwrap_or("")
|
||||
.contains("Page A"),
|
||||
"page-a content should contain 'Page A'"
|
||||
);
|
||||
assert!(
|
||||
pages[1]["content"]
|
||||
.as_str()
|
||||
.unwrap_or("")
|
||||
.contains("Page B"),
|
||||
"page-b content should contain 'Page B'"
|
||||
);
|
||||
assert!(
|
||||
pages[2]["content"]
|
||||
.as_str()
|
||||
.unwrap_or("")
|
||||
.contains("Page C"),
|
||||
"page-c content should contain 'Page C'"
|
||||
);
|
||||
|
||||
// Verify totalPages and truncated
|
||||
assert_eq!(parsed["totalPages"], 3);
|
||||
assert_eq!(parsed["truncated"], false);
|
||||
|
||||
// Test maxPages=1 returns only 1 page and truncated is true
|
||||
let (status, _, body) = send_request(
|
||||
&test_app.app,
|
||||
Method::POST,
|
||||
"/v1/browser/crawl",
|
||||
Some(json!({
|
||||
"url": "file:///tmp/page-a.html",
|
||||
"maxPages": 1,
|
||||
"maxDepth": 2,
|
||||
"extract": "text"
|
||||
})),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
assert_eq!(status, StatusCode::OK);
|
||||
let parsed = parse_json(&body);
|
||||
let pages = parsed["pages"].as_array().expect("pages array");
|
||||
assert_eq!(pages.len(), 1, "maxPages=1 should return only 1 page");
|
||||
assert_eq!(parsed["totalPages"], 1);
|
||||
assert_eq!(
|
||||
parsed["truncated"], true,
|
||||
"should be truncated when more pages exist"
|
||||
);
|
||||
|
||||
// Stop browser
|
||||
let (status, _, _) =
|
||||
send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
|
||||
assert_eq!(status, StatusCode::OK);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue