mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 07:04:48 +00:00
feat: [US-037] - Add integration tests for crawling
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8ace9cd9f1
commit
adca4425bb
2 changed files with 154 additions and 2 deletions
|
|
@ -51,6 +51,8 @@ pub async fn crawl_pages(
|
||||||
|
|
||||||
while let Some((url, depth)) = queue.pop_front() {
|
while let Some((url, depth)) = queue.pop_front() {
|
||||||
if pages.len() >= max_pages {
|
if pages.len() >= max_pages {
|
||||||
|
// Push back so truncated detection sees remaining work.
|
||||||
|
queue.push_front((url, depth));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -147,7 +149,10 @@ pub async fn crawl_pages(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if let Ok(parsed) = Url::parse(link) {
|
if let Ok(parsed) = Url::parse(link) {
|
||||||
if parsed.scheme() != "http" && parsed.scheme() != "https" {
|
if parsed.scheme() != "http"
|
||||||
|
&& parsed.scheme() != "https"
|
||||||
|
&& parsed.scheme() != "file"
|
||||||
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if let Some(host) = parsed.host_str() {
|
if let Some(host) = parsed.host_str() {
|
||||||
|
|
@ -326,7 +331,7 @@ async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
|
||||||
(function() {
|
(function() {
|
||||||
var links = [];
|
var links = [];
|
||||||
document.querySelectorAll('a[href]').forEach(function(a) {
|
document.querySelectorAll('a[href]').forEach(function(a) {
|
||||||
if (a.href && a.href.startsWith('http')) {
|
if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) {
|
||||||
links.push(a.href);
|
links.push(a.href);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -972,3 +972,150 @@ async fn v1_browser_network_monitoring() {
|
||||||
send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
|
send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
|
||||||
assert_eq!(status, StatusCode::OK);
|
assert_eq!(status, StatusCode::OK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const TEST_HTML_CRAWL_A: &str = r#"<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Page A</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Page A</h1>
|
||||||
|
<p>This is page A content.</p>
|
||||||
|
<a href="page-b.html">Go to Page B</a>
|
||||||
|
</body>
|
||||||
|
</html>"#;
|
||||||
|
|
||||||
|
const TEST_HTML_CRAWL_B: &str = r#"<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Page B</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Page B</h1>
|
||||||
|
<p>This is page B content.</p>
|
||||||
|
<a href="page-c.html">Go to Page C</a>
|
||||||
|
</body>
|
||||||
|
</html>"#;
|
||||||
|
|
||||||
|
const TEST_HTML_CRAWL_C: &str = r#"<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head><title>Page C</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Page C</h1>
|
||||||
|
<p>This is page C content. No more links.</p>
|
||||||
|
</body>
|
||||||
|
</html>"#;
|
||||||
|
|
||||||
|
/// [US-037] Integration test for `POST /v1/browser/crawl`.
///
/// Starts a headless browser, writes three locally-linked `file://` HTML
/// pages (A -> B -> C), then verifies:
/// 1. a crawl with `maxDepth: 2` visits all three pages in BFS order with
///    the expected `depth` on each, extracts their text content, and
///    reports `totalPages: 3` / `truncated: false`;
/// 2. a crawl with `maxPages: 1` returns exactly one page and reports
///    `truncated: true`;
/// 3. the browser stops cleanly afterwards.
///
/// `#[serial]` because the tests share one global browser process.
#[tokio::test]
#[serial]
async fn v1_browser_crawl() {
    let test_app = TestApp::new(AuthConfig::disabled());

    // Start browser
    let (status, _, body) = send_request(
        &test_app.app,
        Method::POST,
        "/v1/browser/start",
        Some(json!({ "headless": true })),
        &[],
    )
    .await;
    // Include the response body in the failure message for easier debugging.
    assert_eq!(
        status,
        StatusCode::OK,
        "start: {}",
        String::from_utf8_lossy(&body)
    );

    // Write the 3 linked test HTML pages
    // NOTE(review): assumes the sandbox exposes /tmp to the browser via
    // file:// URLs — confirm against write_test_file's contract.
    write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await;
    write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await;
    write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await;

    // Crawl starting from page-a with maxDepth=2, extract=text
    let (status, _, body) = send_request(
        &test_app.app,
        Method::POST,
        "/v1/browser/crawl",
        Some(json!({
            "url": "file:///tmp/page-a.html",
            "maxDepth": 2,
            "extract": "text"
        })),
        &[],
    )
    .await;
    assert_eq!(
        status,
        StatusCode::OK,
        "crawl: {}",
        String::from_utf8_lossy(&body)
    );
    let parsed = parse_json(&body);
    let pages = parsed["pages"].as_array().expect("pages array");

    // Should have 3 pages: page-a (depth 0), page-b (depth 1), page-c (depth 2)
    assert_eq!(
        pages.len(),
        3,
        "expected 3 crawled pages, got {}: {parsed}",
        pages.len()
    );

    // Verify depths
    // Relies on the crawler returning pages in BFS discovery order.
    assert_eq!(pages[0]["depth"], 0, "page-a should be depth 0");
    assert_eq!(pages[1]["depth"], 1, "page-b should be depth 1");
    assert_eq!(pages[2]["depth"], 2, "page-c should be depth 2");

    // Verify page content (text extraction)
    assert!(
        pages[0]["content"]
            .as_str()
            .unwrap_or("")
            .contains("Page A"),
        "page-a content should contain 'Page A'"
    );
    assert!(
        pages[1]["content"]
            .as_str()
            .unwrap_or("")
            .contains("Page B"),
        "page-b content should contain 'Page B'"
    );
    assert!(
        pages[2]["content"]
            .as_str()
            .unwrap_or("")
            .contains("Page C"),
        "page-c content should contain 'Page C'"
    );

    // Verify totalPages and truncated
    assert_eq!(parsed["totalPages"], 3);
    assert_eq!(parsed["truncated"], false);

    // Test maxPages=1 returns only 1 page and truncated is true
    let (status, _, body) = send_request(
        &test_app.app,
        Method::POST,
        "/v1/browser/crawl",
        Some(json!({
            "url": "file:///tmp/page-a.html",
            "maxPages": 1,
            "maxDepth": 2,
            "extract": "text"
        })),
        &[],
    )
    .await;
    assert_eq!(status, StatusCode::OK);
    let parsed = parse_json(&body);
    let pages = parsed["pages"].as_array().expect("pages array");
    assert_eq!(pages.len(), 1, "maxPages=1 should return only 1 page");
    assert_eq!(parsed["totalPages"], 1);
    // page-a links to page-b, so hitting maxPages must set the flag.
    assert_eq!(
        parsed["truncated"], true,
        "should be truncated when more pages exist"
    );

    // Stop browser
    let (status, _, _) =
        send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
    assert_eq!(status, StatusCode::OK);
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue