mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 06:04:43 +00:00
feat: [US-041] - Restrict crawl endpoint to http/https schemes only
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1bd7ef9219
commit
a9629c91ea
5 changed files with 151 additions and 13 deletions
|
|
@ -670,7 +670,7 @@
|
|||
"Tests pass"
|
||||
],
|
||||
"priority": 41,
|
||||
"passes": false,
|
||||
"passes": true,
|
||||
"notes": "SECURITY: file:// URLs combined with --no-sandbox Chromium lets anyone read arbitrary files via the crawl endpoint. The crawl link filter explicitly allows file:// scheme and extract_links collects file: hrefs."
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -28,7 +28,8 @@
|
|||
- Test helper `write_test_file()` uses `PUT /v1/fs/file?path=...` to write HTML test fixtures into the container
|
||||
- `docker/test-agent/Dockerfile` must include chromium + deps (libnss3, libatk-bridge2.0-0, libdrm2, libxcomposite1, libxdamage1, libxrandr2, libgbm1, libasound2, libpangocairo-1.0-0, libgtk-3-0) for browser integration tests
|
||||
- `get_page_info_via_cdp()` is a helper fn in router.rs for getting current URL and title via Runtime.evaluate
|
||||
- Crawl supports `file://`, `http://`, and `https://` schemes; `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes
|
||||
- Crawl only allows `http://` and `https://` schemes (file:// rejected with 400); `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes
|
||||
- Integration tests can start background services inside the container via `POST /v1/processes` and check readiness via `POST /v1/processes/run` (e.g. curl probe)
|
||||
- Crawl `truncated` detection: when breaking early on max_pages, push the popped URL back into the queue before breaking so `!queue.is_empty()` is accurate
|
||||
- CDP event-based features (console, network monitoring) are captured asynchronously by background tasks; integration tests need ~1s sleep after triggering events before asserting on endpoint results
|
||||
- CDP `Page.getNavigationHistory` returns `{currentIndex, entries: [{id, url, title}]}` for back/forward navigation
|
||||
|
|
@ -700,3 +701,18 @@ Started: Tue Mar 17 04:32:06 AM PDT 2026
|
|||
- `get_cdp()` is the canonical pattern for accessing the CDP client - it returns an `Arc<CdpClient>` so callers don't hold the state lock during I/O
|
||||
- Dead public API methods should be removed proactively to avoid inviting misuse patterns
|
||||
---
|
||||
|
||||
## 2026-03-17 - US-041
|
||||
- Restricted crawl endpoint to http/https schemes only (file:// URLs now return 400)
|
||||
- Added URL scheme validation at the top of crawl_pages() before any navigation
|
||||
- Removed 'file' from the link filtering scheme whitelist in the BFS crawl loop
|
||||
- Removed 'file:' prefix from extract_links() JavaScript href collection filter
|
||||
- Added BrowserProblem::invalid_url() constructor for 400 "Invalid URL" errors
|
||||
- Rewrote v1_browser_crawl integration test to use a local Python HTTP server (via process API) instead of file:// URLs
|
||||
- Added file:// URL rejection assertion in the crawl test
|
||||
- Files changed: `browser_crawl.rs`, `browser_errors.rs`, `browser_api.rs` (tests)
|
||||
- **Learnings for future iterations:**
|
||||
- Integration tests can start background services in the container via POST /v1/processes (long-lived) and check readiness via POST /v1/processes/run (curl probe)
|
||||
- PUT /v1/fs/file auto-creates parent directories, so no need for separate mkdir calls
|
||||
- BrowserProblem extensions are flattened into the ProblemDetails JSON response (e.g. `parsed["code"]` not `parsed["extensions"]["code"]`)
|
||||
---
|
||||
|
|
|
|||
|
|
@ -27,6 +27,16 @@ pub async fn crawl_pages(
|
|||
let start_url = Url::parse(&request.url)
|
||||
.map_err(|e| BrowserProblem::cdp_error(format!("Invalid start URL: {e}")))?;
|
||||
|
||||
// Only allow http and https schemes to prevent local filesystem reading.
|
||||
match start_url.scheme() {
|
||||
"http" | "https" => {}
|
||||
scheme => {
|
||||
return Err(BrowserProblem::invalid_url(format!(
|
||||
"Unsupported URL scheme '{scheme}'. Only http and https are allowed."
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let allowed_domains: HashSet<String> = if let Some(ref domains) = request.allowed_domains {
|
||||
domains.iter().cloned().collect()
|
||||
} else {
|
||||
|
|
@ -149,10 +159,7 @@ pub async fn crawl_pages(
|
|||
continue;
|
||||
}
|
||||
if let Ok(parsed) = Url::parse(link) {
|
||||
if parsed.scheme() != "http"
|
||||
&& parsed.scheme() != "https"
|
||||
&& parsed.scheme() != "file"
|
||||
{
|
||||
if parsed.scheme() != "http" && parsed.scheme() != "https" {
|
||||
continue;
|
||||
}
|
||||
if let Some(host) = parsed.host_str() {
|
||||
|
|
@ -331,7 +338,7 @@ async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
|
|||
(function() {
|
||||
var links = [];
|
||||
document.querySelectorAll('a[href]').forEach(function(a) {
|
||||
if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) {
|
||||
if (a.href && a.href.startsWith('http')) {
|
||||
links.push(a.href);
|
||||
}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -82,6 +82,11 @@ impl BrowserProblem {
|
|||
Self::new(400, "Invalid Selector", "browser/invalid-selector", message)
|
||||
}
|
||||
|
||||
// 400 - invalid URL (e.g. disallowed scheme)
|
||||
pub fn invalid_url(message: impl Into<String>) -> Self {
|
||||
Self::new(400, "Invalid URL", "browser/invalid-url", message)
|
||||
}
|
||||
|
||||
pub fn to_problem_details(&self) -> ProblemDetails {
|
||||
let mut extensions = Map::new();
|
||||
extensions.insert("code".to_string(), Value::String(self.code.to_string()));
|
||||
|
|
|
|||
|
|
@ -1002,10 +1002,96 @@ const TEST_HTML_CRAWL_C: &str = r#"<!DOCTYPE html>
|
|||
</body>
|
||||
</html>"#;
|
||||
|
||||
/// Start a Python HTTP server inside the container serving a directory on the
|
||||
/// given port. Returns the process ID so the caller can clean it up.
|
||||
async fn start_http_server(app: &docker_support::DockerApp, dir: &str, port: u16) -> String {
|
||||
let client = reqwest::Client::new();
|
||||
let response = client
|
||||
.post(app.http_url("/v1/processes"))
|
||||
.header(reqwest::header::CONTENT_TYPE, "application/json")
|
||||
.body(
|
||||
json!({
|
||||
"command": "python3",
|
||||
"args": ["-m", "http.server", port.to_string(), "--directory", dir],
|
||||
})
|
||||
.to_string(),
|
||||
)
|
||||
.send()
|
||||
.await
|
||||
.expect("start http server process");
|
||||
assert_eq!(
|
||||
response.status(),
|
||||
StatusCode::OK,
|
||||
"failed to start http server"
|
||||
);
|
||||
let body: Value = response.json().await.expect("json response");
|
||||
let process_id = body["id"].as_str().expect("process id").to_string();
|
||||
|
||||
// Wait for the server to bind inside the container by running a curl check.
|
||||
for _ in 0..30 {
|
||||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||
let check = client
|
||||
.post(app.http_url("/v1/processes/run"))
|
||||
.header(reqwest::header::CONTENT_TYPE, "application/json")
|
||||
.body(
|
||||
json!({
|
||||
"command": "curl",
|
||||
"args": ["-sf", "-o", "/dev/null", format!("http://localhost:{port}/")],
|
||||
"timeoutMs": 2000
|
||||
})
|
||||
.to_string(),
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
if let Ok(resp) = check {
|
||||
if let Ok(val) = resp.json::<Value>().await {
|
||||
if val["exitCode"] == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
process_id
|
||||
}
|
||||
|
||||
/// Stop a process started via the process API.
|
||||
async fn stop_http_server(app: &docker_support::DockerApp, process_id: &str) {
|
||||
let client = reqwest::Client::new();
|
||||
let _ = client
|
||||
.post(app.http_url(&format!("/v1/processes/{process_id}/kill")))
|
||||
.send()
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[serial]
|
||||
async fn v1_browser_crawl() {
|
||||
let test_app = TestApp::new(AuthConfig::disabled());
|
||||
let http_port: u16 = 18765;
|
||||
|
||||
// Write the 3 linked test HTML pages
|
||||
write_test_file(
|
||||
&test_app.app,
|
||||
"/tmp/crawl-test/page-a.html",
|
||||
TEST_HTML_CRAWL_A,
|
||||
)
|
||||
.await;
|
||||
write_test_file(
|
||||
&test_app.app,
|
||||
"/tmp/crawl-test/page-b.html",
|
||||
TEST_HTML_CRAWL_B,
|
||||
)
|
||||
.await;
|
||||
write_test_file(
|
||||
&test_app.app,
|
||||
"/tmp/crawl-test/page-c.html",
|
||||
TEST_HTML_CRAWL_C,
|
||||
)
|
||||
.await;
|
||||
|
||||
// Start a local HTTP server inside the container to serve the test pages.
|
||||
let server_pid = start_http_server(&test_app.app, "/tmp/crawl-test", http_port).await;
|
||||
|
||||
// Start browser
|
||||
let (status, _, body) = send_request(
|
||||
|
|
@ -1023,10 +1109,7 @@ async fn v1_browser_crawl() {
|
|||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
|
||||
// Write the 3 linked test HTML pages
|
||||
write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await;
|
||||
write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await;
|
||||
write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await;
|
||||
let base_url = format!("http://localhost:{http_port}");
|
||||
|
||||
// Crawl starting from page-a with maxDepth=2, extract=text
|
||||
let (status, _, body) = send_request(
|
||||
|
|
@ -1034,7 +1117,7 @@ async fn v1_browser_crawl() {
|
|||
Method::POST,
|
||||
"/v1/browser/crawl",
|
||||
Some(json!({
|
||||
"url": "file:///tmp/page-a.html",
|
||||
"url": format!("{base_url}/page-a.html"),
|
||||
"maxDepth": 2,
|
||||
"extract": "text"
|
||||
})),
|
||||
|
|
@ -1096,7 +1179,7 @@ async fn v1_browser_crawl() {
|
|||
Method::POST,
|
||||
"/v1/browser/crawl",
|
||||
Some(json!({
|
||||
"url": "file:///tmp/page-a.html",
|
||||
"url": format!("{base_url}/page-a.html"),
|
||||
"maxPages": 1,
|
||||
"maxDepth": 2,
|
||||
"extract": "text"
|
||||
|
|
@ -1114,8 +1197,35 @@ async fn v1_browser_crawl() {
|
|||
"should be truncated when more pages exist"
|
||||
);
|
||||
|
||||
// Test that file:// URLs are rejected with 400
|
||||
let (status, _, body) = send_request(
|
||||
&test_app.app,
|
||||
Method::POST,
|
||||
"/v1/browser/crawl",
|
||||
Some(json!({
|
||||
"url": "file:///etc/passwd",
|
||||
"extract": "text"
|
||||
})),
|
||||
&[],
|
||||
)
|
||||
.await;
|
||||
assert_eq!(
|
||||
status,
|
||||
StatusCode::BAD_REQUEST,
|
||||
"file:// URL should be rejected: {}",
|
||||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
let parsed = parse_json(&body);
|
||||
assert_eq!(
|
||||
parsed["code"], "browser/invalid-url",
|
||||
"should return invalid-url error code"
|
||||
);
|
||||
|
||||
// Stop browser
|
||||
let (status, _, _) =
|
||||
send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
|
||||
assert_eq!(status, StatusCode::OK);
|
||||
|
||||
// Clean up the HTTP server
|
||||
stop_http_server(&test_app.app, &server_pid).await;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue