From a9629c91ea13e55939b9d2d762796374579739a7 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 17 Mar 2026 17:00:24 -0700 Subject: [PATCH] feat: [US-041] - Restrict crawl endpoint to http/https schemes only Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/ralph/prd.json | 2 +- scripts/ralph/progress.txt | 18 ++- .../sandbox-agent/src/browser_crawl.rs | 17 ++- .../sandbox-agent/src/browser_errors.rs | 5 + .../sandbox-agent/tests/browser_api.rs | 122 +++++++++++++++++- 5 files changed, 151 insertions(+), 13 deletions(-) diff --git a/scripts/ralph/prd.json b/scripts/ralph/prd.json index d896490..055c437 100644 --- a/scripts/ralph/prd.json +++ b/scripts/ralph/prd.json @@ -670,7 +670,7 @@ "Tests pass" ], "priority": 41, - "passes": false, + "passes": true, "notes": "SECURITY: file:// URLs combined with --no-sandbox Chromium lets anyone read arbitrary files via the crawl endpoint. The crawl link filter explicitly allows file:// scheme and extract_links collects file: hrefs." }, { diff --git a/scripts/ralph/progress.txt b/scripts/ralph/progress.txt index 4a750da..28234b7 100644 --- a/scripts/ralph/progress.txt +++ b/scripts/ralph/progress.txt @@ -28,7 +28,8 @@ - Test helper `write_test_file()` uses `PUT /v1/fs/file?path=...` to write HTML test fixtures into the container - `docker/test-agent/Dockerfile` must include chromium + deps (libnss3, libatk-bridge2.0-0, libdrm2, libxcomposite1, libxdamage1, libxrandr2, libgbm1, libasound2, libpangocairo-1.0-0, libgtk-3-0) for browser integration tests - `get_page_info_via_cdp()` is a helper fn in router.rs for getting current URL and title via Runtime.evaluate -- Crawl supports `file://`, `http://`, and `https://` schemes; `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes +- Crawl only allows `http://` and `https://` schemes (file:// rejected with 400); `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes +- Integration tests can start background services inside the container via `POST /v1/processes` and check readiness via `POST /v1/processes/run` (e.g. curl probe) - Crawl `truncated` detection: when breaking early on max_pages, push the popped URL back into the queue before breaking so `!queue.is_empty()` is accurate - CDP event-based features (console, network monitoring) are captured asynchronously by background tasks; integration tests need ~1s sleep after triggering events before asserting on endpoint results - CDP `Page.getNavigationHistory` returns `{currentIndex, entries: [{id, url, title}]}` for back/forward navigation @@ -700,3 +701,18 @@ Started: Tue Mar 17 04:32:06 AM PDT 2026 - `get_cdp()` is the canonical pattern for accessing the CDP client - it returns an `Arc` so callers don't hold the state lock during I/O - Dead public API methods should be removed proactively to avoid inviting misuse patterns --- + +## 2026-03-17 - US-041 +- Restricted crawl endpoint to http/https schemes only (file:// URLs now return 400) +- Added URL scheme validation at the top of crawl_pages() before any navigation +- Removed 'file' from the link filtering scheme whitelist in the BFS crawl loop +- Removed 'file:' prefix from extract_links() JavaScript href collection filter +- Added BrowserProblem::invalid_url() constructor for 400 "Invalid URL" errors +- Rewrote v1_browser_crawl integration test to use a local Python HTTP server (via process API) instead of file:// URLs +- Added file:// URL rejection assertion in the crawl test +- Files changed: `browser_crawl.rs`, `browser_errors.rs`, `browser_api.rs` (tests) +- **Learnings for future iterations:** + - Integration tests can start background services in the container via POST /v1/processes (long-lived) and check readiness via POST /v1/processes/run (curl probe) + - PUT /v1/fs/file auto-creates parent directories, so no need for separate mkdir calls + - BrowserProblem extensions are flattened into the ProblemDetails JSON response (e.g. `parsed["code"]` not `parsed["extensions"]["code"]`) +--- diff --git a/server/packages/sandbox-agent/src/browser_crawl.rs b/server/packages/sandbox-agent/src/browser_crawl.rs index d55773e..88cf3ab 100644 --- a/server/packages/sandbox-agent/src/browser_crawl.rs +++ b/server/packages/sandbox-agent/src/browser_crawl.rs @@ -27,6 +27,16 @@ pub async fn crawl_pages( let start_url = Url::parse(&request.url) .map_err(|e| BrowserProblem::cdp_error(format!("Invalid start URL: {e}")))?; + // Only allow http and https schemes to prevent local filesystem reading. + match start_url.scheme() { + "http" | "https" => {} + scheme => { + return Err(BrowserProblem::invalid_url(format!( + "Unsupported URL scheme '{scheme}'. Only http and https are allowed." + ))); + } + } + let allowed_domains: HashSet = if let Some(ref domains) = request.allowed_domains { domains.iter().cloned().collect() } else { @@ -149,10 +159,7 @@ pub async fn crawl_pages( continue; } if let Ok(parsed) = Url::parse(link) { - if parsed.scheme() != "http" - && parsed.scheme() != "https" - && parsed.scheme() != "file" - { + if parsed.scheme() != "http" && parsed.scheme() != "https" { continue; } if let Some(host) = parsed.host_str() { @@ -331,7 +338,7 @@ async fn extract_links(cdp: &CdpClient) -> Result, BrowserProblem> { (function() { var links = []; document.querySelectorAll('a[href]').forEach(function(a) { - if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) { + if (a.href && a.href.startsWith('http')) { links.push(a.href); } }); diff --git a/server/packages/sandbox-agent/src/browser_errors.rs b/server/packages/sandbox-agent/src/browser_errors.rs index 5e409b1..520c915 100644 --- a/server/packages/sandbox-agent/src/browser_errors.rs +++ b/server/packages/sandbox-agent/src/browser_errors.rs @@ -82,6 +82,11 @@ impl BrowserProblem { Self::new(400, "Invalid Selector", "browser/invalid-selector", message) } + // 400 - invalid URL (e.g. disallowed scheme) + pub fn invalid_url(message: impl Into) -> Self { + Self::new(400, "Invalid URL", "browser/invalid-url", message) + } + pub fn to_problem_details(&self) -> ProblemDetails { let mut extensions = Map::new(); extensions.insert("code".to_string(), Value::String(self.code.to_string())); diff --git a/server/packages/sandbox-agent/tests/browser_api.rs b/server/packages/sandbox-agent/tests/browser_api.rs index e40e3ed..e8b3e3b 100644 --- a/server/packages/sandbox-agent/tests/browser_api.rs +++ b/server/packages/sandbox-agent/tests/browser_api.rs @@ -1002,10 +1002,96 @@ const TEST_HTML_CRAWL_C: &str = r#" "#; +/// Start a Python HTTP server inside the container serving a directory on the +/// given port. Returns the process ID so the caller can clean it up. +async fn start_http_server(app: &docker_support::DockerApp, dir: &str, port: u16) -> String { + let client = reqwest::Client::new(); + let response = client + .post(app.http_url("/v1/processes")) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .body( + json!({ + "command": "python3", + "args": ["-m", "http.server", port.to_string(), "--directory", dir], + }) + .to_string(), + ) + .send() + .await + .expect("start http server process"); + assert_eq!( + response.status(), + StatusCode::OK, + "failed to start http server" + ); + let body: Value = response.json().await.expect("json response"); + let process_id = body["id"].as_str().expect("process id").to_string(); + + // Wait for the server to bind inside the container by running a curl check. + for _ in 0..30 { + tokio::time::sleep(Duration::from_millis(200)).await; + let check = client + .post(app.http_url("/v1/processes/run")) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .body( + json!({ + "command": "curl", + "args": ["-sf", "-o", "/dev/null", format!("http://localhost:{port}/")], + "timeoutMs": 2000 + }) + .to_string(), + ) + .send() + .await; + if let Ok(resp) = check { + if let Ok(val) = resp.json::().await { + if val["exitCode"] == 0 { + break; + } + } + } + } + + process_id +} + +/// Stop a process started via the process API. +async fn stop_http_server(app: &docker_support::DockerApp, process_id: &str) { + let client = reqwest::Client::new(); + let _ = client + .post(app.http_url(&format!("/v1/processes/{process_id}/kill"))) + .send() + .await; +} + #[tokio::test] #[serial] async fn v1_browser_crawl() { let test_app = TestApp::new(AuthConfig::disabled()); + let http_port: u16 = 18765; + + // Write the 3 linked test HTML pages + write_test_file( + &test_app.app, + "/tmp/crawl-test/page-a.html", + TEST_HTML_CRAWL_A, + ) + .await; + write_test_file( + &test_app.app, + "/tmp/crawl-test/page-b.html", + TEST_HTML_CRAWL_B, + ) + .await; + write_test_file( + &test_app.app, + "/tmp/crawl-test/page-c.html", + TEST_HTML_CRAWL_C, + ) + .await; + + // Start a local HTTP server inside the container to serve the test pages. + let server_pid = start_http_server(&test_app.app, "/tmp/crawl-test", http_port).await; // Start browser let (status, _, body) = send_request( @@ -1023,10 +1109,7 @@ async fn v1_browser_crawl() { String::from_utf8_lossy(&body) ); - // Write the 3 linked test HTML pages - write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await; - write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await; - write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await; + let base_url = format!("http://localhost:{http_port}"); // Crawl starting from page-a with maxDepth=2, extract=text let (status, _, body) = send_request( @@ -1034,7 +1117,7 @@ async fn v1_browser_crawl() { Method::POST, "/v1/browser/crawl", Some(json!({ - "url": "file:///tmp/page-a.html", + "url": format!("{base_url}/page-a.html"), "maxDepth": 2, "extract": "text" })), @@ -1096,7 +1179,7 @@ async fn v1_browser_crawl() { Method::POST, "/v1/browser/crawl", Some(json!({ - "url": "file:///tmp/page-a.html", + "url": format!("{base_url}/page-a.html"), "maxPages": 1, "maxDepth": 2, "extract": "text" @@ -1114,8 +1197,35 @@ async fn v1_browser_crawl() { "should be truncated when more pages exist" ); + // Test that file:// URLs are rejected with 400 + let (status, _, body) = send_request( + &test_app.app, + Method::POST, + "/v1/browser/crawl", + Some(json!({ + "url": "file:///etc/passwd", + "extract": "text" + })), + &[], + ) + .await; + assert_eq!( + status, + StatusCode::BAD_REQUEST, + "file:// URL should be rejected: {}", + String::from_utf8_lossy(&body) + ); + let parsed = parse_json(&body); + assert_eq!( + parsed["code"], "browser/invalid-url", + "should return invalid-url error code" + ); + // Stop browser let (status, _, _) = send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await; assert_eq!(status, StatusCode::OK); + + // Clean up the HTTP server + stop_http_server(&test_app.app, &server_pid).await; }