From a9629c91ea13e55939b9d2d762796374579739a7 Mon Sep 17 00:00:00 2001
From: Nathan Flurry <git@nathanflurry.com>
Date: Tue, 17 Mar 2026 17:00:24 -0700
Subject: [PATCH] feat: [US-041] - Restrict crawl endpoint to http/https
 schemes only

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/ralph/prd.json                        |   2 +-
 scripts/ralph/progress.txt                    |  18 ++-
 .../sandbox-agent/src/browser_crawl.rs        |  17 ++-
 .../sandbox-agent/src/browser_errors.rs       |   5 +
 .../sandbox-agent/tests/browser_api.rs        | 122 +++++++++++++++++-
 5 files changed, 151 insertions(+), 13 deletions(-)
diff --git a/scripts/ralph/prd.json b/scripts/ralph/prd.json
index d896490..055c437 100644
--- a/scripts/ralph/prd.json
+++ b/scripts/ralph/prd.json
@@ -670,7 +670,7 @@
         "Tests pass"
       ],
       "priority": 41,
-      "passes": false,
+      "passes": true,
       "notes": "SECURITY: file:// URLs combined with --no-sandbox Chromium lets anyone read arbitrary files via the crawl endpoint. The crawl link filter explicitly allows file:// scheme and extract_links collects file: hrefs."
     },
     {
diff --git a/scripts/ralph/progress.txt b/scripts/ralph/progress.txt
index 4a750da..28234b7 100644
--- a/scripts/ralph/progress.txt
+++ b/scripts/ralph/progress.txt
@@ -28,7 +28,8 @@
 - Test helper `write_test_file()` uses `PUT /v1/fs/file?path=...` to write HTML test fixtures into the container
 - `docker/test-agent/Dockerfile` must include chromium + deps (libnss3, libatk-bridge2.0-0, libdrm2, libxcomposite1, libxdamage1, libxrandr2, libgbm1, libasound2, libpangocairo-1.0-0, libgtk-3-0) for browser integration tests
 - `get_page_info_via_cdp()` is a helper fn in router.rs for getting current URL and title via Runtime.evaluate
-- Crawl supports `file://`, `http://`, and `https://` schemes; `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes
+- Crawl only allows `http://` and `https://` schemes (file:// rejected with 400); `extract_links` JS filter and `crawl_pages` Rust scheme filter must both be updated when adding new schemes
+- Integration tests can start background services inside the container via `POST /v1/processes` and check readiness via `POST /v1/processes/run` (e.g. curl probe)
 - Crawl `truncated` detection: when breaking early on max_pages, push the popped URL back into the queue before breaking so `!queue.is_empty()` is accurate
 - CDP event-based features (console, network monitoring) are captured asynchronously by background tasks; integration tests need ~1s sleep after triggering events before asserting on endpoint results
 - CDP `Page.getNavigationHistory` returns `{currentIndex, entries: [{id, url, title}]}` for back/forward navigation
@@ -700,3 +701,18 @@ Started: Tue Mar 17 04:32:06 AM PDT 2026
   - `get_cdp()` is the canonical pattern for accessing the CDP client - it returns an `Arc<CdpClient>` so callers don't hold the state lock during I/O
   - Dead public API methods should be removed proactively to avoid inviting misuse patterns
 ---
+
+## 2026-03-17 - US-041
+- Restricted crawl endpoint to http/https schemes only (file:// URLs now return 400)
+- Added URL scheme validation at the top of crawl_pages() before any navigation
+- Removed 'file' from the link filtering scheme whitelist in the BFS crawl loop
+- Removed 'file:' prefix from extract_links() JavaScript href collection filter
+- Added BrowserProblem::invalid_url() constructor for 400 "Invalid URL" errors
+- Rewrote v1_browser_crawl integration test to use a local Python HTTP server (via process API) instead of file:// URLs
+- Added file:// URL rejection assertion in the crawl test
+- Files changed: `browser_crawl.rs`, `browser_errors.rs`, `browser_api.rs` (tests)
+- **Learnings for future iterations:**
+  - Integration tests can start background services in the container via POST /v1/processes (long-lived) and check readiness via POST /v1/processes/run (curl probe)
+  - PUT /v1/fs/file auto-creates parent directories, so no need for separate mkdir calls
+  - BrowserProblem extensions are flattened into the ProblemDetails JSON response (e.g. `parsed["code"]` not `parsed["extensions"]["code"]`)
+---
diff --git a/server/packages/sandbox-agent/src/browser_crawl.rs b/server/packages/sandbox-agent/src/browser_crawl.rs
index d55773e..88cf3ab 100644
--- a/server/packages/sandbox-agent/src/browser_crawl.rs
+++ b/server/packages/sandbox-agent/src/browser_crawl.rs
@@ -27,6 +27,16 @@ pub async fn crawl_pages(
     let start_url = Url::parse(&request.url)
         .map_err(|e| BrowserProblem::cdp_error(format!("Invalid start URL: {e}")))?;
 
+    // Only allow http and https schemes to prevent local filesystem reading.
+    match start_url.scheme() {
+        "http" | "https" => {}
+        scheme => {
+            return Err(BrowserProblem::invalid_url(format!(
+                "Unsupported URL scheme '{scheme}'. Only http and https are allowed."
+            )));
+        }
+    }
+
     let allowed_domains: HashSet<String> = if let Some(ref domains) = request.allowed_domains {
         domains.iter().cloned().collect()
     } else {
@@ -149,10 +159,7 @@ pub async fn crawl_pages(
                     continue;
                 }
                 if let Ok(parsed) = Url::parse(link) {
-                    if parsed.scheme() != "http"
-                        && parsed.scheme() != "https"
-                        && parsed.scheme() != "file"
-                    {
+                    if parsed.scheme() != "http" && parsed.scheme() != "https" {
                         continue;
                     }
                     if let Some(host) = parsed.host_str() {
@@ -331,7 +338,7 @@ async fn extract_links(cdp: &CdpClient) -> Result<Vec<String>, BrowserProblem> {
         (function() {
             var links = [];
             document.querySelectorAll('a[href]').forEach(function(a) {
-                if (a.href && (a.href.startsWith('http') || a.href.startsWith('file:'))) {
+                if (a.href && a.href.startsWith('http')) {
                     links.push(a.href);
                 }
             });
diff --git a/server/packages/sandbox-agent/src/browser_errors.rs b/server/packages/sandbox-agent/src/browser_errors.rs
index 5e409b1..520c915 100644
--- a/server/packages/sandbox-agent/src/browser_errors.rs
+++ b/server/packages/sandbox-agent/src/browser_errors.rs
@@ -82,6 +82,11 @@ impl BrowserProblem {
         Self::new(400, "Invalid Selector", "browser/invalid-selector", message)
     }
 
+    // 400 - invalid URL (e.g. disallowed scheme)
+    pub fn invalid_url(message: impl Into<String>) -> Self {
+        Self::new(400, "Invalid URL", "browser/invalid-url", message)
+    }
+
     pub fn to_problem_details(&self) -> ProblemDetails {
         let mut extensions = Map::new();
         extensions.insert("code".to_string(), Value::String(self.code.to_string()));
diff --git a/server/packages/sandbox-agent/tests/browser_api.rs b/server/packages/sandbox-agent/tests/browser_api.rs
index e40e3ed..e8b3e3b 100644
--- a/server/packages/sandbox-agent/tests/browser_api.rs
+++ b/server/packages/sandbox-agent/tests/browser_api.rs
@@ -1002,10 +1002,96 @@ const TEST_HTML_CRAWL_C: &str = r#"<!DOCTYPE html>
 </body>
 </html>"#;
 
+/// Start a Python HTTP server inside the container serving a directory on the
+/// given port. Returns the process ID so the caller can clean it up.
+async fn start_http_server(app: &docker_support::DockerApp, dir: &str, port: u16) -> String {
+    let client = reqwest::Client::new();
+    let response = client
+        .post(app.http_url("/v1/processes"))
+        .header(reqwest::header::CONTENT_TYPE, "application/json")
+        .body(
+            json!({
+                "command": "python3",
+                "args": ["-m", "http.server", port.to_string(), "--directory", dir],
+            })
+            .to_string(),
+        )
+        .send()
+        .await
+        .expect("start http server process");
+    assert_eq!(
+        response.status(),
+        StatusCode::OK,
+        "failed to start http server"
+    );
+    let body: Value = response.json().await.expect("json response");
+    let process_id = body["id"].as_str().expect("process id").to_string();
+
+    // Wait for the server to bind inside the container by running a curl check.
+    for _ in 0..30 {
+        tokio::time::sleep(Duration::from_millis(200)).await;
+        let check = client
+            .post(app.http_url("/v1/processes/run"))
+            .header(reqwest::header::CONTENT_TYPE, "application/json")
+            .body(
+                json!({
+                    "command": "curl",
+                    "args": ["-sf", "-o", "/dev/null", format!("http://localhost:{port}/")],
+                    "timeoutMs": 2000
+                })
+                .to_string(),
+            )
+            .send()
+            .await;
+        if let Ok(resp) = check {
+            if let Ok(val) = resp.json::<Value>().await {
+                if val["exitCode"] == 0 {
+                    break;
+                }
+            }
+        }
+    }
+
+    process_id
+}
+
+/// Stop a process started via the process API.
+async fn stop_http_server(app: &docker_support::DockerApp, process_id: &str) {
+    let client = reqwest::Client::new();
+    let _ = client
+        .post(app.http_url(&format!("/v1/processes/{process_id}/kill")))
+        .send()
+        .await;
+}
+
 #[tokio::test]
 #[serial]
 async fn v1_browser_crawl() {
     let test_app = TestApp::new(AuthConfig::disabled());
+    let http_port: u16 = 18765;
+
+    // Write the 3 linked test HTML pages
+    write_test_file(
+        &test_app.app,
+        "/tmp/crawl-test/page-a.html",
+        TEST_HTML_CRAWL_A,
+    )
+    .await;
+    write_test_file(
+        &test_app.app,
+        "/tmp/crawl-test/page-b.html",
+        TEST_HTML_CRAWL_B,
+    )
+    .await;
+    write_test_file(
+        &test_app.app,
+        "/tmp/crawl-test/page-c.html",
+        TEST_HTML_CRAWL_C,
+    )
+    .await;
+
+    // Start a local HTTP server inside the container to serve the test pages.
+    let server_pid = start_http_server(&test_app.app, "/tmp/crawl-test", http_port).await;
 
     // Start browser
     let (status, _, body) = send_request(
@@ -1023,10 +1109,7 @@ async fn v1_browser_crawl() {
         String::from_utf8_lossy(&body)
     );
 
-    // Write the 3 linked test HTML pages
-    write_test_file(&test_app.app, "/tmp/page-a.html", TEST_HTML_CRAWL_A).await;
-    write_test_file(&test_app.app, "/tmp/page-b.html", TEST_HTML_CRAWL_B).await;
-    write_test_file(&test_app.app, "/tmp/page-c.html", TEST_HTML_CRAWL_C).await;
+    let base_url = format!("http://localhost:{http_port}");
 
     // Crawl starting from page-a with maxDepth=2, extract=text
     let (status, _, body) = send_request(
@@ -1034,7 +1117,7 @@ async fn v1_browser_crawl() {
         Method::POST,
         "/v1/browser/crawl",
         Some(json!({
-            "url": "file:///tmp/page-a.html",
+            "url": format!("{base_url}/page-a.html"),
             "maxDepth": 2,
             "extract": "text"
         })),
@@ -1096,7 +1179,7 @@ async fn v1_browser_crawl() {
         Method::POST,
         "/v1/browser/crawl",
         Some(json!({
-            "url": "file:///tmp/page-a.html",
+            "url": format!("{base_url}/page-a.html"),
             "maxPages": 1,
             "maxDepth": 2,
             "extract": "text"
@@ -1114,8 +1197,35 @@ async fn v1_browser_crawl() {
         "should be truncated when more pages exist"
     );
 
+    // Test that file:// URLs are rejected with 400
+    let (status, _, body) = send_request(
+        &test_app.app,
+        Method::POST,
+        "/v1/browser/crawl",
+        Some(json!({
+            "url": "file:///etc/passwd",
+            "extract": "text"
+        })),
+        &[],
+    )
+    .await;
+    assert_eq!(
+        status,
+        StatusCode::BAD_REQUEST,
+        "file:// URL should be rejected: {}",
+        String::from_utf8_lossy(&body)
+    );
+    let parsed = parse_json(&body);
+    assert_eq!(
+        parsed["code"], "browser/invalid-url",
+        "should return invalid-url error code"
+    );
+
     // Stop browser
     let (status, _, _) =
         send_request(&test_app.app, Method::POST, "/v1/browser/stop", None, &[]).await;
     assert_eq!(status, StatusCode::OK);
+
+    // Clean up the HTTP server
+    stop_http_server(&test_app.app, &server_pid).await;
 }