From a2168ce561d31c2f0a5aa5cd91f40e5904eee3f9 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 17 Mar 2026 15:18:38 -0700 Subject: [PATCH] feat: [US-031] - Fix crawl navigation status: use real HTTP status instead of faked 200 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../sandbox-agent/src/browser_crawl.rs | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/server/packages/sandbox-agent/src/browser_crawl.rs b/server/packages/sandbox-agent/src/browser_crawl.rs index 09ce713..3e867b5 100644 --- a/server/packages/sandbox-agent/src/browser_crawl.rs +++ b/server/packages/sandbox-agent/src/browser_crawl.rs @@ -1,5 +1,7 @@ use std::collections::{HashSet, VecDeque}; +use serde_json::Value; +use tokio::sync::mpsc; use url::Url; use crate::browser_cdp::CdpClient; @@ -44,6 +46,8 @@ pub async fn crawl_pages( visited.insert(normalize_url(&request.url)); cdp.send("Page.enable", None).await?; + cdp.send("Network.enable", None).await?; + let mut network_rx = cdp.subscribe("Network.responseReceived").await; while let Some((url, depth)) = queue.pop_front() { if pages.len() >= max_pages { @@ -55,7 +59,30 @@ pub async fn crawl_pages( .send("Page.navigate", Some(serde_json::json!({ "url": url }))) .await?; - let status = nav_result.get("frameId").map(|_| 200u16); + // If Page.navigate returns an errorText, the navigation failed (e.g. DNS + // error, connection refused). Record the page with no status and skip + // content/link extraction. + let error_text = nav_result + .get("errorText") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()); + if error_text.is_some() { + pages.push(BrowserCrawlPage { + url, + title: String::new(), + content: String::new(), + links: vec![], + status: None, + depth, + }); + continue; + } + + let frame_id = nav_result + .get("frameId") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); // Wait for page load by polling document.readyState until "complete". 
// Polls every 100ms with a 10s timeout; proceeds with extraction if timeout reached. @@ -88,6 +115,12 @@ pub async fn crawl_pages( tokio::time::sleep(poll_interval).await; } + // Capture the real HTTP status from Network.responseReceived events. + // After readyState is "complete", all document-level network events should + // have been buffered. We drain them looking for the last Document response + // matching this frame (the last one handles redirects correctly). + let status = drain_navigation_status(&mut network_rx, &frame_id); + // Get page info. let (page_url, title) = get_page_info(cdp).await?; @@ -139,6 +172,33 @@ pub async fn crawl_pages( }) } +/// Drain buffered `Network.responseReceived` events and return the HTTP status +/// of the last Document-type response matching the given frame ID. +/// +/// Takes the *last* matching status to handle redirect chains correctly (the +/// final destination response comes last). Returns `None` for non-HTTP schemes +/// like `file://` where no network events are emitted. +fn drain_navigation_status( + network_rx: &mut mpsc::UnboundedReceiver<Value>, + frame_id: &str, +) -> Option<u16> { + let mut status = None; + while let Ok(event) = network_rx.try_recv() { + let event_frame = event.get("frameId").and_then(|v| v.as_str()).unwrap_or(""); + let event_type = event.get("type").and_then(|v| v.as_str()).unwrap_or(""); + if event_frame == frame_id && event_type == "Document" { + if let Some(s) = event + .get("response") + .and_then(|r| r.get("status")) + .and_then(|s| s.as_u64()) + { + status = Some(s as u16); + } + } + } + status +} + /// Normalize a URL by removing the fragment for deduplication. fn normalize_url(url: &str) -> String { if let Ok(mut parsed) = Url::parse(url) {