mirror of
https://github.com/harivansh-afk/sandbox-agent.git
synced 2026-04-15 07:04:48 +00:00
feat: [US-031] - Fix crawl navigation status: use real HTTP status instead of faked 200
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
117b9c2c42
commit
a2168ce561
1 changed files with 61 additions and 1 deletions
|
|
@ -1,5 +1,7 @@
|
||||||
use std::collections::{HashSet, VecDeque};
|
use std::collections::{HashSet, VecDeque};
|
||||||
|
|
||||||
|
use serde_json::Value;
|
||||||
|
use tokio::sync::mpsc;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use crate::browser_cdp::CdpClient;
|
use crate::browser_cdp::CdpClient;
|
||||||
|
|
@ -44,6 +46,8 @@ pub async fn crawl_pages(
|
||||||
visited.insert(normalize_url(&request.url));
|
visited.insert(normalize_url(&request.url));
|
||||||
|
|
||||||
cdp.send("Page.enable", None).await?;
|
cdp.send("Page.enable", None).await?;
|
||||||
|
cdp.send("Network.enable", None).await?;
|
||||||
|
let mut network_rx = cdp.subscribe("Network.responseReceived").await;
|
||||||
|
|
||||||
while let Some((url, depth)) = queue.pop_front() {
|
while let Some((url, depth)) = queue.pop_front() {
|
||||||
if pages.len() >= max_pages {
|
if pages.len() >= max_pages {
|
||||||
|
|
@ -55,7 +59,30 @@ pub async fn crawl_pages(
|
||||||
.send("Page.navigate", Some(serde_json::json!({ "url": url })))
|
.send("Page.navigate", Some(serde_json::json!({ "url": url })))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let status = nav_result.get("frameId").map(|_| 200u16);
|
// If Page.navigate returns an errorText, the navigation failed (e.g. DNS
|
||||||
|
// error, connection refused). Record the page with no status and skip
|
||||||
|
// content/link extraction.
|
||||||
|
let error_text = nav_result
|
||||||
|
.get("errorText")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.filter(|s| !s.is_empty());
|
||||||
|
if error_text.is_some() {
|
||||||
|
pages.push(BrowserCrawlPage {
|
||||||
|
url,
|
||||||
|
title: String::new(),
|
||||||
|
content: String::new(),
|
||||||
|
links: vec![],
|
||||||
|
status: None,
|
||||||
|
depth,
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let frame_id = nav_result
|
||||||
|
.get("frameId")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
// Wait for page load by polling document.readyState until "complete".
|
// Wait for page load by polling document.readyState until "complete".
|
||||||
// Polls every 100ms with a 10s timeout; proceeds with extraction if timeout reached.
|
// Polls every 100ms with a 10s timeout; proceeds with extraction if timeout reached.
|
||||||
|
|
@ -88,6 +115,12 @@ pub async fn crawl_pages(
|
||||||
tokio::time::sleep(poll_interval).await;
|
tokio::time::sleep(poll_interval).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Capture the real HTTP status from Network.responseReceived events.
|
||||||
|
// After readyState is "complete", all document-level network events should
|
||||||
|
// have been buffered. We drain them looking for the last Document response
|
||||||
|
// matching this frame (the last one handles redirects correctly).
|
||||||
|
let status = drain_navigation_status(&mut network_rx, &frame_id);
|
||||||
|
|
||||||
// Get page info.
|
// Get page info.
|
||||||
let (page_url, title) = get_page_info(cdp).await?;
|
let (page_url, title) = get_page_info(cdp).await?;
|
||||||
|
|
||||||
|
|
@ -139,6 +172,33 @@ pub async fn crawl_pages(
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Drain buffered `Network.responseReceived` events and return the HTTP status
|
||||||
|
/// of the last Document-type response matching the given frame ID.
|
||||||
|
///
|
||||||
|
/// Takes the *last* matching status to handle redirect chains correctly (the
|
||||||
|
/// final destination response comes last). Returns `None` for non-HTTP schemes
|
||||||
|
/// like `file://` where no network events are emitted.
|
||||||
|
fn drain_navigation_status(
|
||||||
|
network_rx: &mut mpsc::UnboundedReceiver<Value>,
|
||||||
|
frame_id: &str,
|
||||||
|
) -> Option<u16> {
|
||||||
|
let mut status = None;
|
||||||
|
while let Ok(event) = network_rx.try_recv() {
|
||||||
|
let event_frame = event.get("frameId").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
let event_type = event.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
if event_frame == frame_id && event_type == "Document" {
|
||||||
|
if let Some(s) = event
|
||||||
|
.get("response")
|
||||||
|
.and_then(|r| r.get("status"))
|
||||||
|
.and_then(|s| s.as_u64())
|
||||||
|
{
|
||||||
|
status = Some(s as u16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
status
|
||||||
|
}
|
||||||
|
|
||||||
/// Normalize a URL by removing the fragment for deduplication.
|
/// Normalize a URL by removing the fragment for deduplication.
|
||||||
fn normalize_url(url: &str) -> String {
|
fn normalize_url(url: &str) -> String {
|
||||||
if let Ok(mut parsed) = Url::parse(url) {
|
if let Ok(mut parsed) = Url::parse(url) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue