From c96558e52394eb046a6a214e4a36d52c0aa4c521 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 17 Mar 2026 17:03:56 -0700 Subject: [PATCH] feat: [US-042] - Handle CDP connection death on Chromium crash Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/ralph/prd.json | 2 +- scripts/ralph/progress.txt | 10 ++++++++++ server/packages/sandbox-agent/src/browser_cdp.rs | 8 ++++++++ .../packages/sandbox-agent/src/browser_runtime.rs | 13 ++++++++++++- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/scripts/ralph/prd.json b/scripts/ralph/prd.json index 055c437..6fdfd09 100644 --- a/scripts/ralph/prd.json +++ b/scripts/ralph/prd.json @@ -684,7 +684,7 @@ "Typecheck passes" ], "priority": 42, - "passes": false, + "passes": true, "notes": "If Chromium crashes, RestartPolicy::Always restarts it but the CdpClient holds a dead WebSocket. State stays Active but all CDP operations fail. No recovery without manual stop+start." }, { diff --git a/scripts/ralph/progress.txt b/scripts/ralph/progress.txt index 28234b7..d3abf76 100644 --- a/scripts/ralph/progress.txt +++ b/scripts/ralph/progress.txt @@ -716,3 +716,13 @@ Started: Tue Mar 17 04:32:06 AM PDT 2026 - PUT /v1/fs/file auto-creates parent directories, so no need for separate mkdir calls - BrowserProblem extensions are flattened into the ProblemDetails JSON response (e.g. `parsed["code"]` not `parsed["extensions"]["code"]`) --- + +## 2026-03-17 - US-042 +- Added `is_alive()` method to CdpClient that checks if the reader_task JoinHandle is finished +- Updated `refresh_status_locked()` in BrowserRuntime to check CDP client liveness after the Chromium process check; transitions to Failed state if the CDP WebSocket connection has died +- Changed Chromium's RestartPolicy from Always to Never so a crash transitions cleanly to Failed state instead of auto-restarting with a stale CDP connection +- Files changed: `browser_cdp.rs`, `browser_runtime.rs` +- **Learnings for future iterations:** + - `JoinHandle::is_finished()` is a non-blocking way to check if a spawned task has exited - useful for health checks without await + - RestartPolicy is declared but not actually enforced by ProcessRuntime (the field has `#[allow(dead_code)]`), so changing it is primarily a signal for future implementation and prevents accidental auto-restart if it gets wired up +--- diff --git a/server/packages/sandbox-agent/src/browser_cdp.rs b/server/packages/sandbox-agent/src/browser_cdp.rs index 1567b81..6dcd7c1 100644 --- a/server/packages/sandbox-agent/src/browser_cdp.rs +++ b/server/packages/sandbox-agent/src/browser_cdp.rs @@ -158,6 +158,14 @@ impl CdpClient { rx } + /// Check whether the CDP WebSocket connection is still alive. + /// + /// Returns `true` if the background reader task is still running, which + /// means the WebSocket connection has not been closed or errored. + pub fn is_alive(&self) -> bool { + !self.reader_task.is_finished() + } + /// Close the CDP connection and stop the reader task. pub async fn close(&self) { self.reader_task.abort(); diff --git a/server/packages/sandbox-agent/src/browser_runtime.rs b/server/packages/sandbox-agent/src/browser_runtime.rs index 4d17f02..4d54d54 100644 --- a/server/packages/sandbox-agent/src/browser_runtime.rs +++ b/server/packages/sandbox-agent/src/browser_runtime.rs @@ -757,6 +757,17 @@ impl BrowserRuntime { } } } + + // Check CDP WebSocket connection is alive + if let Some(ref cdp) = state.cdp_client { + if !cdp.is_alive() { + let problem = + BrowserProblem::cdp_error("CDP WebSocket connection died unexpectedly"); + self.record_problem_locked(state, &problem); + state.state = BrowserState::Failed; + return; + } + } } fn snapshot_locked(&self, state: &BrowserRuntimeStateData) -> BrowserStatusResponse { @@ -910,7 +921,7 @@ impl BrowserRuntime { tty: false, interactive: false, owner: ProcessOwner::Desktop, - restart_policy: Some(RestartPolicy::Always), + restart_policy: Some(RestartPolicy::Never), }) .await .map_err(|err| {