chore: improve sandbox test infrastructure (#32)

Enhance the sandbox testing script with better error handling and reporting capabilities.
2026-04-16 20:01:27 +00:00 · 2026-01-29 07:19:23 -08:00 · 2026-01-29 07:19:23 -08:00 · 2ada366623
commit 2ada366623
parent bd8f6b9c97
1 changed files with 157 additions and 42 deletions
--- a/scripts/sandbox-testing/test-sandbox.ts
+++ b/scripts/sandbox-testing/test-sandbox.ts
@ -8,11 +8,12 @@
 * Providers: daytona, e2b, docker
 *
 * Options:
- *   --skip-build     Skip cargo build step
+ *   --skip-build          Skip cargo build step
- *   --use-release    Use pre-built release binary from releases.rivet.dev
+ *   --use-release         Use pre-built release binary from releases.rivet.dev
- *   --agent <name>   Test specific agent (claude, codex, mock)
+ *   --agent <name>        Test specific agent (claude, codex, mock)
- *   --keep-alive     Don't cleanup sandbox after test
+ *   --skip-agent-install  Skip pre-installing agents (tests on-demand install like Daytona example)
- *   --verbose        Show all logs
+ *   --keep-alive          Don't cleanup sandbox after test
 *   --verbose             Show all logs
 */
 import { execSync, spawn } from "node:child_process";
@ -30,6 +31,7 @@ const args = process.argv.slice(2);
 const provider = args.find((a) => !a.startsWith("--")) || "docker";
 const skipBuild = args.includes("--skip-build");
 const useRelease = args.includes("--use-release");
 const skipAgentInstall = args.includes("--skip-agent-install");
 const keepAlive = args.includes("--keep-alive");
 const verbose = args.includes("--verbose");
 const agentArg = args.find((a) => a.startsWith("--agent="))?.split("=")[1];
@ -117,7 +119,7 @@ interface Sandbox {
 }
 // Docker provider
-// Uses Alpine because Claude Code binary is built for musl libc
+// Uses Ubuntu because Claude Code and sandbox-agent are glibc binaries
 const dockerProvider: SandboxProvider = {
 	name: "docker",
 	requiredEnv: [],
@ -129,12 +131,12 @@ const dockerProvider: SandboxProvider = {
 		log.info(`Creating Docker container: ${id}`);
 		execSync(
-			`docker run -d --name ${id} ${envArgs} -p 0:3000 alpine:latest tail -f /dev/null`,
+			`docker run -d --name ${id} ${envArgs} -p 0:3000 ubuntu:22.04 tail -f /dev/null`,
 			{ stdio: verbose ? "inherit" : "pipe" },
 		);
-		// Install dependencies (Alpine uses apk; musl C++ libs needed for Claude Code)
+		// Install dependencies
-		execSync(`docker exec ${id} sh -c "apk add --no-cache curl ca-certificates libstdc++ libgcc bash"`, {
+		execSync(`docker exec ${id} sh -c "apt-get update && apt-get install -y curl ca-certificates"`, {
 			stdio: verbose ? "inherit" : "pipe",
 		});
@ -153,6 +155,7 @@ const dockerProvider: SandboxProvider = {
 			},
 			async upload(localPath, remotePath) {
 				execSync(`docker cp "${localPath}" ${id}:${remotePath}`, { stdio: verbose ? "inherit" : "pipe" });
 				execSync(`docker exec ${id} chmod +x ${remotePath}`, { stdio: verbose ? "inherit" : "pipe" });
 			},
 			async getBaseUrl(port) {
 				const portMapping = execSync(`docker port ${id} ${port}`, { encoding: "utf-8" }).trim();
@ -191,15 +194,17 @@ const daytonaProvider: SandboxProvider = {
 			id,
 			async exec(cmd) {
 				const result = await sandbox.process.executeCommand(cmd);
 				// Daytona SDK returns: { exitCode, result: string, artifacts: { stdout: string } }
 				return {
-					stdout: result.result.output || "",
+					stdout: result.result || "",
-					stderr: result.result.error || "",
+					stderr: "",
-					exitCode: result.result.exitCode,
+					exitCode: result.exitCode ?? 0,
 				};
 			},
 			async upload(localPath, remotePath) {
 				const content = readFileSync(localPath);
-				await sandbox.fs.uploadFile(remotePath, content);
+				// Daytona SDK signature: uploadFile(Buffer, remotePath)
 				await sandbox.fs.uploadFile(content, remotePath);
 				await sandbox.process.executeCommand(`chmod +x ${remotePath}`);
 			},
 			async getBaseUrl(port) {
@ -355,26 +360,10 @@ async function startServerAndCheckHealth(sandbox: Sandbox): Promise<string> {
 	throw new Error("Health check failed after 30 seconds");
 }
-// Test agent interaction
+// Send a message and wait for response, auto-approving permissions
-async function testAgent(baseUrl: string, agent: string, message: string): Promise<void> {
+// Returns the response text
-	log.section(`Testing ${agent} agent`);
+async function sendMessage(baseUrl: string, sessionId: string, message: string): Promise<string> {
-
+	log.info(`Sending message: "${message.slice(0, 60)}${message.length > 60 ? "..." : ""}"`);
 	const sessionId = crypto.randomUUID();
 	// Create session
 	log.info(`Creating session ${sessionId}...`);
 	const createRes = await fetch(`${baseUrl}/v1/sessions/${sessionId}`, {
 		method: "POST",
 		headers: { "Content-Type": "application/json" },
 		body: JSON.stringify({ agent }),
 	});
 	if (!createRes.ok) {
 		throw new Error(`Failed to create session: ${await createRes.text()}`);
 	}
 	log.success("Session created");
 	// Send message with streaming
 	log.info(`Sending message: "${message}"`);
 	const msgRes = await fetch(`${baseUrl}/v1/sessions/${sessionId}/messages/stream`, {
 		method: "POST",
 		headers: { "Content-Type": "application/json" },
@ -388,9 +377,11 @@ async function testAgent(baseUrl: string, agent: string, message: string): Promi
 	const reader = msgRes.body.getReader();
 	const decoder = new TextDecoder();
 	let buffer = "";
 	let responseText = "";
 	let receivedText = false;
 	let hasError = false;
 	let errorMessage = "";
 	let pendingPermission: string | null = null;
 	while (true) {
 		const { done, value } = await reader.read();
@ -418,6 +409,16 @@ async function testAgent(baseUrl: string, agent: string, message: string): Promi
 							receivedText = true;
 						}
 						process.stdout.write(text);
 						responseText += text;
 					}
 				}
 				// Handle permission requests - auto-approve
 				if (event.type === "permission.requested") {
 					const permissionId = event.data?.permission_id;
 					if (permissionId) {
 						pendingPermission = permissionId;
 						log.info(`Permission requested (${permissionId}), auto-approving...`);
 					}
 				}
@ -427,22 +428,123 @@ async function testAgent(baseUrl: string, agent: string, message: string): Promi
 					log.error(`Error event: ${errorMessage}`);
 				}
-				if (event.type === "session.ended") {
+				if (event.type === "agent.unparsed") {
-					const reason = event.data?.reason;
+					hasError = true;
-					log.info(`Session ended: ${reason || "unknown reason"}`);
+					errorMessage = `Agent unparsed: ${JSON.stringify(event.data)}`;
 					log.error(errorMessage);
 				}
 			} catch {}
 		}
 		// If we have a pending permission, approve it
 		if (pendingPermission) {
 			const permId = pendingPermission;
 			pendingPermission = null;
 			try {
 				const approveRes = await fetch(`${baseUrl}/v1/sessions/${sessionId}/permissions/${permId}/reply`, {
 					method: "POST",
 					headers: { "Content-Type": "application/json" },
 					body: JSON.stringify({ reply: "once" }),
 				});
 				if (approveRes.ok) {
 					log.success(`Permission ${permId} approved`);
 				} else {
 					log.warn(`Failed to approve permission: ${await approveRes.text()}`);
 				}
 			} catch (err) {
 				log.warn(`Error approving permission: ${err}`);
 			}
 		}
 	}
 	if (receivedText) {
 		console.log(); // newline after response
-		log.success("Received response from agent");
+	}
-	} else if (hasError) {
+
 	if (hasError) {
 		throw new Error(`Agent returned error: ${errorMessage}`);
-	} else {
+	}
 	return responseText;
 }
 // Test agent interaction
 async function testAgent(baseUrl: string, agent: string, message: string): Promise<void> {
 	log.section(`Testing ${agent} agent`);
 	const sessionId = crypto.randomUUID();
 	// Create session
 	log.info(`Creating session ${sessionId}...`);
 	const createRes = await fetch(`${baseUrl}/v1/sessions/${sessionId}`, {
 		method: "POST",
 		headers: { "Content-Type": "application/json" },
 		body: JSON.stringify({ agent }),
 	});
 	if (!createRes.ok) {
 		throw new Error(`Failed to create session: ${await createRes.text()}`);
 	}
 	log.success("Session created");
 	const response = await sendMessage(baseUrl, sessionId, message);
 	if (!response) {
 		throw new Error("No response received from agent");
 	}
 	log.success("Received response from agent");
 }
 // Test that agent can actually modify files and run commands
 async function testAgentActions(baseUrl: string, agent: string, sandbox: Sandbox): Promise<void> {
 	log.section(`Testing ${agent} agent actions (file + command)`);
 	const sessionId = crypto.randomUUID();
 	const testFile = "/tmp/sandbox-test-file.txt";
 	const expectedContent = "Hello from sandbox test!";
 	// For Claude running as root in containers, we must use default permission mode
 	// and handle permissions via the API (bypass mode is not supported as root).
 	// For other agents, we can use bypass mode.
 	const permissionMode = agent === "claude" ? "default" : "bypass";
 	log.info(`Creating session ${sessionId} with permissionMode=${permissionMode}...`);
 	const createRes = await fetch(`${baseUrl}/v1/sessions/${sessionId}`, {
 		method: "POST",
 		headers: { "Content-Type": "application/json" },
 		body: JSON.stringify({ agent, permissionMode }),
 	});
 	if (!createRes.ok) {
 		throw new Error(`Failed to create session: ${await createRes.text()}`);
 	}
 	log.success("Session created");
 	// Ask agent to create a file
 	const fileMessage = `Create a file at ${testFile} with exactly this content (no quotes, no extra text): ${expectedContent}`;
 	await sendMessage(baseUrl, sessionId, fileMessage);
 	// Verify file was created
 	log.info("Verifying file was created...");
 	const fileCheck = await sandbox.exec(`cat ${testFile} 2>&1`);
 	if (fileCheck.exitCode !== 0) {
 		throw new Error(`File was not created: ${fileCheck.stderr || fileCheck.stdout}`);
 	}
 	if (!fileCheck.stdout.includes("Hello from sandbox test")) {
 		throw new Error(`File content mismatch. Expected "${expectedContent}", got "${fileCheck.stdout.trim()}"`);
 	}
 	log.success(`File created with correct content: "${fileCheck.stdout.trim()}"`);
 	// Ask agent to run a command and create output
 	const cmdMessage = `Run this command and tell me the output: echo "command-test-$(date +%s)" > /tmp/cmd-output.txt && cat /tmp/cmd-output.txt`;
 	await sendMessage(baseUrl, sessionId, cmdMessage);
 	// Verify command was executed
 	log.info("Verifying command was executed...");
 	const cmdCheck = await sandbox.exec("cat /tmp/cmd-output.txt 2>&1");
 	if (cmdCheck.exitCode !== 0) {
 		throw new Error(`Command output file not found: ${cmdCheck.stderr || cmdCheck.stdout}`);
 	}
 	if (!cmdCheck.stdout.includes("command-test-")) {
 		throw new Error(`Command output mismatch. Expected "command-test-*", got "${cmdCheck.stdout.trim()}"`);
 	}
 	log.success(`Command executed successfully: "${cmdCheck.stdout.trim()}"`);
 }
 // Check environment diagnostics
@ -453,9 +555,10 @@ async function checkEnvironment(sandbox: Sandbox): Promise<void> {
 		{ name: "Environment variables", cmd: "env | grep -E 'ANTHROPIC|OPENAI|CLAUDE|CODEX' | sed 's/=.*/=<set>/'" },
 		// Check both /root (Alpine) and /home/user (E2B/Debian) paths
 		{ name: "Agent binaries", cmd: "ls -la ~/.local/share/sandbox-agent/bin/ 2>/dev/null || ls -la /root/.local/share/sandbox-agent/bin/ 2>/dev/null || ls -la /home/user/.local/share/sandbox-agent/bin/ 2>/dev/null || echo 'No agents installed'" },
 		{ name: "Claude version", cmd: "~/.local/share/sandbox-agent/bin/claude --version 2>&1 || /root/.local/share/sandbox-agent/bin/claude --version 2>&1 || echo 'Claude not installed'" },
 		{ name: "sandbox-agent version", cmd: "sandbox-agent --version 2>/dev/null || echo 'Not installed'" },
 		{ name: "Server process", cmd: "pgrep -a sandbox-agent 2>/dev/null || ps aux | grep sandbox-agent | grep -v grep || echo 'Not running'" },
-		{ name: "Server logs (last 20 lines)", cmd: "tail -20 /tmp/sandbox-agent.log 2>/dev/null || echo 'No logs'" },
+		{ name: "Server logs (last 50 lines)", cmd: "tail -50 /tmp/sandbox-agent.log 2>/dev/null || echo 'No logs'" },
 		{ name: "Network: api.anthropic.com", cmd: "curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 https://api.anthropic.com/v1/messages 2>&1 || echo 'UNREACHABLE'" },
 		{ name: "Network: api.openai.com", cmd: "curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 https://api.openai.com/v1/models 2>&1 || echo 'UNREACHABLE'" },
 	];
@ -519,8 +622,12 @@ async function main() {
 		// Install sandbox-agent
 		await installSandboxAgent(sandbox, binaryPath);
-		// Install agents
+		// Install agents (unless --skip-agent-install to test on-demand install like Daytona example)
-		await installAgents(sandbox, agents);
+		if (skipAgentInstall) {
 			log.info("Skipping agent pre-install (testing on-demand installation)");
 		} else {
 			await installAgents(sandbox, agents);
 		}
 		// Check environment
 		await checkEnvironment(sandbox);
@ -530,8 +637,16 @@ async function main() {
 		// Test each agent
 		for (const agent of agents) {
 			// Basic response test
 			const message = agent === "mock" ? "hello" : "Say hello in 10 words or less";
 			await testAgent(baseUrl, agent, message);
 			// For real agents, also test file/command actions with permission handling.
 			// Claude uses default permission mode and we auto-approve via API.
 			// Other agents can use bypass mode.
 			if (agent !== "mock") {
 				await testAgentActions(baseUrl, agent, sandbox);
 			}
 		}
 		log.section("All tests passed!");