diff --git a/docs/credentials.mdx b/docs/credentials.mdx new file mode 100644 index 0000000..ce1ce7b --- /dev/null +++ b/docs/credentials.mdx @@ -0,0 +1,144 @@ +--- +title: "Credentials" +description: "How sandbox-agent discovers and uses provider credentials." +icon: "key" +--- + +Sandbox-agent automatically discovers API credentials from environment variables and agent config files. Credentials are used to authenticate with AI providers (Anthropic, OpenAI) when spawning agents. + +## Credential sources + +Credentials are extracted in priority order. The first valid credential found for each provider is used. + +### Environment variables (highest priority) + +**API keys** (checked first): + +| Variable | Provider | +|----------|----------| +| `ANTHROPIC_API_KEY` | Anthropic | +| `CLAUDE_API_KEY` | Anthropic (fallback) | +| `OPENAI_API_KEY` | OpenAI | +| `CODEX_API_KEY` | OpenAI (fallback) | + +**OAuth tokens** (checked if no API key found): + +| Variable | Provider | +|----------|----------| +| `CLAUDE_CODE_OAUTH_TOKEN` | Anthropic (OAuth) | +| `ANTHROPIC_AUTH_TOKEN` | Anthropic (OAuth fallback) | + +OAuth tokens from environment variables are only used when `include_oauth` is enabled (the default). + +### Agent config files + +If no environment variable is set, sandbox-agent checks agent-specific config files: + +| Agent | Config path | Provider | +|-------|-------------|----------| +| Amp | `~/.amp/config.json` | Anthropic | +| Claude Code | `~/.claude.json`, `~/.claude/.credentials.json` | Anthropic | +| Codex | `~/.codex/auth.json` | OpenAI | +| OpenCode | `~/.local/share/opencode/auth.json` | Both | + +OAuth tokens are supported for Claude Code, Codex, and OpenCode. Expired tokens are automatically skipped. 
+ +## Provider requirements by agent + +| Agent | Required provider | +|-------|-------------------| +| Claude Code | Anthropic | +| Amp | Anthropic | +| Codex | OpenAI | +| OpenCode | Anthropic or OpenAI | +| Mock | None | + +## Error handling behavior + +Sandbox-agent uses a **best-effort, fail-forward** approach to credentials: + +### Extraction failures are silent + +If a config file is missing, unreadable, or malformed, extraction continues to the next source. No errors are thrown. Missing credentials simply mean the provider is marked as unavailable. + +``` +~/.claude.json missing → try ~/.claude/.credentials.json +~/.claude/.credentials.json missing → try OpenCode config +All sources exhausted → anthropic = None (not an error) +``` + +### Agents spawn without credential validation + +When you send a message to a session, sandbox-agent does **not** pre-validate credentials. The agent process is spawned with whatever credentials were found (or none), and the agent's native error surfaces if authentication fails. + +This design: +- Lets you test agent error handling behavior +- Avoids duplicating provider-specific auth validation +- Ensures sandbox-agent faithfully proxies agent behavior + +For example, sending a message to Claude Code without Anthropic credentials will spawn the agent, which will then emit its own "ANTHROPIC_API_KEY not set" error through the event stream. + +## Checking credential status + +### API endpoint + +The `GET /v1/agents` endpoint includes a `credentialsAvailable` field for each agent: + +```json +{ + "agents": [ + { + "id": "claude", + "installed": true, + "credentialsAvailable": true, + ... + }, + { + "id": "codex", + "installed": true, + "credentialsAvailable": false, + ... + } + ] +} +``` + +### TypeScript SDK + +```typescript +const { agents } = await client.listAgents(); +for (const agent of agents) { + console.log(`${agent.id}: ${agent.credentialsAvailable ? 
'authenticated' : 'no credentials'}`); +} +``` + +### OpenCode compatibility + +The `/opencode/provider` endpoint returns a `connected` array listing providers with valid credentials: + +```json +{ + "all": [...], + "connected": ["claude", "mock"] +} +``` + +## Passing credentials explicitly + +You can override auto-discovered credentials by setting environment variables before starting sandbox-agent: + +```bash +export ANTHROPIC_API_KEY=sk-ant-... +export OPENAI_API_KEY=sk-... +sandbox-agent daemon start +``` + +Or when using the SDK in embedded mode: + +```typescript +const client = await SandboxAgentClient.spawn({ + env: { + ANTHROPIC_API_KEY: process.env.MY_ANTHROPIC_KEY, + }, +}); +``` diff --git a/docs/docs.json b/docs/docs.json index 61bfbf9..f881604 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -70,6 +70,7 @@ "cli", "inspector", "session-transcript-schema", + "credentials", "gigacode", { "group": "AI", diff --git a/docs/openapi.json b/docs/openapi.json index 7b46e7d..2c4444b 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -805,12 +805,17 @@ "required": [ "id", "installed", + "credentialsAvailable", "capabilities" ], "properties": { "capabilities": { "$ref": "#/components/schemas/AgentCapabilities" }, + "credentialsAvailable": { + "type": "boolean", + "description": "Whether the agent's required provider credentials are available" + }, "id": { "type": "string" }, diff --git a/frontend/packages/inspector/src/components/debug/AgentsTab.tsx b/frontend/packages/inspector/src/components/debug/AgentsTab.tsx index 1d6216c..65222d8 100644 --- a/frontend/packages/inspector/src/components/debug/AgentsTab.tsx +++ b/frontend/packages/inspector/src/components/debug/AgentsTab.tsx @@ -39,6 +39,7 @@ const AgentsTab = ({ : defaultAgents.map((id) => ({ id, installed: false, + credentialsAvailable: false, version: undefined, path: undefined, capabilities: emptyFeatureCoverage @@ -49,6 +50,9 @@ const AgentsTab = ({ {agent.installed ? 
"Installed" : "Missing"} + + {agent.credentialsAvailable ? "Authenticated" : "No Credentials"} +
{agent.version ? `v${agent.version}` : "Version unknown"} diff --git a/research/agents/amp.md b/research/agents/amp.md index ff314dd..c9bced7 100644 --- a/research/agents/amp.md +++ b/research/agents/amp.md @@ -415,6 +415,31 @@ if let Some(model) = options.model.as_deref() { 3. **Wait for Amp API** — Amp may add model/mode discovery in a future release 4. **Scrape ampcode.com** — Check if the web UI exposes available modes/models +## Command Execution & Process Management + +### Agent Tool Execution + +Amp executes commands via the `Bash` tool, similar to Claude Code. Synchronous execution, blocks the agent turn. Permission rules can pre-authorize specific commands: + +```typescript +{ tool: "Bash", matches: { command: "git *" }, action: "allow" } +``` + +### No User-Initiated Command Injection + +Amp does not expose any mechanism for external clients to inject command results into the agent's context. No `!` prefix equivalent, no command injection API. + +### Comparison + +| Capability | Supported? | Notes | +|-----------|-----------|-------| +| Agent runs commands | Yes (`Bash` tool) | Synchronous, blocks agent turn | +| User runs commands → agent sees output | No | | +| External API for command injection | No | | +| Command source tracking | No | | +| Background process management | No | Shell `&` only | +| PTY / interactive terminal | No | | + ## Notes - Amp is similar to Claude Code (same streaming format) diff --git a/research/agents/claude.md b/research/agents/claude.md index b78f278..450dd48 100644 --- a/research/agents/claude.md +++ b/research/agents/claude.md @@ -279,6 +279,44 @@ x-api-key: anthropic-version: 2023-06-01 ``` +## Command Execution & Process Management + +### Agent Tool Execution + +The agent executes commands via the `Bash` tool. This is synchronous - the agent blocks until the command exits. 
Tool schema: + +```json +{ + "command": "string", + "timeout": "number", + "workingDirectory": "string" +} +``` + +There is no background process support. If the agent needs a long-running process (e.g., dev server), it uses shell backgrounding (`&`) within a single `Bash` tool call. + +### User-Initiated Command Execution (`!` prefix) + +Claude Code's TUI supports `!command` syntax where the user types `!npm test` to run a command directly. The output is injected into the conversation as a user message so the agent can see it on the next turn. + +**This is a client-side TUI feature only.** It is not exposed in the API schema or streaming protocol. The CLI runs the command locally and stuffs the output into the next user message. There is no protocol-level concept of "user ran a command" vs "agent ran a command." + +### No External Command Injection API + +External clients (SDKs, frontends) cannot programmatically inject command results into Claude's conversation context. The only way to provide command output to the agent is: +- Include it in the user prompt text +- Use the `!` prefix in the interactive TUI + +### Comparison + +| Capability | Supported? | Notes | +|-----------|-----------|-------| +| Agent runs commands | Yes (`Bash` tool) | Synchronous, blocks agent turn | +| User runs commands → agent sees output | Yes (`!cmd` in TUI) | Client-side only, not in protocol | +| External API for command injection | No | | +| Background process management | No | Shell `&` only | +| PTY / interactive terminal | No | | + ## Notes - Claude CLI manages its own OAuth refresh internally diff --git a/research/agents/codex.md b/research/agents/codex.md index 8d3d970..d1e93ae 100644 --- a/research/agents/codex.md +++ b/research/agents/codex.md @@ -347,6 +347,68 @@ Requires a running Codex app-server process. 
Send the JSON-RPC request to the ap - Requires an active app-server process (cannot query models without starting one) - No standalone CLI command like `codex models` +## Command Execution & Process Management + +### Agent Tool Execution + +Codex executes commands via `LocalShellAction`. The agent proposes a command, and external clients approve/deny via JSON-RPC (`item/commandExecution/requestApproval`). + +### Command Source Tracking (`ExecCommandSource`) + +Codex is the only agent that explicitly tracks **who initiated a command** at the protocol level: + +```json +{ + "ExecCommandSource": { + "enum": ["agent", "user_shell", "unified_exec_startup", "unified_exec_interaction"] + } +} +``` + +| Source | Meaning | +|--------|---------| +| `agent` | Agent decided to run this command via tool call | +| `user_shell` | User ran a command in a shell (equivalent to Claude Code's `!` prefix) | +| `unified_exec_startup` | Startup script ran this command | +| `unified_exec_interaction` | Interactive execution | + +This means user-initiated shell commands are **first-class protocol events** in Codex, not a client-side hack like Claude Code's `!` prefix. + +### Command Execution Events + +Codex emits structured events for command execution: + +- `exec_command_begin` - Command started (includes `source`, `command`, `cwd`, `turn_id`) +- `exec_command_output_delta` - Streaming output chunk (includes `stream: stdout|stderr`) +- `exec_command_end` - Command completed (includes `exit_code`, `source`) + +### Parsed Command Analysis (`CommandAction`) + +Codex provides semantic analysis of what a command does: + +```json +{ + "commandActions": [ + { "type": "read", "path": "/src/main.ts" }, + { "type": "write", "path": "/src/utils.ts" }, + { "type": "install", "package": "lodash" } + ] +} +``` + +Action types: `read`, `write`, `listFiles`, `search`, `install`, `remove`, `other`. + +### Comparison + +| Capability | Supported? 
| Notes | +|-----------|-----------|-------| +| Agent runs commands | Yes (`LocalShellAction`) | With approval workflow | +| User runs commands → agent sees output | Yes (`user_shell` source) | First-class protocol event | +| External API for command injection | Yes (JSON-RPC approval) | Can approve/deny before execution | +| Command source tracking | Yes (`ExecCommandSource` enum) | Distinguishes agent vs user vs startup | +| Background process management | No | | +| PTY / interactive terminal | No | | + ## Notes - SDK is dynamically imported to reduce bundle size diff --git a/research/agents/opencode.md b/research/agents/opencode.md index 8708282..b698fb7 100644 --- a/research/agents/opencode.md +++ b/research/agents/opencode.md @@ -585,6 +585,60 @@ const response = await client.provider.list(); When an OpenCode server is running, call `GET /provider` on its HTTP port. Returns full model metadata including capabilities, costs, context limits, and modalities. +## Command Execution & Process Management + +### Agent Tool Execution + +The agent executes commands via internal tools (not exposed in the HTTP API). The agent's tool calls are synchronous within its turn. Tool parts have states: `pending`, `running`, `completed`, `error`. + +### PTY System (`/pty/*`) - User-Facing Terminals + +Separate from the agent's command execution. PTYs are server-scoped interactive terminals for the user: + +- `POST /pty` - Create PTY (command, args, cwd, title, env) +- `GET /pty` - List all PTYs +- `GET /pty/{ptyID}` - Get PTY info +- `PUT /pty/{ptyID}` - Update PTY (title, resize via `size: {rows, cols}`) +- `DELETE /pty/{ptyID}` - Kill and remove PTY +- `GET /pty/{ptyID}/connect` - WebSocket for bidirectional I/O + +PTY events (globally broadcast via SSE): `pty.created`, `pty.updated`, `pty.exited`, `pty.deleted`. + +The agent does NOT use the PTY system. PTYs are for the user's interactive terminal panel, independent of any AI session. 
+ +### Session Commands (`/session/{id}/command`, `/session/{id}/shell`) - Context Injection + +External clients can inject command results into an AI session's conversation context: + +- `POST /session/{sessionID}/command` - Executes a command and records the result as an `AssistantMessage` in the session. Required fields: `command`, `arguments`. The output becomes part of the AI's context for subsequent turns. +- `POST /session/{sessionID}/shell` - Similar but wraps in `sh -c`. Required fields: `command`, `agent`. +- `GET /command` - Lists available command definitions (metadata, not execution). + +Session commands emit `command.executed` events with `sessionID` + `messageID`. + +**Key distinction**: These endpoints execute commands directly (not via the AI), then inject the output into the session as if the AI produced it. The AI doesn't actively run the command - it just finds the output in its conversation history on the next turn. + +### Three Separate Execution Mechanisms + +| Mechanism | Who uses it | Scoped to | AI sees output? | +|-----------|-------------|-----------|----------------| +| Agent tools (internal) | AI agent | Session turn | Yes (immediate) | +| PTY (`/pty/*`) | User/frontend | Server (global) | No | +| Session commands (`/session/{id}/*`) | Frontend/SDK client | Session | Yes (next turn) | + +The agent has no tool to interact with PTYs and cannot access the session command endpoints. When the agent needs to run a background process, it uses its internal bash-equivalent tool with shell backgrounding (`&`). + +### Comparison + +| Capability | Supported? 
| Notes | +|-----------|-----------|-------| +| Agent runs commands | Yes (internal tools) | Synchronous, blocks agent turn | +| User runs commands → agent sees output | Yes (`/session/{id}/command`) | HTTP API, first-class | +| External API for command injection | Yes | Session-scoped endpoints | +| Command source tracking | Implicit | Endpoint implies source (no enum) | +| Background process management | No | Shell `&` only for agent | +| PTY / interactive terminal | Yes (`/pty/*`) | Server-scoped, WebSocket I/O | + ## Notes - OpenCode is the most feature-rich runtime (streaming, questions, permissions) diff --git a/research/process-terminal-design.md b/research/process-terminal-design.md new file mode 100644 index 0000000..d7ba8d5 --- /dev/null +++ b/research/process-terminal-design.md @@ -0,0 +1,374 @@ +# Research: Process & Terminal System Design + +Research on PTY/terminal and process management APIs across sandbox platforms, with design recommendations for sandbox-agent. + +## Competitive Landscape + +### Transport Comparison + +| Platform | PTY Transport | Command Transport | Unified? 
| +|----------|--------------|-------------------|----------| +| **OpenCode** | WebSocket (`/pty/{id}/connect`) | REST (session-scoped, AI-mediated) | No | +| **E2B** | gRPC server-stream (output) + unary RPC (input) | Same gRPC service | Yes | +| **Daytona** | WebSocket | REST | No | +| **Kubernetes** | WebSocket (channel byte mux) | Same WebSocket | Yes | +| **Docker** | HTTP connection hijack | Same connection | Yes | +| **Fly.io** | SSH over WireGuard | REST (sync, 60s max) | No | +| **Vercel Sandboxes** | No PTY API | REST SDK (async generator for logs) | N/A | +| **Gitpod** | gRPC (Listen=output, Write=input) | Same gRPC service | Yes | + +### Resize Mechanism + +| Platform | How | Notes | +|----------|-----|-------| +| **OpenCode** | `PUT /pty/{id}` with `size: {rows, cols}` | Separate REST call | +| **E2B** | Separate `Update` RPC | Separate gRPC call | +| **Daytona** | Separate HTTP POST | Sends SIGWINCH | +| **Kubernetes** | In-band WebSocket message (channel byte 4) | `{"Width": N, "Height": N}` | +| **Docker** | `POST /exec/{id}/resize?h=N&w=N` | Separate REST call | +| **Gitpod** | Separate `SetSize` RPC | Separate gRPC call | + +**Consensus**: Almost all platforms use a separate call for resize. Only Kubernetes does it in-band. Since resize is a control signal (not data), a separate mechanism is cleaner. + +### I/O Multiplexing + +I/O multiplexing is how platforms distinguish between stdout, stderr, and PTY data on a shared connection. + +| Platform | Method | Detail | +|----------|--------|--------| +| **Docker** | 8-byte binary header per frame | Byte 0 = stream type (0=stdin, 1=stdout, 2=stderr). When TTY=true, no mux (raw stream). | +| **Kubernetes** | 1-byte channel prefix per WebSocket message | 0=stdin, 1=stdout, 2=stderr, 3=error, 4=resize, 255=close | +| **E2B** | gRPC `oneof` in protobuf | `DataEvent.output` is `oneof { bytes stdout, bytes stderr, bytes pty }` | +| **OpenCode** | None | PTY is a unified stream. 
Commands capture stdout/stderr separately in response. | +| **Daytona** | None | PTY is unified. Commands return structured `{stdout, stderr}`. | + +**Key insight**: When a process runs with a PTY allocated, stdout and stderr are merged by the kernel into a single stream. Multiplexing only matters for non-PTY command execution. OpenCode and Daytona handle this by keeping PTY (unified stream) and commands (structured response) as separate APIs. + +### Reconnection + +| Platform | Method | Replays missed output? | +|----------|--------|----------------------| +| **E2B** | `Connect` RPC by PID or tag | No - only new events from reconnect point | +| **Daytona** | New WebSocket to same PTY session | No | +| **Kubernetes** | Not supported (connection = session) | N/A | +| **Docker** | Not supported (connection = session) | N/A | +| **OpenCode** | `GET /pty/{id}/connect` (WebSocket) | Unknown (not documented) | + +### Process Identification + +| Platform | ID Type | Notes | +|----------|---------|-------| +| **OpenCode** | String (`pty_N`) | Pattern `^pty.*` | +| **E2B** | PID (uint32) or tag (string) | Dual selector | +| **Daytona** | Session ID / PID | | +| **Docker** | Exec ID (string, server-generated) | | +| **Kubernetes** | Connection-scoped | No ID - the WebSocket IS the process | +| **Gitpod** | Alias (string) | Human-readable | + +### Scoping + +| Platform | PTY Scope | Command Scope | +|----------|-----------|---------------| +| **OpenCode** | Server-wide (global) | Session-specific (AI-mediated) | +| **E2B** | Sandbox-wide | Sandbox-wide | +| **Daytona** | Sandbox-wide | Sandbox-wide | +| **Docker** | Container-scoped | Container-scoped | +| **Kubernetes** | Pod-scoped | Pod-scoped | + +## Key Questions & Analysis + +### Q: Should PTY transport be WebSocket? 
+ +**Yes.** WebSocket is the right choice for PTY I/O: +- Bidirectional: client sends keystrokes, server sends terminal output +- Low latency: no HTTP request overhead per keystroke +- Persistent connection: terminal sessions are long-lived +- Industry consensus: OpenCode, Daytona, and Kubernetes all use WebSocket for PTY + +### Q: Should command transport be WebSocket or REST? + +**REST is sufficient for commands. WebSocket is not needed.** + +The distinction comes down to the nature of each operation: + +- **PTY**: Long-lived, bidirectional, interactive. User types, terminal responds. Needs WebSocket. +- **Commands**: Request-response. Client says "run `ls -la`", server runs it, returns stdout/stderr/exit_code. This is a natural REST operation. + +The "full duplex" question: commands don't need full duplex because: +1. Input is sent once at invocation (the command string) +2. Output is collected and returned when the process exits +3. There's no ongoing interactive input during execution + +For **streaming output** of long-running commands (e.g., `npm install`), there are two clean options: +1. **SSE**: Server-Sent Events for output streaming (output-only, which is all you need) +2. **PTY**: If the user needs to interact with the process (send ctrl+c, provide stdin), they should use a PTY instead + +This matches how OpenCode separates the two: commands are REST, PTYs are WebSocket. + +**Recommendation**: Keep commands as REST. If a command needs streaming output or interactive input, the user should create a PTY instead. This avoids building a second WebSocket protocol for a use case that PTYs already cover. + +### Q: Should resize be WebSocket in-band or separate POST? + +**Separate endpoint (PUT or POST).** + +Reasons: +- Resize is a control signal, not data. Mixing it into the data stream requires a framing protocol to distinguish resize messages from terminal input. 
+- OpenCode already defines `PUT /pty/{id}` with `size: {rows, cols}` - this is the existing spec. +- E2B, Daytona, Docker, and Gitpod all use separate calls. +- Only Kubernetes does in-band (because their channel-byte protocol already has a mux layer). +- A separate endpoint is simpler to implement, test, and debug. + +**Recommendation**: Use `PUT /pty/{id}` with `size` field (matching OpenCode spec). Alternatively, a dedicated `POST /pty/{id}/resize` if we want to keep update and resize semantically separate. + +### Q: What is I/O multiplexing? + +I/O multiplexing is the mechanism for distinguishing between different data streams (stdout, stderr, stdin, control signals) on a single connection. + +**When it matters**: Non-PTY command execution where stdout and stderr need to be kept separate. + +**When it doesn't matter**: PTY sessions. When a PTY is allocated, the kernel merges stdout and stderr into a single stream (the PTY master fd). There is only one output stream. This is why terminals show stdout and stderr interleaved - the PTY doesn't distinguish them. + +**For sandbox-agent**: Since PTYs are unified streams and commands use REST (separate stdout/stderr in the JSON response), we don't need a multiplexing protocol. The API design naturally separates the two cases. + +### Q: How should reconnect work? + +**Reconnect is an application-level concept, not just HTTP/WebSocket reconnection.** + +The distinction: + +- **HTTP/WebSocket reconnect**: The transport-level connection drops and is re-established. This is handled by the client library automatically (retry logic, exponential backoff). The server doesn't need to know. +- **Process reconnect**: The client disconnects from a running process but the process keeps running. Later, the client (or a different client) connects to the same process and starts receiving output again. + +**E2B's model**: Disconnecting a stream (via AbortController) leaves the process running. 
`Connect` RPC by PID or tag re-establishes the output stream. Missed output during disconnection is lost. This works because: +1. Processes are long-lived (servers, shells) +2. For terminals, the screen state can be recovered by the shell/application redrawing +3. For commands, if you care about all output, don't disconnect + +**Recommendation for sandbox-agent**: Reconnect should be supported at the application level: +1. `GET /pty/{id}/connect` (WebSocket) can be called multiple times for the same PTY +2. If the WebSocket drops, the PTY process keeps running +3. Client reconnects by opening a new WebSocket to the same endpoint +4. No output replay (too complex, rarely needed - terminal apps redraw on reconnect via SIGWINCH) +5. This is essentially what OpenCode's `/pty/{id}/connect` endpoint already implies + +This naturally leads to the **persistent process system** concept (see below). + +### Q: How are PTY events different from PTY transport? + +Two completely separate channels serving different purposes: + +**PTY Events** (via SSE on `/event` or `/sessions/{id}/events/sse`): +- Lifecycle notifications: `pty.created`, `pty.updated`, `pty.exited`, `pty.deleted` +- Lightweight JSON metadata (PTY id, status, exit code) +- Broadcast to all subscribers +- Used by UIs to update PTY lists, show status indicators, handle cleanup + +**PTY Transport** (via WebSocket on `/pty/{id}/connect`): +- Raw terminal I/O: binary input/output bytes +- High-frequency, high-bandwidth +- Point-to-point (one client connected to one PTY) +- Used by terminal emulators (xterm.js) to render the terminal + +**Analogy**: Events are like email notifications ("a new terminal was opened"). Transport is like the phone call (the actual terminal session). + +### Q: How are PTY and commands different in OpenCode? 
They serve fundamentally different purposes: + +**PTY (`/pty/*`)** - Direct execution environment: +- Server-scoped (not tied to any AI session) +- Creates a real terminal process +- User interacts directly via WebSocket +- Not part of the AI conversation +- Think: "the terminal panel in VS Code" + +**Commands (`/session/{sessionID}/command`, `/session/{sessionID}/shell`)** - AI context injection: +- Session-scoped (tied to an AI session) +- The command is executed by the server, and its output is recorded **as an `AssistantMessage`** (the AI does not run it) +- Creates an `AssistantMessage` in the session's conversation history +- Output becomes part of the AI's context +- Think: "injecting command output into the AI's context as if it came from a tool call" + +**Why commands are session-specific**: Because their purpose is AI context injection, not general-purpose execution. When you call `POST /session/{id}/command`, the server: +1. Creates an assistant message in the session +2. Runs the command +3. Captures output as message parts +4. Emits `message.part.updated` events +5. The AI can see this output in subsequent turns + +This is how the AI "uses terminal tools" - the command infrastructure provides the bridge between the AI session and system execution. + +### Q: Should scoping be system-wide? + +**Yes, for both PTY and commands.** + +Current OpenCode behavior: +- PTYs: Already server-wide (global) +- Commands: Session-scoped (for AI context injection) + +**For sandbox-agent**, since we're the orchestration layer (not the AI): +- **PTYs**: System-wide. Any client should be able to list, connect to, or manage any PTY. +- **Commands/processes**: System-wide. Process execution is a system primitive, not an AI primitive. If a caller wants to associate a process with a session, they can do so at their layer. + +The session-scoping of commands in OpenCode is an OpenCode-specific concern (AI context injection). Sandbox-agent should provide the lower-level primitive (system-wide process execution) and let the OpenCode compat layer handle the session association. 
+ +## Persistent Process System + +### The Concept + +A persistent process system means: +1. **Spawn** a process (PTY or command) via API +2. Process runs independently of any client connection +3. **Connect/disconnect** to the process I/O at will +4. Process continues running through disconnections +5. **Query** process status, list running processes +6. **Kill/signal** processes explicitly + +This is distinct from the typical "connection = process lifetime" model (Kubernetes, Docker exec) where closing the connection kills the process. + +### How E2B Does It + +E2B's `Process` service is the best reference implementation: + +``` +Start(cmd, pty?) → stream of events (output) +Connect(pid/tag) → stream of events (reconnect) +SendInput(pid, data) → ok +Update(pid, size) → ok (resize) +SendSignal(pid, signal) → ok +List() → running processes +``` + +Key design choices: +- **Unified service**: PTY and command are the same service, differentiated by the `pty` field in `StartRequest` +- **Process outlives connection**: Disconnecting the output stream (aborting the `Start`/`Connect` RPC) does NOT kill the process +- **Explicit termination**: Must call `SendSignal(SIGKILL)` to stop a process +- **Tag-based selection**: Processes can be tagged at creation for later lookup without knowing the PID + +### Recommendation for Sandbox-Agent + +Sandbox-agent should implement a **persistent process manager** that: + +1. **Is system-wide** (not session-scoped) +2. **Supports both PTY and non-PTY modes** +3. **Decouples process lifetime from connection lifetime** +4. 
**Exposes via both REST (lifecycle) and WebSocket (I/O)** + +#### Proposed API Surface + +**Process Lifecycle (REST)**: +| Method | Endpoint | Description | +|--------|----------|-------------| +| `POST` | `/v1/processes` | Create/spawn a process (PTY or command) | +| `GET` | `/v1/processes` | List all processes | +| `GET` | `/v1/processes/{id}` | Get process info (status, pid, exit code) | +| `DELETE` | `/v1/processes/{id}` | Kill process (SIGTERM, then SIGKILL) | +| `POST` | `/v1/processes/{id}/signal` | Send signal (SIGTERM, SIGKILL, SIGINT, etc.) | +| `POST` | `/v1/processes/{id}/resize` | Resize PTY (rows, cols) | +| `POST` | `/v1/processes/{id}/input` | Send stdin/pty input (REST fallback) | + +**Process I/O (WebSocket)**: +| Method | Endpoint | Description | +|--------|----------|-------------| +| `GET` | `/v1/processes/{id}/connect` | WebSocket for bidirectional I/O | + +**Process Events (SSE)**: +| Event | Description | +|-------|-------------| +| `process.created` | Process spawned | +| `process.updated` | Process metadata changed | +| `process.exited` | Process terminated (includes exit code) | +| `process.deleted` | Process record removed | + +#### Create Request + +```json +{ + "command": "bash", + "args": ["-i", "-l"], + "cwd": "/workspace", + "env": {"TERM": "xterm-256color"}, + "pty": { // Optional - if present, allocate PTY + "rows": 24, + "cols": 80 + }, + "tag": "main-terminal", // Optional - for lookup by name + "label": "Terminal 1" // Optional - display name +} +``` + +#### Process Object + +```json +{ + "id": "proc_abc123", + "tag": "main-terminal", + "label": "Terminal 1", + "command": "bash", + "args": ["-i", "-l"], + "cwd": "/workspace", + "pid": 12345, + "pty": true, + "status": "running", // "running" | "exited" + "exit_code": null, // Set when exited + "created_at": "2025-01-15T...", + "exited_at": null +} +``` + +#### OpenCode Compatibility Layer + +The OpenCode compat layer maps to this system: + +| OpenCode Endpoint | Maps To | 
+|-------------------|---------| +| `POST /pty` | `POST /v1/processes` (with `pty` field) | +| `GET /pty` | `GET /v1/processes?pty=true` | +| `GET /pty/{id}` | `GET /v1/processes/{id}` | +| `PUT /pty/{id}` | `POST /v1/processes/{id}/resize` + metadata update | +| `DELETE /pty/{id}` | `DELETE /v1/processes/{id}` | +| `GET /pty/{id}/connect` | `GET /v1/processes/{id}/connect` | +| `POST /session/{id}/command` | Create process + capture output into session | +| `POST /session/{id}/shell` | Create process (shell mode) + capture output into session | + +### Open Questions + +1. **Output buffering for reconnect**: Should we buffer recent output (e.g., last 64KB) so reconnecting clients get some history? E2B doesn't do this, but it would improve UX for flaky connections. + +2. **Process limits**: Should there be a max number of concurrent processes? E2B doesn't expose one, but sandbox environments have limited resources. + +3. **Auto-cleanup**: Should processes be auto-cleaned after exiting? Options: + - Keep forever until explicitly deleted + - Auto-delete after N seconds/minutes + - Keep metadata but release resources + +4. **Input via REST vs WebSocket-only**: The REST `POST /processes/{id}/input` endpoint is useful for one-shot input (e.g., "send ctrl+c") without establishing a WebSocket. E2B has both `SendInput` (unary) and `StreamInput` (streaming) for this reason. + +5. **Multiple WebSocket connections to same process**: Should we allow multiple clients to connect to the same process simultaneously? (Pair programming, monitoring). E2B supports this via multiple `Connect` calls. + +## User-Initiated Command Injection ("Run command, give AI context") + +A common pattern across agents: the user (or frontend) runs a command and the output is injected into the AI's conversation context. This is distinct from the agent running a command via its own tools. + +| Agent | Feature | Mechanism | Protocol-level? 
| +|-------|---------|-----------|----------------| +| **Claude Code** | `!command` prefix in TUI | CLI runs command locally, injects output as user message | No - client-side hack, not in API schema | +| **Codex** | `user_shell` source | `ExecCommandSource` enum distinguishes `agent` vs `user_shell` vs `unified_exec_*` | Yes - first-class protocol event | +| **OpenCode** | `/session/{id}/command` | HTTP endpoint runs command, records result as `AssistantMessage` | Yes - HTTP API | +| **Amp** | N/A | Not supported | N/A | + +**Design implication for sandbox-agent**: The process system should support an optional `session_id` field when creating a process. If provided, the process output is associated with that session so the agent can see it. If not provided, the process runs independently (like a PTY). This unifies: +- User interactive terminals (no session association) +- User-initiated commands for AI context (session association) +- Agent-initiated background processes (session association) + +## Sources + +- [E2B Process Proto](https://github.com/e2b-dev/E2B) - `process.proto` gRPC service definition +- [E2B JS SDK](https://github.com/e2b-dev/E2B/tree/main/packages/js-sdk) - `commands/pty.ts`, `commands/index.ts` +- [Daytona SDK](https://www.daytona.io/docs/en/typescript-sdk/process/) - REST + WebSocket PTY API +- [Kubernetes RemoteCommand](https://github.com/kubernetes/apimachinery/blob/master/pkg/util/remotecommand/constants.go) - WebSocket subprotocol +- [Docker Engine API](https://docker-docs.uclv.cu/engine/api/v1.21/) - Exec API with stream multiplexing +- [Fly.io Machines API](https://fly.io/docs/machines/api/) - REST exec with 60s limit +- [Gitpod terminal.proto](https://codeberg.org/kanishka-reading-list/gitpod/src/branch/main/components/supervisor-api/terminal.proto) - gRPC terminal service +- [OpenCode OpenAPI Spec](https://github.com/opencode-ai/opencode) - PTY and session command endpoints diff --git a/sdks/typescript/src/generated/openapi.ts 
b/sdks/typescript/src/generated/openapi.ts index 1e3239e..59eb12c 100644 --- a/sdks/typescript/src/generated/openapi.ts +++ b/sdks/typescript/src/generated/openapi.ts @@ -87,6 +87,8 @@ export interface components { }; AgentInfo: { capabilities: components["schemas"]["AgentCapabilities"]; + /** @description Whether the agent's required provider credentials are available */ + credentialsAvailable: boolean; id: string; installed: boolean; path?: string | null; diff --git a/server/packages/agent-credentials/src/lib.rs b/server/packages/agent-credentials/src/lib.rs index b456a2b..b2c2225 100644 --- a/server/packages/agent-credentials/src/lib.rs +++ b/server/packages/agent-credentials/src/lib.rs @@ -63,7 +63,9 @@ pub fn extract_claude_credentials( ]; for path in config_paths { - let data = read_json_file(&path)?; + let Some(data) = read_json_file(&path) else { + continue; + }; for key_path in &key_paths { if let Some(key) = read_string_field(&data, key_path) { if key.starts_with("sk-ant-") { diff --git a/server/packages/sandbox-agent/src/opencode_compat.rs b/server/packages/sandbox-agent/src/opencode_compat.rs index 14272bd..8e9ac00 100644 --- a/server/packages/sandbox-agent/src/opencode_compat.rs +++ b/server/packages/sandbox-agent/src/opencode_compat.rs @@ -21,12 +21,16 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use tokio::sync::{broadcast, Mutex}; use tokio::time::interval; +use tracing::warn; use utoipa::{IntoParams, OpenApi, ToSchema}; use crate::router::{ is_question_tool_action, AgentModelInfo, AppState, CreateSessionRequest, PermissionReply, }; use sandbox_agent_agent_management::agents::AgentId; +use sandbox_agent_agent_management::credentials::{ + extract_all_credentials, CredentialExtractionOptions, ExtractedCredentials, +}; use sandbox_agent_error::SandboxError; use sandbox_agent_universal_agent_schema::{ ContentPart, FileAction, ItemDeltaData, ItemEventData, ItemKind, ItemRole, ItemStatus, @@ -235,6 +239,8 @@ struct 
OpenCodeModelCache { group_names: HashMap, default_group: String, default_model: String, + /// Group IDs that have valid credentials available + connected: Vec, } pub struct OpenCodeState { @@ -639,6 +645,21 @@ async fn opencode_model_cache(state: &OpenCodeAppState) -> OpenCodeModelCache { } async fn build_opencode_model_cache(state: &OpenCodeAppState) -> OpenCodeModelCache { + // Check credentials upfront + let credentials = match tokio::task::spawn_blocking(|| { + extract_all_credentials(&CredentialExtractionOptions::new()) + }) + .await + { + Ok(creds) => creds, + Err(err) => { + warn!("Failed to extract credentials for model cache: {err}"); + ExtractedCredentials::default() + } + }; + let has_anthropic = credentials.anthropic.is_some(); + let has_openai = credentials.openai.is_some(); + let mut entries = Vec::new(); let mut model_lookup = HashMap::new(); let mut ambiguous_models = HashSet::new(); @@ -737,6 +758,28 @@ async fn build_opencode_model_cache(state: &OpenCodeAppState) -> OpenCodeModelCa } } + // Build connected list based on credential availability + let mut connected = Vec::new(); + for group_id in group_names.keys() { + let is_connected = match group_agents.get(group_id) { + Some(AgentId::Claude) | Some(AgentId::Amp) => has_anthropic, + Some(AgentId::Codex) => has_openai, + Some(AgentId::Opencode) => { + // Check the specific provider for opencode groups (e.g., "opencode:anthropic") + match opencode_group_provider(group_id) { + Some("anthropic") => has_anthropic, + Some("openai") => has_openai, + _ => has_anthropic || has_openai, + } + } + Some(AgentId::Mock) => true, + None => false, + }; + if is_connected { + connected.push(group_id.clone()); + } + } + OpenCodeModelCache { entries, model_lookup, @@ -745,6 +788,7 @@ async fn build_opencode_model_cache(state: &OpenCodeAppState) -> OpenCodeModelCa group_names, default_group, default_model, + connected, } } @@ -4005,7 +4049,6 @@ async fn oc_provider_list(State(state): State>) -> impl In } let mut 
providers = Vec::new(); let mut defaults = serde_json::Map::new(); - let mut connected = Vec::new(); for (group_id, entries) in grouped { let mut models = serde_json::Map::new(); for entry in entries { @@ -4025,12 +4068,12 @@ async fn oc_provider_list(State(state): State>) -> impl In if let Some(default_model) = cache.group_defaults.get(&group_id) { defaults.insert(group_id.clone(), Value::String(default_model.clone())); } - connected.push(group_id); } + // Use the connected list from cache (based on credential availability) let providers = json!({ "all": providers, "default": Value::Object(defaults), - "connected": connected + "connected": cache.connected }); (StatusCode::OK, Json(providers)) } diff --git a/server/packages/sandbox-agent/src/router.rs b/server/packages/sandbox-agent/src/router.rs index f4e4b91..345f64a 100644 --- a/server/packages/sandbox-agent/src/router.rs +++ b/server/packages/sandbox-agent/src/router.rs @@ -55,7 +55,9 @@ static USER_MESSAGE_COUNTER: AtomicU64 = AtomicU64::new(1); const ANTHROPIC_MODELS_URL: &str = "https://api.anthropic.com/v1/models?beta=true"; const ANTHROPIC_VERSION: &str = "2023-06-01"; -fn claude_oauth_fallback_models() -> AgentModelsResponse { +fn claude_fallback_models() -> AgentModelsResponse { + // Claude Code accepts model aliases: default, sonnet, opus, haiku + // These work for both API key and OAuth users AgentModelsResponse { models: vec![ AgentModelInfo { @@ -64,6 +66,12 @@ fn claude_oauth_fallback_models() -> AgentModelsResponse { variants: None, default_variant: None, }, + AgentModelInfo { + id: "sonnet".to_string(), + name: Some("Sonnet".to_string()), + variants: None, + default_variant: None, + }, AgentModelInfo { id: "opus".to_string(), name: Some("Opus".to_string()), @@ -1824,8 +1832,14 @@ impl SessionManager { agent: AgentId, ) -> Result { match agent { - AgentId::Claude => self.fetch_claude_models().await, - AgentId::Codex => self.fetch_codex_models().await, + AgentId::Claude => match 
self.fetch_claude_models().await { + Ok(response) if !response.models.is_empty() => Ok(response), + _ => Ok(claude_fallback_models()), + }, + AgentId::Codex => match self.fetch_codex_models().await { + Ok(response) if !response.models.is_empty() => Ok(response), + _ => Ok(codex_fallback_models()), + }, AgentId::Opencode => match self.fetch_opencode_models().await { Ok(models) => Ok(models), Err(_) => Ok(AgentModelsResponse { @@ -3480,7 +3494,7 @@ impl SessionManager { status = %status, "Anthropic model list rejected OAuth credentials; using Claude OAuth fallback models" ); - return Ok(claude_oauth_fallback_models()); + return Ok(claude_fallback_models()); } return Err(SandboxError::StreamError { message: format!("Anthropic models request failed {status}: {body}"), @@ -3540,7 +3554,7 @@ impl SessionManager { tracing::warn!( "Anthropic model list was empty for OAuth credentials; using Claude OAuth fallback models" ); - return Ok(claude_oauth_fallback_models()); + return Ok(claude_fallback_models()); } Ok(AgentModelsResponse { @@ -4058,6 +4072,8 @@ pub struct ServerStatusInfo { pub struct AgentInfo { pub id: String, pub installed: bool, + /// Whether the agent's required provider credentials are available + pub credentials_available: bool, #[serde(default, skip_serializing_if = "Option::is_none")] pub version: Option, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -4325,6 +4341,10 @@ async fn list_agents( let agents = tokio::task::spawn_blocking(move || { + let credentials = extract_all_credentials(&CredentialExtractionOptions::new()); + let has_anthropic = credentials.anthropic.is_some(); + let has_openai = credentials.openai.is_some(); + all_agents() .into_iter() .map(|agent_id| { @@ -4333,6 +4353,13 @@ async fn list_agents( let path = manager.resolve_binary(agent_id).ok(); let capabilities = agent_capabilities_for(agent_id); + let credentials_available = match agent_id { + AgentId::Claude | AgentId::Amp => has_anthropic, + AgentId::Codex => 
has_openai, + AgentId::Opencode => has_anthropic || has_openai, + AgentId::Mock => true, + }; + // Add server_status for agents with shared processes let server_status = if capabilities.shared_process { @@ -4352,6 +4379,7 @@ async fn list_agents( AgentInfo { id: agent_id.as_str().to_string(), installed, + credentials_available, version, path: path.map(|path| path.to_string_lossy().to_string()), capabilities, @@ -4873,6 +4901,22 @@ fn mock_models_response() -> AgentModelsResponse { } } +fn codex_fallback_models() -> AgentModelsResponse { + let models = ["gpt-4o", "o3", "o4-mini"] + .into_iter() + .map(|id| AgentModelInfo { + id: id.to_string(), + name: None, + variants: Some(codex_variants()), + default_variant: Some("medium".to_string()), + }) + .collect(); + AgentModelsResponse { + models, + default_model: Some("gpt-4o".to_string()), + } +} + fn amp_variants() -> Vec { vec!["medium", "high", "xhigh"] .into_iter()