Auto-retry on transient provider errors (overloaded, rate limit, 5xx)

- Add retry logic with exponential backoff (2s, 4s, 8s) in AgentSession
- Disable Anthropic SDK built-in retries (maxRetries: 0) to allow app-level handling
- TUI shows retry status with Escape to cancel
- RPC mode: add set_auto_retry, abort_retry commands and auto_retry_start/end events
- Configurable via settings.json: retry.enabled, retry.maxRetries, retry.baseDelayMs
- Exclude context overflow errors from retry (handled by compaction)

fixes #157
This commit is contained in:
Mario Zechner 2025-12-10 23:36:46 +01:00
parent 79f5c6d22e
commit bb445d24f1
11 changed files with 379 additions and 3 deletions

View file

@ -2,6 +2,10 @@
## [Unreleased]
### Changed
- **Anthropic SDK retries disabled**: Set `maxRetries: 0` on Anthropic client to allow application-level retry handling. The SDK's built-in retries were interfering with coding-agent's retry logic. ([#157](https://github.com/badlogic/pi-mono/issues/157))
## [0.18.1] - 2025-12-10
### Added

View file

@ -295,6 +295,7 @@ function createClient(
baseURL: model.baseUrl,
defaultHeaders,
dangerouslyAllowBrowser: true,
maxRetries: 0, // Disable SDK retries, handled by coding-agent
});
return { client, isOAuthToken: true };
@ -311,6 +312,7 @@ function createClient(
baseURL: model.baseUrl,
dangerouslyAllowBrowser: true,
defaultHeaders,
maxRetries: 0, // Disable SDK retries, handled by coding-agent
});
return { client, isOAuthToken: false };

View file

@ -4,6 +4,8 @@
### Added
- **Auto-retry on transient errors**: Automatically retries requests when providers return overloaded, rate limit, or server errors (429, 500, 502, 503, 504). Uses exponential backoff (2s, 4s, 8s). Shows retry status in TUI with option to cancel via Escape. Configurable in `settings.json` via `retry.enabled`, `retry.maxRetries`, `retry.baseDelayMs`. RPC mode emits `auto_retry_start` and `auto_retry_end` events. ([#157](https://github.com/badlogic/pi-mono/issues/157))
- **HTML export line numbers**: Read tool calls in HTML exports now display line number ranges (e.g., `file.txt:10-20`) when offset/limit parameters are used, matching the TUI display format. Line numbers appear in yellow color for better visibility. ([#166](https://github.com/badlogic/pi-mono/issues/166))
### Fixed

View file

@ -524,13 +524,23 @@ See [Hooks Documentation](docs/hooks.md) for full API reference.
"shellPath": "C:\\path\\to\\bash.exe",
"queueMode": "one-at-a-time",
"compaction": {
"enabled": false,
"enabled": true,
"reserveTokens": 16384,
"keepRecentTokens": 20000
},
"retry": {
"enabled": true,
"maxRetries": 3,
"baseDelayMs": 2000
}
}
```
**Retry settings:**
- `enabled`: Auto-retry on transient errors (overloaded, rate limit, 5xx). Default: `true`
- `maxRetries`: Maximum retry attempts. Default: `3`
- `baseDelayMs`: Base delay in milliseconds for exponential backoff; the delay doubles on each attempt (2s, 4s, 8s with the default). Default: `2000`
---
## CLI Reference

View file

@ -303,6 +303,34 @@ Response:
{"type": "response", "command": "set_auto_compaction", "success": true}
```
### Retry
#### set_auto_retry
Enable or disable automatic retry on transient errors (overloaded, rate limit, 5xx).
```json
{"type": "set_auto_retry", "enabled": true}
```
Response:
```json
{"type": "response", "command": "set_auto_retry", "success": true}
```
#### abort_retry
Abort an in-progress retry (cancel the delay and stop retrying).
```json
{"type": "abort_retry"}
```
Response:
```json
{"type": "response", "command": "abort_retry", "success": true}
```
### Bash
#### bash
@ -528,6 +556,8 @@ Events are streamed to stdout as JSON lines during agent operation. Events do NO
| `tool_execution_end` | Tool completes |
| `auto_compaction_start` | Auto-compaction begins |
| `auto_compaction_end` | Auto-compaction completes |
| `auto_retry_start` | Auto-retry begins (after transient error) |
| `auto_retry_end` | Auto-retry completes (success, final failure, or cancellation) |
### agent_start
@ -664,6 +694,38 @@ Emitted when automatic compaction runs (when context is nearly full).
If compaction was aborted, `result` is `null` and `aborted` is `true`.
### auto_retry_start / auto_retry_end
Emitted when automatic retry is triggered after a transient error (overloaded, rate limit, 5xx).
```json
{
"type": "auto_retry_start",
"attempt": 1,
"maxAttempts": 3,
"delayMs": 2000,
"errorMessage": "529 {\"type\":\"error\",\"error\":{\"type\":\"overloaded_error\",\"message\":\"Overloaded\"}}"
}
```
```json
{
"type": "auto_retry_end",
"success": true,
"attempt": 2
}
```
On final failure (max retries exceeded):
```json
{
"type": "auto_retry_end",
"success": false,
"attempt": 3,
"finalError": "529 overloaded_error: Overloaded"
}
```
## Error Handling
Failed commands return a response with `success: false`:

View file

@ -31,7 +31,9 @@ import { expandSlashCommand, type FileSlashCommand } from "./slash-commands.js";
export type AgentSessionEvent =
| AgentEvent
| { type: "auto_compaction_start"; reason: "threshold" | "overflow" }
| { type: "auto_compaction_end"; result: CompactionResult | null; aborted: boolean; willRetry: boolean };
| { type: "auto_compaction_end"; result: CompactionResult | null; aborted: boolean; willRetry: boolean }
| { type: "auto_retry_start"; attempt: number; maxAttempts: number; delayMs: number; errorMessage: string }
| { type: "auto_retry_end"; success: boolean; attempt: number; finalError?: string };
/** Listener function for agent session events */
export type AgentSessionEventListener = (event: AgentSessionEvent) => void;
@ -116,6 +118,12 @@ export class AgentSession {
private _compactionAbortController: AbortController | null = null;
private _autoCompactionAbortController: AbortController | null = null;
// Retry state
private _retryAbortController: AbortController | null = null;
private _retryAttempt = 0;
private _retryPromise: Promise<void> | null = null;
private _retryResolve: (() => void) | null = null;
// Bash execution state
private _bashAbortController: AbortController | null = null;
private _pendingBashMessages: BashExecutionMessage[] = [];
@ -184,14 +192,40 @@ export class AgentSession {
}
}
// Check auto-compaction after agent completes
// Check auto-retry and auto-compaction after agent completes
if (event.type === "agent_end" && this._lastAssistantMessage) {
const msg = this._lastAssistantMessage;
this._lastAssistantMessage = null;
// Check for retryable errors first (overloaded, rate limit, server errors)
if (this._isRetryableError(msg)) {
const didRetry = await this._handleRetryableError(msg);
if (didRetry) return; // Retry was initiated, don't proceed to compaction
} else if (this._retryAttempt > 0) {
// Previous retry succeeded - emit success event and reset counter
this._emit({
type: "auto_retry_end",
success: true,
attempt: this._retryAttempt,
});
this._retryAttempt = 0;
// Resolve the retry promise so waitForRetry() completes
this._resolveRetry();
}
await this._handleAgentEndCompaction(msg);
}
};
/**
 * Settle the pending retry promise, if any, and clear the stored
 * resolver and promise. Does nothing when no resolver is stored.
 */
private _resolveRetry(): void {
	const settle = this._retryResolve;
	if (!settle) return;

	settle();
	this._retryResolve = null;
	this._retryPromise = null;
}
/** Extract text content from a message */
private _getUserMessageText(message: Message): string {
if (message.role !== "user") return "";
@ -379,6 +413,7 @@ export class AgentSession {
const expandedText = expandCommands ? expandSlashCommand(text, [...this._fileCommands]) : text;
await this.agent.prompt(expandedText, options?.attachments);
await this.waitForRetry();
}
/**
@ -419,6 +454,7 @@ export class AgentSession {
* Abort current operation and wait for agent to become idle.
*/
async abort(): Promise<void> {
// Cancel any pending auto-retry first (aborts the backoff wait and resets retry state)
this.abortRetry();
// Then abort the agent itself and wait until it reports idle
this.agent.abort();
await this.agent.waitForIdle();
}
@ -784,6 +820,159 @@ export class AgentSession {
return this.settingsManager.getCompactionEnabled();
}
// =========================================================================
// Auto-Retry
// =========================================================================
/**
 * Check if an error is retryable (overloaded, rate limit, server errors).
 * Context overflow errors are NOT retryable (handled by compaction instead).
 *
 * @param message Assistant message to inspect; only messages with
 *   `stopReason === "error"` and a non-empty `errorMessage` can be retryable.
 * @returns true when the error text looks like a transient provider failure.
 */
private _isRetryableError(message: AssistantMessage): boolean {
	if (message.stopReason !== "error" || !message.errorMessage) return false;

	// Context overflow is handled by compaction, not retry
	const contextWindow = this.model?.contextWindow ?? 0;
	if (isContextOverflow(message, contextWindow)) return false;

	const err = message.errorMessage;
	// Match: overloaded_error, rate limit, 429, 500, 502, 503, 504, service unavailable.
	// Status codes must not be adjacent to other digits, so that numbers such as
	// "15000" or ids containing "429" do not trigger a spurious retry.
	return /overloaded|rate.?limit|too many requests|(?<!\d)(?:429|500|502|503|504)(?!\d)|service.?unavailable|server error|internal error/i.test(
		err,
	);
}
/**
 * Handle retryable errors with exponential backoff.
 *
 * Emits `auto_retry_start` before waiting. Emits `auto_retry_end` with
 * `success: false` when the retry budget is exhausted, when the wait is
 * cancelled, or when retry was disabled while a retry sequence was in progress.
 * In each of those cases the retry promise is resolved so `waitForRetry()` completes.
 *
 * @param message The failed assistant message; its `errorMessage` is forwarded in events.
 * @returns true if retry was initiated, false if disabled, cancelled, or max retries exceeded
 */
private async _handleRetryableError(message: AssistantMessage): Promise<boolean> {
	const settings = this.settingsManager.getRetrySettings();
	if (!settings.enabled) {
		if (this._retryAttempt > 0) {
			// Retry was disabled in the middle of a retry sequence: close the sequence
			// so listeners get an end event and waitForRetry() does not hang.
			const abandonedAttempt = this._retryAttempt;
			this._retryAttempt = 0;
			this._emit({
				type: "auto_retry_end",
				success: false,
				attempt: abandonedAttempt,
				finalError: message.errorMessage,
			});
			this._resolveRetry();
		}
		return false;
	}

	this._retryAttempt++;
	// Snapshot the attempt number: abortRetry() resets _retryAttempt synchronously,
	// before the rejected sleep below resumes this method.
	const attempt = this._retryAttempt;

	// Create retry promise on first attempt so waitForRetry() can await it
	if (attempt === 1 && !this._retryPromise) {
		this._retryPromise = new Promise((resolve) => {
			this._retryResolve = resolve;
		});
	}

	if (attempt > settings.maxRetries) {
		// Max retries exceeded, emit final failure and reset
		this._emit({
			type: "auto_retry_end",
			success: false,
			attempt: attempt - 1,
			finalError: message.errorMessage,
		});
		this._retryAttempt = 0;
		this._resolveRetry(); // Resolve so waitForRetry() completes
		return false;
	}

	// Delay doubles with every attempt: base, 2x base, 4x base, ...
	const delayMs = settings.baseDelayMs * 2 ** (attempt - 1);

	this._emit({
		type: "auto_retry_start",
		attempt,
		maxAttempts: settings.maxRetries,
		delayMs,
		errorMessage: message.errorMessage || "Unknown error",
	});

	// Remove error message from agent state (keep in session for history)
	const messages = this.agent.state.messages;
	if (messages.length > 0 && messages[messages.length - 1].role === "assistant") {
		this.agent.replaceMessages(messages.slice(0, -1));
	}

	// Wait with exponential backoff (abortable)
	const abortController = new AbortController();
	this._retryAbortController = abortController;
	try {
		await this._sleep(delayMs, abortController.signal);
	} catch {
		// Aborted during sleep - emit end event so UI can clean up
		this._retryAttempt = 0;
		this._retryAbortController = null;
		this._emit({
			type: "auto_retry_end",
			success: false,
			attempt,
			finalError: "Retry cancelled",
		});
		this._resolveRetry();
		return false;
	}
	this._retryAbortController = null;

	// Retry via continue() - use setTimeout to break out of event handler chain
	setTimeout(() => {
		this.agent.continue().catch(() => {
			// Retry failed - will be caught by next agent_end
		});
	}, 0);

	return true;
}
/**
 * Sleep helper that respects abort signal.
 *
 * @param ms Time to wait in milliseconds.
 * @param signal Optional signal; aborting it cancels the timer.
 * @returns A promise that resolves after `ms`, or rejects with `Error("Aborted")`
 *   if the signal is already aborted or aborts during the wait.
 */
private _sleep(ms: number, signal?: AbortSignal): Promise<void> {
	return new Promise((resolve, reject) => {
		if (signal?.aborted) {
			reject(new Error("Aborted"));
			return;
		}

		const onAbort = () => {
			clearTimeout(timeout);
			reject(new Error("Aborted"));
		};
		const timeout = setTimeout(() => {
			// Timer finished normally: detach the listener so it does not linger on the signal
			signal?.removeEventListener("abort", onAbort);
			resolve();
		}, ms);
		signal?.addEventListener("abort", onAbort, { once: true });
	});
}
/**
 * Cancel an in-progress retry: abort the backoff wait (if one is active),
 * reset the attempt counter, and resolve the pending retry promise.
 */
abortRetry(): void {
	const controller = this._retryAbortController;
	if (controller) {
		controller.abort();
	}
	this._retryAttempt = 0;
	this._resolveRetry();
}
/**
 * Block until the current retry sequence (if any) has finished.
 * When no retry promise is pending there is nothing to await.
 */
private async waitForRetry(): Promise<void> {
	const pending = this._retryPromise;
	if (!pending) return;
	await pending;
}
/** True while a retry promise is pending, i.e. a retry sequence has not yet finished. */
get isRetrying(): boolean {
	const pending = this._retryPromise;
	return pending !== null;
}
/** Current auto-retry setting, as reported by the settings manager. */
get autoRetryEnabled(): boolean {
	const enabled = this.settingsManager.getRetryEnabled();
	return enabled;
}
/**
 * Turn auto-retry on or off by forwarding the flag to the settings manager.
 *
 * @param enabled New value for the auto-retry setting.
 */
setAutoRetryEnabled(enabled: boolean): void {
	const { settingsManager } = this;
	settingsManager.setRetryEnabled(enabled);
}
// =========================================================================
// Bash Execution
// =========================================================================

View file

@ -8,6 +8,12 @@ export interface CompactionSettings {
keepRecentTokens?: number; // default: 20000
}
/** Optional auto-retry configuration stored under the `retry` key of `Settings`. */
export interface RetrySettings {
/** Whether auto-retry is active. */
enabled?: boolean; // default: true
/** Maximum number of retry attempts. */
maxRetries?: number; // default: 3
/** Base backoff delay in milliseconds. */
baseDelayMs?: number; // default: 2000 (exponential backoff: 2s, 4s, 8s)
}
export interface Settings {
lastChangelogVersion?: string;
defaultProvider?: string;
@ -16,6 +22,7 @@ export interface Settings {
queueMode?: "all" | "one-at-a-time";
theme?: string;
compaction?: CompactionSettings;
retry?: RetrySettings;
hideThinkingBlock?: boolean;
shellPath?: string; // Custom shell path (e.g., for Cygwin users on Windows)
collapseChangelog?: boolean; // Show condensed changelog after update (use /changelog for full)
@ -149,6 +156,26 @@ export class SettingsManager {
};
}
/** Returns the stored `retry.enabled` flag, falling back to `true` when it is not set. */
getRetryEnabled(): boolean {
	const stored = this.settings.retry?.enabled;
	return stored ?? true;
}
/**
 * Store the `retry.enabled` flag, creating the `retry` section when it is
 * missing, then save the settings.
 *
 * @param enabled New value for `retry.enabled`.
 */
setRetryEnabled(enabled: boolean): void {
	// Reuse the existing section when present; otherwise install an empty one
	const retry = this.settings.retry || (this.settings.retry = {});
	retry.enabled = enabled;
	this.save();
}
/**
 * Returns the retry settings with defaults filled in:
 * `maxRetries` falls back to 3 and `baseDelayMs` to 2000.
 */
getRetrySettings(): { enabled: boolean; maxRetries: number; baseDelayMs: number } {
	const enabled = this.getRetryEnabled();
	const maxRetries = this.settings.retry?.maxRetries ?? 3;
	const baseDelayMs = this.settings.retry?.baseDelayMs ?? 2000;
	return { enabled, maxRetries, baseDelayMs };
}
getHideThinkingBlock(): boolean {
return this.settings.hideThinkingBlock ?? false;
}

View file

@ -102,6 +102,10 @@ export class InteractiveMode {
private autoCompactionLoader: Loader | null = null;
private autoCompactionEscapeHandler?: () => void;
// Auto-retry state
private retryLoader: Loader | null = null;
private retryEscapeHandler?: () => void;
// Hook UI state
private hookSelector: HookSelectorComponent | null = null;
private hookInput: HookInputComponent | null = null;
@ -806,6 +810,46 @@ export class InteractiveMode {
this.ui.requestRender();
break;
}
case "auto_retry_start": {
// Set up escape to abort retry
this.retryEscapeHandler = this.editor.onEscape;
this.editor.onEscape = () => {
this.session.abortRetry();
};
// Show retry indicator
this.statusContainer.clear();
const delaySeconds = Math.round(event.delayMs / 1000);
this.retryLoader = new Loader(
this.ui,
(spinner) => theme.fg("warning", spinner),
(text) => theme.fg("muted", text),
`Retrying (${event.attempt}/${event.maxAttempts}) in ${delaySeconds}s... (esc to cancel)`,
);
this.statusContainer.addChild(this.retryLoader);
this.ui.requestRender();
break;
}
case "auto_retry_end": {
// Restore escape handler
if (this.retryEscapeHandler) {
this.editor.onEscape = this.retryEscapeHandler;
this.retryEscapeHandler = undefined;
}
// Stop loader
if (this.retryLoader) {
this.retryLoader.stop();
this.retryLoader = null;
this.statusContainer.clear();
}
// Show error only on final failure (success shows normal response)
if (!event.success) {
this.showError(`Retry failed after ${event.attempt} attempts: ${event.finalError || "Unknown error"}`);
}
this.ui.requestRender();
break;
}
}
}

View file

@ -264,6 +264,20 @@ export class RpcClient {
await this.send({ type: "set_auto_compaction", enabled });
}
/**
 * Send a `set_auto_retry` command carrying the given flag.
 *
 * @param enabled Whether auto-retry should be enabled.
 */
async setAutoRetry(enabled: boolean): Promise<void> {
	const command = { type: "set_auto_retry", enabled } as const;
	await this.send(command);
}
/**
 * Send an `abort_retry` command to cancel an in-progress retry.
 */
async abortRetry(): Promise<void> {
	const command = { type: "abort_retry" } as const;
	await this.send(command);
}
/**
* Execute a bash command.
*/

View file

@ -270,6 +270,20 @@ export async function runRpcMode(session: AgentSession): Promise<never> {
return success(id, "set_auto_compaction");
}
// =================================================================
// Retry
// =================================================================
case "set_auto_retry": {
session.setAutoRetryEnabled(command.enabled);
return success(id, "set_auto_retry");
}
case "abort_retry": {
session.abortRetry();
return success(id, "abort_retry");
}
// =================================================================
// Bash
// =================================================================

View file

@ -40,6 +40,10 @@ export type RpcCommand =
| { id?: string; type: "compact"; customInstructions?: string }
| { id?: string; type: "set_auto_compaction"; enabled: boolean }
// Retry
| { id?: string; type: "set_auto_retry"; enabled: boolean }
| { id?: string; type: "abort_retry" }
// Bash
| { id?: string; type: "bash"; command: string }
| { id?: string; type: "abort_bash" }
@ -127,6 +131,10 @@ export type RpcResponse =
| { id?: string; type: "response"; command: "compact"; success: true; data: CompactionResult }
| { id?: string; type: "response"; command: "set_auto_compaction"; success: true }
// Retry
| { id?: string; type: "response"; command: "set_auto_retry"; success: true }
| { id?: string; type: "response"; command: "abort_retry"; success: true }
// Bash
| { id?: string; type: "response"; command: "bash"; success: true; data: BashResult }
| { id?: string; type: "response"; command: "abort_bash"; success: true }