Auto-retry on transient provider errors (overloaded, rate limit, 5xx)

- Add retry logic with exponential backoff (2s, 4s, 8s) in AgentSession
- Disable Anthropic SDK built-in retries (maxRetries: 0) to allow app-level handling
- TUI shows retry status with Escape to cancel
- RPC mode: add set_auto_retry, abort_retry commands and auto_retry_start/end events
- Configurable via settings.json: retry.enabled, retry.maxRetries, retry.baseDelayMs
- Exclude context overflow errors from retry (handled by compaction)

fixes #157
This commit is contained in:
Mario Zechner 2025-12-10 23:36:46 +01:00
parent 79f5c6d22e
commit bb445d24f1
11 changed files with 379 additions and 3 deletions

View file

@ -31,7 +31,9 @@ import { expandSlashCommand, type FileSlashCommand } from "./slash-commands.js";
export type AgentSessionEvent =
| AgentEvent
| { type: "auto_compaction_start"; reason: "threshold" | "overflow" }
| { type: "auto_compaction_end"; result: CompactionResult | null; aborted: boolean; willRetry: boolean };
| { type: "auto_compaction_end"; result: CompactionResult | null; aborted: boolean; willRetry: boolean }
| { type: "auto_retry_start"; attempt: number; maxAttempts: number; delayMs: number; errorMessage: string }
| { type: "auto_retry_end"; success: boolean; attempt: number; finalError?: string };
/** Listener function for agent session events */
export type AgentSessionEventListener = (event: AgentSessionEvent) => void;
@ -116,6 +118,12 @@ export class AgentSession {
private _compactionAbortController: AbortController | null = null;
private _autoCompactionAbortController: AbortController | null = null;
// Retry state
private _retryAbortController: AbortController | null = null;
private _retryAttempt = 0;
private _retryPromise: Promise<void> | null = null;
private _retryResolve: (() => void) | null = null;
// Bash execution state
private _bashAbortController: AbortController | null = null;
private _pendingBashMessages: BashExecutionMessage[] = [];
@ -184,14 +192,40 @@ export class AgentSession {
}
}
// Check auto-compaction after agent completes
// Check auto-retry and auto-compaction after agent completes
if (event.type === "agent_end" && this._lastAssistantMessage) {
const msg = this._lastAssistantMessage;
this._lastAssistantMessage = null;
// Check for retryable errors first (overloaded, rate limit, server errors)
if (this._isRetryableError(msg)) {
const didRetry = await this._handleRetryableError(msg);
if (didRetry) return; // Retry was initiated, don't proceed to compaction
} else if (this._retryAttempt > 0) {
// Previous retry succeeded - emit success event and reset counter
this._emit({
type: "auto_retry_end",
success: true,
attempt: this._retryAttempt,
});
this._retryAttempt = 0;
// Resolve the retry promise so waitForRetry() completes
this._resolveRetry();
}
await this._handleAgentEndCompaction(msg);
}
};
/** Resolve the pending retry promise */
private _resolveRetry(): void {
if (this._retryResolve) {
this._retryResolve();
this._retryResolve = null;
this._retryPromise = null;
}
}
/** Extract text content from a message */
private _getUserMessageText(message: Message): string {
if (message.role !== "user") return "";
@ -379,6 +413,7 @@ export class AgentSession {
const expandedText = expandCommands ? expandSlashCommand(text, [...this._fileCommands]) : text;
await this.agent.prompt(expandedText, options?.attachments);
await this.waitForRetry();
}
/**
@ -419,6 +454,7 @@ export class AgentSession {
* Abort current operation and wait for agent to become idle.
*/
async abort(): Promise<void> {
this.abortRetry();
this.agent.abort();
await this.agent.waitForIdle();
}
@ -784,6 +820,159 @@ export class AgentSession {
return this.settingsManager.getCompactionEnabled();
}
// =========================================================================
// Auto-Retry
// =========================================================================
/**
* Check if an error is retryable (overloaded, rate limit, server errors).
* Context overflow errors are NOT retryable (handled by compaction instead).
*/
private _isRetryableError(message: AssistantMessage): boolean {
if (message.stopReason !== "error" || !message.errorMessage) return false;
// Context overflow is handled by compaction, not retry
const contextWindow = this.model?.contextWindow ?? 0;
if (isContextOverflow(message, contextWindow)) return false;
const err = message.errorMessage;
// Match: overloaded_error, rate limit, 429, 500, 502, 503, 504, service unavailable
return /overloaded|rate.?limit|too many requests|429|500|502|503|504|service.?unavailable|server error|internal error/i.test(
err,
);
}
/**
* Handle retryable errors with exponential backoff.
* @returns true if retry was initiated, false if max retries exceeded or disabled
*/
private async _handleRetryableError(message: AssistantMessage): Promise<boolean> {
const settings = this.settingsManager.getRetrySettings();
if (!settings.enabled) return false;
this._retryAttempt++;
// Create retry promise on first attempt so waitForRetry() can await it
if (this._retryAttempt === 1 && !this._retryPromise) {
this._retryPromise = new Promise((resolve) => {
this._retryResolve = resolve;
});
}
if (this._retryAttempt > settings.maxRetries) {
// Max retries exceeded, emit final failure and reset
this._emit({
type: "auto_retry_end",
success: false,
attempt: this._retryAttempt - 1,
finalError: message.errorMessage,
});
this._retryAttempt = 0;
this._resolveRetry(); // Resolve so waitForRetry() completes
return false;
}
const delayMs = settings.baseDelayMs * 2 ** (this._retryAttempt - 1);
this._emit({
type: "auto_retry_start",
attempt: this._retryAttempt,
maxAttempts: settings.maxRetries,
delayMs,
errorMessage: message.errorMessage || "Unknown error",
});
// Remove error message from agent state (keep in session for history)
const messages = this.agent.state.messages;
if (messages.length > 0 && messages[messages.length - 1].role === "assistant") {
this.agent.replaceMessages(messages.slice(0, -1));
}
// Wait with exponential backoff (abortable)
this._retryAbortController = new AbortController();
try {
await this._sleep(delayMs, this._retryAbortController.signal);
} catch {
// Aborted during sleep - emit end event so UI can clean up
const attempt = this._retryAttempt;
this._retryAttempt = 0;
this._retryAbortController = null;
this._emit({
type: "auto_retry_end",
success: false,
attempt,
finalError: "Retry cancelled",
});
this._resolveRetry();
return false;
}
this._retryAbortController = null;
// Retry via continue() - use setTimeout to break out of event handler chain
setTimeout(() => {
this.agent.continue().catch(() => {
// Retry failed - will be caught by next agent_end
});
}, 0);
return true;
}
/**
* Sleep helper that respects abort signal.
*/
private _sleep(ms: number, signal?: AbortSignal): Promise<void> {
return new Promise((resolve, reject) => {
if (signal?.aborted) {
reject(new Error("Aborted"));
return;
}
const timeout = setTimeout(resolve, ms);
signal?.addEventListener("abort", () => {
clearTimeout(timeout);
reject(new Error("Aborted"));
});
});
}
/**
* Cancel in-progress retry.
*/
abortRetry(): void {
this._retryAbortController?.abort();
this._retryAttempt = 0;
this._resolveRetry();
}
/**
* Wait for any in-progress retry to complete.
* Returns immediately if no retry is in progress.
*/
private async waitForRetry(): Promise<void> {
if (this._retryPromise) {
await this._retryPromise;
}
}
/** Whether auto-retry is currently in progress */
get isRetrying(): boolean {
return this._retryPromise !== null;
}
/** Whether auto-retry is enabled */
get autoRetryEnabled(): boolean {
return this.settingsManager.getRetryEnabled();
}
/**
* Toggle auto-retry setting.
*/
setAutoRetryEnabled(enabled: boolean): void {
this.settingsManager.setRetryEnabled(enabled);
}
// =========================================================================
// Bash Execution
// =========================================================================