From 25707f9ad4650037e075b68d4d77cbf4c71d1243 Mon Sep 17 00:00:00 2001
From: Mario Zechner <badlogicgames@gmail.com>
Date: Thu, 29 Jan 2026 00:43:38 +0100
Subject: [PATCH] fix(ai): 429 rate limit errors no longer trigger
 auto-compaction

429 (Too Many Requests) was incorrectly classified as context overflow,
triggering compaction instead of retry with backoff. The original logic
assumed token-based rate limiting correlates with context overflow, but
these are different concepts:

- Rate limiting (429): requests/tokens per time period (throughput)
- Context overflow: single request exceeds context window (size)

Now 429 errors are handled by the existing retry logic with exponential
backoff, while 400/413 remain as potential context overflow indicators.

fixes #1038
---
 packages/ai/CHANGELOG.md          |  1 +
 packages/ai/src/utils/overflow.ts | 14 +++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md
index 1e11168f..db28b39f 100644
--- a/packages/ai/CHANGELOG.md
+++ b/packages/ai/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Fixed
 
+- Fixed 429 rate limit errors incorrectly triggering auto-compaction instead of retry with backoff ([#1038](https://github.com/badlogic/pi-mono/issues/1038))
 - Fixed Anthropic provider to handle `sensitive` stop_reason returned by API ([#978](https://github.com/badlogic/pi-mono/issues/978))
 - Fixed DeepSeek API compatibility by detecting `deepseek.com` URLs and disabling unsupported `developer` role ([#1048](https://github.com/badlogic/pi-mono/issues/1048))
 - Fixed Anthropic provider to preserve input token counts when proxies omit them in `message_delta` events ([#1045](https://github.com/badlogic/pi-mono/issues/1045))
diff --git a/packages/ai/src/utils/overflow.ts b/packages/ai/src/utils/overflow.ts
index 07c7400b..03d888b1 100644
--- a/packages/ai/src/utils/overflow.ts
+++ b/packages/ai/src/utils/overflow.ts
@@ -18,8 +18,8 @@ import type { AssistantMessage } from "../types.js";
  * - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
  * - GitHub Copilot: "prompt token count of X exceeds the limit of Y"
  * - MiniMax: "invalid params, context window exceeds limit"
- * - Cerebras: Returns "400 status code (no body)" - handled separately below
- * - Mistral: Returns "400 status code (no body)" - handled separately below
+ * - Cerebras: Returns "400 status code (no body)" or "413 status code (no body)" - handled separately below
+ * - Mistral: Returns "400 status code (no body)" or "413 status code (no body)" - handled separately below
  * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
  * - Ollama: Silently truncates input - not detectable via error message
  */
@@ -57,8 +57,8 @@ const OVERFLOW_PATTERNS = [
  * - Google Gemini: "input token count exceeds the maximum"
  * - xAI (Grok): "maximum prompt length is X but request contains Y"
  * - Groq: "reduce the length of the messages"
- * - Cerebras: 400/413/429 status code (no body)
- * - Mistral: 400/413/429 status code (no body)
+ * - Cerebras: 400/413 status code (no body)
+ * - Mistral: 400/413 status code (no body)
  * - OpenRouter (all backends): "maximum context length is X tokens"
  * - llama.cpp: "exceeds the available context size"
  * - LM Studio: "greater than the context length"
@@ -92,9 +92,9 @@ export function isContextOverflow(message: AssistantMessage, contextWindow?: num
 			return true;
 		}
 
-		// Cerebras and Mistral return 400/413/429 with no body - check for status code pattern
-		// 429 can indicate token-based rate limiting which correlates with context overflow
-		if (/^4(00|13|29)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
+		// Cerebras and Mistral return 400/413 with no body for context overflow
+		// Note: 429 is rate limiting (requests/tokens per time), NOT context overflow
+		if (/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
 			return true;
 		}
 	}