mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-19 23:01:32 +00:00
fix(ai): 429 rate limit errors no longer trigger auto-compaction
429 (Too Many Requests) was incorrectly classified as context overflow, triggering compaction instead of retry with backoff. The original logic assumed token-based rate limiting correlates with context overflow, but these are different concepts: - Rate limiting (429): requests/tokens per time period (throughput) - Context overflow: single request exceeds context window (size) Now 429 errors are handled by the existing retry logic with exponential backoff, while 400/413 remain as potential context overflow indicators. fixes #1038
This commit is contained in:
parent
a373dce747
commit
25707f9ad4
2 changed files with 8 additions and 7 deletions
|
|
@ -18,8 +18,8 @@ import type { AssistantMessage } from "../types.js";
|
|||
* - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
|
||||
* - GitHub Copilot: "prompt token count of X exceeds the limit of Y"
|
||||
* - MiniMax: "invalid params, context window exceeds limit"
|
||||
* - Cerebras: Returns "400 status code (no body)" - handled separately below
|
||||
* - Mistral: Returns "400 status code (no body)" - handled separately below
|
||||
* - Cerebras: Returns "400/413 status code (no body)" - handled separately below
|
||||
* - Mistral: Returns "400/413 status code (no body)" - handled separately below
|
||||
* - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
|
||||
* - Ollama: Silently truncates input - not detectable via error message
|
||||
*/
|
||||
|
|
@ -57,8 +57,8 @@ const OVERFLOW_PATTERNS = [
|
|||
* - Google Gemini: "input token count exceeds the maximum"
|
||||
* - xAI (Grok): "maximum prompt length is X but request contains Y"
|
||||
* - Groq: "reduce the length of the messages"
|
||||
* - Cerebras: 400/413/429 status code (no body)
|
||||
* - Mistral: 400/413/429 status code (no body)
|
||||
* - Cerebras: 400/413 status code (no body)
|
||||
* - Mistral: 400/413 status code (no body)
|
||||
* - OpenRouter (all backends): "maximum context length is X tokens"
|
||||
* - llama.cpp: "exceeds the available context size"
|
||||
* - LM Studio: "greater than the context length"
|
||||
|
|
@ -92,9 +92,9 @@ export function isContextOverflow(message: AssistantMessage, contextWindow?: num
|
|||
return true;
|
||||
}
|
||||
|
||||
// Cerebras and Mistral return 400/413/429 with no body - check for status code pattern
|
||||
// 429 can indicate token-based rate limiting which correlates with context overflow
|
||||
if (/^4(00|13|29)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
|
||||
// Cerebras and Mistral return 400/413 with no body for context overflow
|
||||
// Note: 429 is rate limiting (requests/tokens per time), NOT context overflow
|
||||
if (/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue