import type { AssistantMessage } from "../types.js";

/**
 * Regex patterns to detect context overflow errors from different providers.
 *
 * These patterns match error messages returned when the input exceeds
 * the model's context window.
 *
 * Provider-specific patterns (with example error messages):
 *
 * - Anthropic: "prompt is too long: 213462 tokens > 200000 maximum"
 * - OpenAI: "Your input exceeds the context window of this model"
 * - Google: "The input token count (1196265) exceeds the maximum number of tokens allowed (1048575)"
 * - xAI: "This model's maximum prompt length is 131072 but the request contains 537812 tokens"
 * - Groq: "Please reduce the length of the messages or completion"
 * - OpenRouter: "This endpoint's maximum context length is X tokens. However, you requested about Y tokens"
 * - llama.cpp: "the request exceeds the available context size, try increasing it"
 * - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
 * - GitHub Copilot: "prompt token count of X exceeds the limit of Y"
 * - MiniMax: "invalid params, context window exceeds limit"
 * - Kimi For Coding: "Your request exceeded model token limit: X (requested: Y)"
 * - Cerebras: Returns "400/413 status code (no body)" - handled separately below
 * - Mistral: "Prompt contains X tokens ... too large for model with Y maximum context length"
 * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
 * - Ollama: Silently truncates input - not detectable via error message
 */
const OVERFLOW_PATTERNS = [
  /prompt is too long/i, // Anthropic
  /input is too long for requested model/i, // Amazon Bedrock
  /exceeds the context window/i, // OpenAI (Completions & Responses API)
  /input token count.*exceeds the maximum/i, // Google (Gemini)
  /maximum prompt length is \d+/i, // xAI (Grok)
  /reduce the length of the messages/i, // Groq
  /maximum context length is \d+ tokens/i, // OpenRouter (all backends)
  /exceeds the limit of \d+/i, // GitHub Copilot
  /exceeds the available context size/i, // llama.cpp server
  /greater than the context length/i, // LM Studio
  /context window exceeds limit/i, // MiniMax
  /exceeded model token limit/i, // Kimi For Coding
  /too large for model with \d+ maximum context length/i, // Mistral
  /context[_ ]length[_ ]exceeded/i, // Generic fallback
  /too many tokens/i, // Generic fallback
  /token limit exceeded/i, // Generic fallback
];

/**
 * Check if an assistant message represents a context overflow error.
 *
 * This handles two cases:
 * 1. Error-based overflow: Most providers return stopReason "error" with a
 *    specific error message pattern.
 * 2. Silent overflow: Some providers accept overflow requests and return
 *    successfully. For these, we check if usage.input exceeds the context window.
 *
 * ## Reliability by Provider
 *
 * **Reliable detection (returns error with detectable message):**
 * - Anthropic: "prompt is too long: X tokens > Y maximum"
 * - OpenAI (Completions & Responses): "exceeds the context window"
 * - Google Gemini: "input token count exceeds the maximum"
 * - xAI (Grok): "maximum prompt length is X but request contains Y"
 * - Groq: "reduce the length of the messages"
 * - Cerebras: 400/413 status code (no body)
 * - Mistral: "Prompt contains X tokens ... too large for model with Y maximum context length"
 * - OpenRouter (all backends): "maximum context length is X tokens"
 * - llama.cpp: "exceeds the available context size"
 * - LM Studio: "greater than the context length"
 * - Kimi For Coding: "exceeded model token limit: X (requested: Y)"
 *
 * **Unreliable detection:**
 * - z.ai: Sometimes accepts overflow silently (detectable via usage.input > contextWindow),
 *   sometimes returns rate limit errors. Pass the contextWindow param to detect silent overflow.
 * - Ollama: Silently truncates input without error. Cannot be detected via this function.
 *   The response will have usage.input < expected, but we don't know the expected value.
 *
 * ## Custom Providers
 *
 * If you've added custom models via settings.json, this function may not detect
 * overflow errors from those providers. To add support:
 *
 * 1. Send a request that exceeds the model's context window
 * 2. Check the errorMessage in the response
 * 3. Create a regex pattern that matches the error
 * 4. Add the pattern to OVERFLOW_PATTERNS in this file, or check the
 *    errorMessage yourself before calling this function
 *
 * @param message - The assistant message to check
 * @param contextWindow - Optional context window size for detecting silent overflow (z.ai)
 * @returns true if the message indicates a context overflow
 */
export function isContextOverflow(
  message: AssistantMessage,
  contextWindow?: number,
): boolean {
  // Case 1: Check error message patterns
  if (message.stopReason === "error" && message.errorMessage) {
    // Check known patterns
    if (OVERFLOW_PATTERNS.some((p) => p.test(message.errorMessage!))) {
      return true;
    }

    // Cerebras returns 400/413 with no body for context overflow
    // Note: 429 is rate limiting (requests/tokens per time), NOT context overflow
    if (
      /^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)
    ) {
      return true;
    }
  }

  // Case 2: Silent overflow (z.ai style) - successful but usage exceeds context
  if (contextWindow && message.stopReason === "stop") {
    const inputTokens = message.usage.input + message.usage.cacheRead;
    if (inputTokens > contextWindow) {
      return true;
    }
  }

  return false;
}

/**
 * Get the overflow patterns for testing purposes.
 */
export function getOverflowPatterns(): RegExp[] {
  return [...OVERFLOW_PATTERNS];
}
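
// Example usage (a minimal sketch, not part of the module's public surface).
// It assumes a caller that already has an AssistantMessage and knows the
// model's context window; `runCompletion`, `model.contextWindow`, the custom
// provider regex, and `compactConversation` are hypothetical names for
// whatever the surrounding agent loop provides.
//
//   const message = await runCompletion(model, request);
//
//   // For a custom provider whose error isn't covered by OVERFLOW_PATTERNS,
//   // check the errorMessage yourself before falling back to this function
//   // (the pattern below is illustrative, not a real provider message):
//   const customOverflow =
//     message.stopReason === "error" &&
//     /my custom provider context too large/i.test(message.errorMessage ?? "");
//
//   // Pass contextWindow so silent overflow (z.ai style) is also caught,
//   // not just error-message overflow.
//   if (customOverflow || isContextOverflow(message, model.contextWindow)) {
//     await compactConversation(); // e.g. summarize or drop old turns, then retry
//   }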