mirror of https://github.com/getcompanion-ai/co-mono.git (synced 2026-04-16 20:01:24 +00:00)
- Add Mistral to KnownProvider type and model generation
- Implement Mistral-specific compat handling in openai-completions:
  - requiresToolResultName: tool results need name field
  - requiresAssistantAfterToolResult: synthetic assistant message between tool/user
  - requiresThinkingAsText: thinking blocks as <thinking> text
  - requiresMistralToolIds: tool IDs must be exactly 9 alphanumeric chars
- Add MISTRAL_API_KEY environment variable support
- Add Mistral tests across all test files
- Update documentation (README, CHANGELOG) for both ai and coding-agent packages
- Remove client IDs from gemini.md, reference upstream source instead

Closes #165
112 lines
4.9 KiB
TypeScript
import type { AssistantMessage } from "../types.js";

/**
 * Regex patterns to detect context overflow errors from different providers.
 *
 * These patterns match error messages returned when the input exceeds
 * the model's context window.
 *
 * Provider-specific patterns (with example error messages):
 *
 * - Anthropic: "prompt is too long: 213462 tokens > 200000 maximum"
 * - OpenAI: "Your input exceeds the context window of this model"
 * - Google: "The input token count (1196265) exceeds the maximum number of tokens allowed (1048575)"
 * - xAI: "This model's maximum prompt length is 131072 but the request contains 537812 tokens"
 * - Groq: "Please reduce the length of the messages or completion"
 * - OpenRouter: "This endpoint's maximum context length is X tokens. However, you requested about Y tokens"
 * - llama.cpp: "the request exceeds the available context size, try increasing it"
 * - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
 * - Cerebras: Returns "400 status code (no body)" - handled separately below
 * - Mistral: Returns "400 status code (no body)" - handled separately below
 * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
 * - Ollama: Silently truncates input - not detectable via error message
 */
const OVERFLOW_PATTERNS = [
	/prompt is too long/i, // Anthropic
	/exceeds the context window/i, // OpenAI (Completions & Responses API)
	/input token count.*exceeds the maximum/i, // Google (Gemini)
	/maximum prompt length is \d+/i, // xAI (Grok)
	/reduce the length of the messages/i, // Groq
	/maximum context length is \d+ tokens/i, // OpenRouter (all backends)
	/exceeds the available context size/i, // llama.cpp server
	/greater than the context length/i, // LM Studio
	/context length exceeded/i, // Generic fallback
	/too many tokens/i, // Generic fallback
	/token limit exceeded/i, // Generic fallback
];
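
// Illustrative example (not part of this module): each pattern above corresponds to
// one of the provider errors quoted in the doc comment, e.g. the Google message:
//
//   /input token count.*exceeds the maximum/i.test(
//     "The input token count (1196265) exceeds the maximum number of tokens allowed (1048575)",
//   ); // -> true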

/**
 * Check if an assistant message represents a context overflow error.
 *
 * This handles two cases:
 * 1. Error-based overflow: Most providers return stopReason "error" with a
 *    specific error message pattern.
 * 2. Silent overflow: Some providers accept overflow requests and return
 *    successfully. For these, we check if usage.input exceeds the context window.
 *
 * ## Reliability by Provider
 *
 * **Reliable detection (returns error with detectable message):**
 * - Anthropic: "prompt is too long: X tokens > Y maximum"
 * - OpenAI (Completions & Responses): "exceeds the context window"
 * - Google Gemini: "input token count exceeds the maximum"
 * - xAI (Grok): "maximum prompt length is X but request contains Y"
 * - Groq: "reduce the length of the messages"
 * - Cerebras: 400/413 status code (no body)
 * - Mistral: 400/413 status code (no body)
 * - OpenRouter (all backends): "maximum context length is X tokens"
 * - llama.cpp: "exceeds the available context size"
 * - LM Studio: "greater than the context length"
 *
 * **Unreliable detection:**
 * - z.ai: Sometimes accepts overflow silently (detectable via usage.input > contextWindow),
 *   sometimes returns rate limit errors. Pass the contextWindow param to detect silent overflow.
 * - Ollama: Silently truncates input without error. Cannot be detected via this function.
 *   The response will have usage.input < expected, but we don't know the expected value.
 *
 * ## Custom Providers
 *
 * If you've added custom models via settings.json, this function may not detect
 * overflow errors from those providers. To add support:
 *
 * 1. Send a request that exceeds the model's context window
 * 2. Check the errorMessage in the response
 * 3. Create a regex pattern that matches the error
 * 4. Add the pattern to OVERFLOW_PATTERNS in this file, or check the errorMessage
 *    yourself before calling this function
 *
 * @param message - The assistant message to check
 * @param contextWindow - Optional context window size for detecting silent overflow (z.ai)
 * @returns true if the message indicates a context overflow
 */
export function isContextOverflow(message: AssistantMessage, contextWindow?: number): boolean {
	// Case 1: Check error message patterns
	if (message.stopReason === "error" && message.errorMessage) {
		// Check known patterns
		if (OVERFLOW_PATTERNS.some((p) => p.test(message.errorMessage!))) {
			return true;
		}

		// Cerebras and Mistral return 400/413 with no body - check for status code pattern
		if (/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)) {
			return true;
		}
	}

	// Case 2: Silent overflow (z.ai style) - successful but usage exceeds context
	if (contextWindow && message.stopReason === "stop") {
		const inputTokens = message.usage.input + message.usage.cacheRead;
		if (inputTokens > contextWindow) {
			return true;
		}
	}

	return false;
}
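
// Usage sketch (illustrative only): how a caller might react to overflow.
// `complete` and `model.contextWindow` are assumed caller-side names, not exports of this package.
//
//   const message = await complete(model, context);
//   if (isContextOverflow(message, model.contextWindow)) {
//     // Compact or truncate the context before retrying the request.
//   }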

/**
 * Get the overflow patterns for testing purposes.
 */
export function getOverflowPatterns(): RegExp[] {
	return [...OVERFLOW_PATTERNS];
}
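
// Example (illustrative): a test could assert that a known provider error is covered.
// The error string is the Anthropic example quoted in the OVERFLOW_PATTERNS doc comment.
//
//   const anthropicError = "prompt is too long: 213462 tokens > 200000 maximum";
//   getOverflowPatterns().some((p) => p.test(anthropicError)); // -> true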