Mirror of https://github.com/harivansh-afk/clanker-agent.git (synced 2026-04-15 09:01:13 +00:00)
move pi-mono into companion-cloud as apps/companion-os
- Copy all pi-mono source into apps/companion-os/
- Update Dockerfile to COPY the pre-built binary instead of downloading it from GitHub Releases
- Update deploy-staging.yml to build pi from source (bun compile) before the Docker build
- Add apps/companion-os/** to path triggers
- No more cross-repo dispatch needed

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 0250f72976
579 changed files with 206942 additions and 0 deletions
packages/ai/src/utils/overflow.ts (new file, 127 lines)
@@ -0,0 +1,127 @@
import type { AssistantMessage } from "../types.js";

/**
 * Regex patterns to detect context overflow errors from different providers.
 *
 * These patterns match error messages returned when the input exceeds
 * the model's context window.
 *
 * Provider-specific patterns (with example error messages):
 *
 * - Anthropic: "prompt is too long: 213462 tokens > 200000 maximum"
 * - OpenAI: "Your input exceeds the context window of this model"
 * - Google: "The input token count (1196265) exceeds the maximum number of tokens allowed (1048575)"
 * - xAI: "This model's maximum prompt length is 131072 but the request contains 537812 tokens"
 * - Groq: "Please reduce the length of the messages or completion"
 * - OpenRouter: "This endpoint's maximum context length is X tokens. However, you requested about Y tokens"
 * - llama.cpp: "the request exceeds the available context size, try increasing it"
 * - LM Studio: "tokens to keep from the initial prompt is greater than the context length"
 * - GitHub Copilot: "prompt token count of X exceeds the limit of Y"
 * - MiniMax: "invalid params, context window exceeds limit"
 * - Kimi For Coding: "Your request exceeded model token limit: X (requested: Y)"
 * - Cerebras: Returns "400/413 status code (no body)" - handled separately below
 * - Mistral: "Prompt contains X tokens ... too large for model with Y maximum context length"
 * - z.ai: Does NOT error, accepts overflow silently - handled via usage.input > contextWindow
 * - Ollama: Silently truncates input - not detectable via error message
 */
const OVERFLOW_PATTERNS = [
	/prompt is too long/i, // Anthropic
	/input is too long for requested model/i, // Amazon Bedrock
	/exceeds the context window/i, // OpenAI (Completions & Responses API)
	/input token count.*exceeds the maximum/i, // Google (Gemini)
	/maximum prompt length is \d+/i, // xAI (Grok)
	/reduce the length of the messages/i, // Groq
	/maximum context length is \d+ tokens/i, // OpenRouter (all backends)
	/exceeds the limit of \d+/i, // GitHub Copilot
	/exceeds the available context size/i, // llama.cpp server
	/greater than the context length/i, // LM Studio
	/context window exceeds limit/i, // MiniMax
	/exceeded model token limit/i, // Kimi For Coding
	/too large for model with \d+ maximum context length/i, // Mistral
	/context[_ ]length[_ ]exceeded/i, // Generic fallback
	/too many tokens/i, // Generic fallback
	/token limit exceeded/i, // Generic fallback
];

/**
 * Check if an assistant message represents a context overflow error.
 *
 * This handles two cases:
 * 1. Error-based overflow: Most providers return stopReason "error" with a
 *    specific error message pattern.
 * 2. Silent overflow: Some providers accept overflow requests and return
 *    successfully. For these, we check if usage.input exceeds the context window.
 *
 * ## Reliability by Provider
 *
 * **Reliable detection (returns error with detectable message):**
 * - Anthropic: "prompt is too long: X tokens > Y maximum"
 * - OpenAI (Completions & Responses): "exceeds the context window"
 * - Google Gemini: "input token count exceeds the maximum"
 * - xAI (Grok): "maximum prompt length is X but request contains Y"
 * - Groq: "reduce the length of the messages"
 * - Cerebras: 400/413 status code (no body)
 * - Mistral: "Prompt contains X tokens ... too large for model with Y maximum context length"
 * - OpenRouter (all backends): "maximum context length is X tokens"
 * - llama.cpp: "exceeds the available context size"
 * - LM Studio: "greater than the context length"
 * - Kimi For Coding: "exceeded model token limit: X (requested: Y)"
 *
 * **Unreliable detection:**
 * - z.ai: Sometimes accepts overflow silently (detectable via usage.input > contextWindow),
 *   sometimes returns rate limit errors. Pass the contextWindow param to detect silent overflow.
 * - Ollama: Silently truncates input without error. Cannot be detected via this function.
 *   The response will have usage.input < expected, but we don't know the expected value.
 *
 * ## Custom Providers
 *
 * If you've added custom models via settings.json, this function may not detect
 * overflow errors from those providers. To add support:
 *
 * 1. Send a request that exceeds the model's context window
 * 2. Check the errorMessage in the response
 * 3. Create a regex pattern that matches the error
 * 4. Add the pattern to OVERFLOW_PATTERNS in this file, or
 *    check the errorMessage yourself before calling this function
 *
 * @param message - The assistant message to check
 * @param contextWindow - Optional context window size for detecting silent overflow (z.ai)
 * @returns true if the message indicates a context overflow
 */
export function isContextOverflow(
	message: AssistantMessage,
	contextWindow?: number,
): boolean {
	// Case 1: Check error message patterns
	if (message.stopReason === "error" && message.errorMessage) {
		// Check known patterns
		if (OVERFLOW_PATTERNS.some((p) => p.test(message.errorMessage!))) {
			return true;
		}

		// Cerebras returns 400/413 with no body for context overflow
		// Note: 429 is rate limiting (requests/tokens per time), NOT context overflow
		if (
			/^4(00|13)\s*(status code)?\s*\(no body\)/i.test(message.errorMessage)
		) {
			return true;
		}
	}

	// Case 2: Silent overflow (z.ai style) - successful but usage exceeds context
	if (contextWindow && message.stopReason === "stop") {
		const inputTokens = message.usage.input + message.usage.cacheRead;
		if (inputTokens > contextWindow) {
			return true;
		}
	}

	return false;
}

/**
 * Get the overflow patterns for testing purposes.
 */
export function getOverflowPatterns(): RegExp[] {
	return [...OVERFLOW_PATTERNS];
}
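For reference, a minimal usage sketch (not part of the commit). It assumes the AssistantMessage shape implied by the code above (stopReason, errorMessage, usage.input, usage.cacheRead); the token counts, import paths, and the custom-provider pattern are hypothetical:

import { isContextOverflow } from "./overflow.js";
import type { AssistantMessage } from "../types.js";

// Hypothetical error response, e.g. Anthropic rejecting an oversized prompt.
// Cast through unknown because the sketch only fills the fields the helper reads.
const errored = {
	stopReason: "error",
	errorMessage: "prompt is too long: 213462 tokens > 200000 maximum",
	usage: { input: 0, cacheRead: 0 },
} as unknown as AssistantMessage;

console.log(isContextOverflow(errored)); // true: matched by /prompt is too long/i

// Hypothetical silent overflow (z.ai style): the request "succeeds", so the only
// signal is that reported input tokens exceed the model's context window.
const silent = {
	stopReason: "stop",
	usage: { input: 210_000, cacheRead: 5_000 },
} as unknown as AssistantMessage;

console.log(isContextOverflow(silent)); // false: no error and no context window to compare against
console.log(isContextOverflow(silent, 200_000)); // true: 215000 input tokens > 200000

// Per the "Custom Providers" note, an unknown provider's error can be checked
// before falling back to the built-in detection (pattern is hypothetical):
const CUSTOM_OVERFLOW = /request too large for context/i;
function isOverflowWithCustom(message: AssistantMessage, contextWindow?: number): boolean {
	if (message.stopReason === "error" && message.errorMessage && CUSTOM_OVERFLOW.test(message.errorMessage)) {
		return true;
	}
	return isContextOverflow(message, contextWindow);
}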