fix(ai): ensure maxTokens > thinkingBudget for Claude thinking models

Claude requires max_tokens > thinking.budget_tokens. When the caller specifies a small maxTokens (e.g. compaction with ~13k tokens) and reasoning is enabled with a high budget (16k tokens), this constraint was violated. Fix: in mapOptionsForApi, add thinkingBudget on top of the caller's maxTokens (capped at model.maxTokens). If the capped maxTokens still does not exceed thinkingBudget, reduce thinkingBudget so that at least 1024 tokens remain for output. Applied to both the anthropic-messages and google-gemini-cli APIs. Also adds test utilities for OAuth credential resolution and tests for compaction with thinking models. Fixes #413.
2026-04-16 17:01:02 +00:00 · 2026-01-03 02:45:30 +01:00 · 2026-01-03 02:45:30 +01:00 · 8df22faedf
commit 8df22faedf
parent 97af788344
4 changed files with 347 additions and 7 deletions
--- a/packages/ai/src/stream.ts
+++ b/packages/ai/src/stream.ts
@ -159,6 +159,8 @@ function mapOptionsForApi<TApi extends Api>(
 				return { ...base, thinkingEnabled: false } satisfies AnthropicOptions;
 			}

+			// Claude requires max_tokens > thinking.budget_tokens
+			// So we need to ensure maxTokens accounts for both thinking and output
 			const anthropicBudgets = {
 				minimal: 1024,
 				low: 2048,
@ -166,10 +168,21 @@ function mapOptionsForApi<TApi extends Api>(
 				high: 16384,
 			};

+			const minOutputTokens = 1024;
+			let thinkingBudget = anthropicBudgets[clampReasoning(options.reasoning)!];
+			// Caller's maxTokens is the desired output; add thinking budget on top, capped at model limit
+			const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
+
+			// If not enough room for thinking + output, reduce thinking budget
+			if (maxTokens <= thinkingBudget) {
+				thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
+			}
+
 			return {
 				...base,
+				maxTokens,
 				thinkingEnabled: true,
-				thinkingBudgetTokens: anthropicBudgets[clampReasoning(options.reasoning)!],
+				thinkingBudgetTokens: thinkingBudget,
 			} satisfies AnthropicOptions;
 		}

@ -234,7 +247,9 @@ function mapOptionsForApi<TApi extends Api>(
 				} satisfies GoogleGeminiCliOptions;
 			}

-			// Gemini 2.x models use thinkingBudget
+			// Models using thinkingBudget (Gemini 2.x, Claude via Antigravity)
+			// Claude requires max_tokens > thinking.budget_tokens
+			// So we need to ensure maxTokens accounts for both thinking and output
 			const budgets: Record<ClampedReasoningEffort, number> = {
 				minimal: 1024,
 				low: 2048,
@ -242,11 +257,22 @@ function mapOptionsForApi<TApi extends Api>(
 				high: 16384,
 			};

+			const minOutputTokens = 1024;
+			let thinkingBudget = budgets[effort];
+			// Caller's maxTokens is the desired output; add thinking budget on top, capped at model limit
+			const maxTokens = Math.min((base.maxTokens || 0) + thinkingBudget, model.maxTokens);
+
+			// If not enough room for thinking + output, reduce thinking budget
+			if (maxTokens <= thinkingBudget) {
+				thinkingBudget = Math.max(0, maxTokens - minOutputTokens);
+			}
+
 			return {
 				...base,
+				maxTokens,
 				thinking: {
 					enabled: true,
-					budgetTokens: budgets[effort],
+					budgetTokens: thinkingBudget,
 				},
 			} satisfies GoogleGeminiCliOptions;
 		}