Improve Gemini CLI provider retries and headers (#670)

Improve Gemini CLI provider retries and headers:
- Add Antigravity endpoint fallback (tries the daily sandbox, then prod, when baseUrl is unset)
- Parse retry delays from headers (Retry-After, x-ratelimit-reset, x-ratelimit-reset-after) before body parsing
- Derive a stable sessionId from the first user message for cache affinity
- Retry empty SSE streams with backoff, without emitting duplicate start/done events
- Add the anthropic-beta header for Claude thinking models only
2026-04-19 22:01:38 +00:00 · 2026-01-13 02:04:53 +02:00 · 2026-01-13 02:04:53 +02:00 · ff15414258
commit ff15414258
parent 9e4ae98358
5 changed files with 693 additions and 189 deletions
--- a/packages/ai/src/providers/google-gemini-cli.ts
+++ b/packages/ai/src/providers/google-gemini-cli.ts
@ -4,6 +4,7 @@
 * Uses the Cloud Code Assist API endpoint to access Gemini and Claude models.
 */
 import { createHash } from "node:crypto";
 import type { Content, ThinkingConfig } from "@google/genai";
 import { calculateCost } from "../models.js";
 import type {
@ -54,6 +55,8 @@ export interface GoogleGeminiCliOptions extends StreamOptions {
 }
 // Endpoint used for non-Antigravity providers when the model has no baseUrl override.
 const DEFAULT_ENDPOINT = "https://cloudcode-pa.googleapis.com";
 // Daily sandbox endpoint; listed first in the Antigravity fallback order below.
 const ANTIGRAVITY_DAILY_ENDPOINT = "https://daily-cloudcode-pa.sandbox.googleapis.com";
 // Ordered endpoints for the Antigravity provider when baseUrl is unset. The request loop
 // indexes this list by attempt number, clamped to the last entry.
 const ANTIGRAVITY_ENDPOINT_FALLBACKS = [ANTIGRAVITY_DAILY_ENDPOINT, DEFAULT_ENDPOINT] as const;
 // Headers for Gemini CLI (prod endpoint)
 const GEMINI_CLI_HEADERS = {
 	"User-Agent": "google-cloud-sdk vscode_cloudshelleditor/0.1",
@ -163,16 +166,66 @@ let toolCallCounter = 0;
 // Retry configuration
 // Upper bound on retries of the initial request: the fetch loop runs attempts 0..MAX_RETRIES.
 const MAX_RETRIES = 3;
 // Exponential backoff base (BASE_DELAY_MS * 2 ** attempt), used when no server-provided delay is found.
 const BASE_DELAY_MS = 1000;
 // Number of additional requests issued when a successful response streams no content.
 const MAX_EMPTY_STREAM_RETRIES = 2;
 // Backoff base for empty-stream retries (EMPTY_STREAM_BASE_DELAY_MS * 2 ** (retry - 1)).
 const EMPTY_STREAM_BASE_DELAY_MS = 500;
 // Value sent in the "anthropic-beta" request header for Claude thinking models.
 const CLAUDE_THINKING_BETA_HEADER = "interleaved-thinking-2025-05-14";
 /**
 * Extract retry delay from Gemini error response (in milliseconds).
- * Parses patterns like:
+ * Checks headers first (Retry-After, x-ratelimit-reset, x-ratelimit-reset-after),
 * then parses body patterns like:
 * - "Your quota will reset after 39s"
 * - "Your quota will reset after 18h31m10s"
 * - "Please retry in Xs" or "Please retry in Xms"
 * - "retryDelay": "34.074824224s" (JSON field)
 */
-function extractRetryDelay(errorText: string): number | undefined {
+export function extractRetryDelay(errorText: string, response?: Response | Headers): number | undefined {
 	const normalizeDelay = (ms: number): number | undefined => (ms > 0 ? Math.ceil(ms + 1000) : undefined);
 	const headers = response instanceof Headers ? response : response?.headers;
 	if (headers) {
 		const retryAfter = headers.get("retry-after");
 		if (retryAfter) {
 			const retryAfterSeconds = Number(retryAfter);
 			if (Number.isFinite(retryAfterSeconds)) {
 				const delay = normalizeDelay(retryAfterSeconds * 1000);
 				if (delay !== undefined) {
 					return delay;
 				}
 			}
 			const retryAfterDate = new Date(retryAfter);
 			const retryAfterMs = retryAfterDate.getTime();
 			if (!Number.isNaN(retryAfterMs)) {
 				const delay = normalizeDelay(retryAfterMs - Date.now());
 				if (delay !== undefined) {
 					return delay;
 				}
 			}
 		}
 		const rateLimitReset = headers.get("x-ratelimit-reset");
 		if (rateLimitReset) {
 			const resetSeconds = Number.parseInt(rateLimitReset, 10);
 			if (!Number.isNaN(resetSeconds)) {
 				const delay = normalizeDelay(resetSeconds * 1000 - Date.now());
 				if (delay !== undefined) {
 					return delay;
 				}
 			}
 		}
 		const rateLimitResetAfter = headers.get("x-ratelimit-reset-after");
 		if (rateLimitResetAfter) {
 			const resetAfterSeconds = Number(rateLimitResetAfter);
 			if (Number.isFinite(resetAfterSeconds)) {
 				const delay = normalizeDelay(resetAfterSeconds * 1000);
 				if (delay !== undefined) {
 					return delay;
 				}
 			}
 		}
 	}
 	// Pattern 1: "Your quota will reset after ..." (formats: "18h31m10s", "10m15s", "6s", "39s")
 	const durationMatch = errorText.match(/reset after (?:(\d+)h)?(?:(\d+)m)?(\d+(?:\.\d+)?)s/i);
 	if (durationMatch) {
@ -181,8 +234,9 @@ function extractRetryDelay(errorText: string): number | undefined {
 		const seconds = parseFloat(durationMatch[3]);
 		if (!Number.isNaN(seconds)) {
 			const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000;
-			if (totalMs > 0) {
+			const delay = normalizeDelay(totalMs);
-				return Math.ceil(totalMs + 1000); // Add 1s buffer
+			if (delay !== undefined) {
 				return delay;
 			}
 		}
 	}
@ -193,7 +247,10 @@ function extractRetryDelay(errorText: string): number | undefined {
 		const value = parseFloat(retryInMatch[1]);
 		if (!Number.isNaN(value) && value > 0) {
 			const ms = retryInMatch[2].toLowerCase() === "ms" ? value : value * 1000;
-			return Math.ceil(ms + 1000);
+			const delay = normalizeDelay(ms);
 			if (delay !== undefined) {
 				return delay;
 			}
 		}
 	}
@ -203,13 +260,21 @@ function extractRetryDelay(errorText: string): number | undefined {
 		const value = parseFloat(retryDelayMatch[1]);
 		if (!Number.isNaN(value) && value > 0) {
 			const ms = retryDelayMatch[2].toLowerCase() === "ms" ? value : value * 1000;
-			return Math.ceil(ms + 1000);
+			const delay = normalizeDelay(ms);
 			if (delay !== undefined) {
 				return delay;
 			}
 		}
 	}
 	return undefined;
 }
 /**
  * Reports whether a model id names a Claude "thinking" variant.
  *
  * The match is case-insensitive and requires both the substring "claude"
  * and the substring "thinking" to appear somewhere in the id.
  *
  * @param modelId - Model identifier to inspect.
  * @returns true when both keywords are present, false otherwise.
  */
 function isClaudeThinkingModel(modelId: string): boolean {
 	const lowered = modelId.toLowerCase();
 	return ["claude", "thinking"].every((keyword) => lowered.includes(keyword));
 }
 /**
 * Check if an error is retryable (rate limit, server error, network error, etc.)
 */
@ -258,6 +323,7 @@ interface CloudCodeAssistRequest {
 	model: string;
 	request: {
 		contents: Content[];
 		sessionId?: string;
 		systemInstruction?: { role?: string; parts: { text: string }[] };
 		generationConfig?: {
 			maxOutputTokens?: number;
@ -355,17 +421,26 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 				throw new Error("Missing token or projectId in Google Cloud credentials. Use /login to re-authenticate.");
 			}
-			const endpoint = model.baseUrl || DEFAULT_ENDPOINT;
+			const isAntigravity = model.provider === "google-antigravity";
-			const url = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;
+			const baseUrl = model.baseUrl?.trim();
 			const endpoints = baseUrl ? [baseUrl] : isAntigravity ? ANTIGRAVITY_ENDPOINT_FALLBACKS : [DEFAULT_ENDPOINT];
 			// Use Antigravity headers for sandbox endpoint, otherwise Gemini CLI headers
 			const isAntigravity = endpoint.includes("sandbox.googleapis.com");
 			const requestBody = buildRequest(model, context, projectId, options, isAntigravity);
 			const headers = isAntigravity ? ANTIGRAVITY_HEADERS : GEMINI_CLI_HEADERS;
 			const requestHeaders = {
 				Authorization: `Bearer ${accessToken}`,
 				"Content-Type": "application/json",
 				Accept: "text/event-stream",
 				...headers,
 				...(isClaudeThinkingModel(model.id) ? { "anthropic-beta": CLAUDE_THINKING_BETA_HEADER } : {}),
 			};
 			const requestBodyJson = JSON.stringify(requestBody);
 			// Fetch with retry logic for rate limits and transient errors
 			let response: Response | undefined;
 			let lastError: Error | undefined;
 			let requestUrl: string | undefined;
 			for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
 				if (options?.signal?.aborted) {
@ -373,15 +448,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 				}
 				try {
-					response = await fetch(url, {
+					const endpoint = endpoints[Math.min(attempt, endpoints.length - 1)];
 					requestUrl = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;
 					response = await fetch(requestUrl, {
 						method: "POST",
-						headers: {
+						headers: requestHeaders,
-							Authorization: `Bearer ${accessToken}`,
+						body: requestBodyJson,
 							"Content-Type": "application/json",
 							Accept: "text/event-stream",
 							...headers,
 						},
 						body: JSON.stringify(requestBody),
 						signal: options?.signal,
 					});
@ -394,7 +466,7 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 					// Check if retryable
 					if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
 						// Use server-provided delay or exponential backoff
-						const serverDelay = extractRetryDelay(errorText);
+						const serverDelay = extractRetryDelay(errorText, response);
 						const delayMs = serverDelay ?? BASE_DELAY_MS * 2 ** attempt;
 						await sleep(delayMs, options?.signal);
 						continue;
@ -428,73 +500,160 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 				throw lastError ?? new Error("Failed to get response after retries");
 			}
-			if (!response.body) {
+			let started = false;
-				throw new Error("No response body");
+			const ensureStarted = () => {
-			}
+				if (!started) {
-
+					stream.push({ type: "start", partial: output });
-			stream.push({ type: "start", partial: output });
+					started = true;
-
+				}
 			let currentBlock: TextContent | ThinkingContent | null = null;
 			const blocks = output.content;
 			const blockIndex = () => blocks.length - 1;
 			// Read SSE stream
 			const reader = response.body.getReader();
 			const decoder = new TextDecoder();
 			let buffer = "";
 			// Set up abort handler to cancel reader when signal fires
 			const abortHandler = () => {
 				void reader.cancel().catch(() => {});
 			};
 			options?.signal?.addEventListener("abort", abortHandler);
-			try {
+			const resetOutput = () => {
-				while (true) {
+				output.content = [];
-					// Check abort signal before each read
+				output.usage = {
-					if (options?.signal?.aborted) {
+					input: 0,
-						throw new Error("Request was aborted");
+					output: 0,
-					}
+					cacheRead: 0,
 					cacheWrite: 0,
 					totalTokens: 0,
 					cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
 				};
 				output.stopReason = "stop";
 				output.errorMessage = undefined;
 				output.timestamp = Date.now();
 				started = false;
 			};
-					const { done, value } = await reader.read();
+			const streamResponse = async (activeResponse: Response): Promise<boolean> => {
-					if (done) break;
+				if (!activeResponse.body) {
 					throw new Error("No response body");
 				}
-					buffer += decoder.decode(value, { stream: true });
+				let hasContent = false;
-					const lines = buffer.split("\n");
+				let currentBlock: TextContent | ThinkingContent | null = null;
-					buffer = lines.pop() || "";
+				const blocks = output.content;
 				const blockIndex = () => blocks.length - 1;
-					for (const line of lines) {
+				// Read SSE stream
-						if (!line.startsWith("data:")) continue;
+				const reader = activeResponse.body.getReader();
 				const decoder = new TextDecoder();
 				let buffer = "";
-						const jsonStr = line.slice(5).trim();
+				// Set up abort handler to cancel reader when signal fires
-						if (!jsonStr) continue;
+				const abortHandler = () => {
 					void reader.cancel().catch(() => {});
 				};
 				options?.signal?.addEventListener("abort", abortHandler);
-						let chunk: CloudCodeAssistResponseChunk;
+				try {
-						try {
+					while (true) {
-							chunk = JSON.parse(jsonStr);
+						// Check abort signal before each read
-						} catch {
+						if (options?.signal?.aborted) {
-							continue;
+							throw new Error("Request was aborted");
 						}
-						// Unwrap the response
+						const { done, value } = await reader.read();
-						const responseData = chunk.response;
+						if (done) break;
 						if (!responseData) continue;
-						const candidate = responseData.candidates?.[0];
+						buffer += decoder.decode(value, { stream: true });
-						if (candidate?.content?.parts) {
+						const lines = buffer.split("\n");
-							for (const part of candidate.content.parts) {
+						buffer = lines.pop() || "";
-								if (part.text !== undefined) {
+
-									const isThinking = isThinkingPart(part);
+						for (const line of lines) {
-									if (
+							if (!line.startsWith("data:")) continue;
-										!currentBlock ||
+
-										(isThinking && currentBlock.type !== "thinking") ||
+							const jsonStr = line.slice(5).trim();
-										(!isThinking && currentBlock.type !== "text")
+							if (!jsonStr) continue;
-									) {
+
 							let chunk: CloudCodeAssistResponseChunk;
 							try {
 								chunk = JSON.parse(jsonStr);
 							} catch {
 								continue;
 							}
 							// Unwrap the response
 							const responseData = chunk.response;
 							if (!responseData) continue;
 							const candidate = responseData.candidates?.[0];
 							if (candidate?.content?.parts) {
 								for (const part of candidate.content.parts) {
 									if (part.text !== undefined) {
 										hasContent = true;
 										const isThinking = isThinkingPart(part);
 										if (
 											!currentBlock ||
 											(isThinking && currentBlock.type !== "thinking") ||
 											(!isThinking && currentBlock.type !== "text")
 										) {
 											if (currentBlock) {
 												if (currentBlock.type === "text") {
 													stream.push({
 														type: "text_end",
 														contentIndex: blocks.length - 1,
 														content: currentBlock.text,
 														partial: output,
 													});
 												} else {
 													stream.push({
 														type: "thinking_end",
 														contentIndex: blockIndex(),
 														content: currentBlock.thinking,
 														partial: output,
 													});
 												}
 											}
 											if (isThinking) {
 												currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
 												output.content.push(currentBlock);
 												ensureStarted();
 												stream.push({
 													type: "thinking_start",
 													contentIndex: blockIndex(),
 													partial: output,
 												});
 											} else {
 												currentBlock = { type: "text", text: "" };
 												output.content.push(currentBlock);
 												ensureStarted();
 												stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
 											}
 										}
 										if (currentBlock.type === "thinking") {
 											currentBlock.thinking += part.text;
 											currentBlock.thinkingSignature = retainThoughtSignature(
 												currentBlock.thinkingSignature,
 												part.thoughtSignature,
 											);
 											stream.push({
 												type: "thinking_delta",
 												contentIndex: blockIndex(),
 												delta: part.text,
 												partial: output,
 											});
 										} else {
 											currentBlock.text += part.text;
 											currentBlock.textSignature = retainThoughtSignature(
 												currentBlock.textSignature,
 												part.thoughtSignature,
 											);
 											stream.push({
 												type: "text_delta",
 												contentIndex: blockIndex(),
 												delta: part.text,
 												partial: output,
 											});
 										}
 									}
 									if (part.functionCall) {
 										hasContent = true;
 										if (currentBlock) {
 											if (currentBlock.type === "text") {
 												stream.push({
 													type: "text_end",
-													contentIndex: blocks.length - 1,
+													contentIndex: blockIndex(),
 													content: currentBlock.text,
 													partial: output,
 												});
@ -506,143 +665,142 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 													partial: output,
 												});
 											}
 											currentBlock = null;
 										}
-										if (isThinking) {
+
-											currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
+										const providedId = part.functionCall.id;
-											output.content.push(currentBlock);
+										const needsNewId =
-											stream.push({ type: "thinking_start", contentIndex: blockIndex(), partial: output });
+											!providedId ||
-										} else {
+											output.content.some((b) => b.type === "toolCall" && b.id === providedId);
-											currentBlock = { type: "text", text: "" };
+										const toolCallId = needsNewId
-											output.content.push(currentBlock);
+											? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
-											stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
+											: providedId;
-										}
+
-									}
+										const toolCall: ToolCall = {
-									if (currentBlock.type === "thinking") {
+											type: "toolCall",
-										currentBlock.thinking += part.text;
+											id: toolCallId,
-										currentBlock.thinkingSignature = retainThoughtSignature(
+											name: part.functionCall.name || "",
-											currentBlock.thinkingSignature,
+											arguments: part.functionCall.args as Record<string, unknown>,
-											part.thoughtSignature,
+											...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
-										);
+										};
 										output.content.push(toolCall);
 										ensureStarted();
 										stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
 										stream.push({
-											type: "thinking_delta",
+											type: "toolcall_delta",
 											contentIndex: blockIndex(),
-											delta: part.text,
+											delta: JSON.stringify(toolCall.arguments),
 											partial: output,
 										});
 									} else {
 										currentBlock.text += part.text;
 										currentBlock.textSignature = retainThoughtSignature(
 											currentBlock.textSignature,
 											part.thoughtSignature,
 										);
 										stream.push({
-											type: "text_delta",
+											type: "toolcall_end",
 											contentIndex: blockIndex(),
-											delta: part.text,
+											toolCall,
 											partial: output,
 										});
 									}
 								}
 							}
-								if (part.functionCall) {
+							if (candidate?.finishReason) {
-									if (currentBlock) {
+								output.stopReason = mapStopReasonString(candidate.finishReason);
-										if (currentBlock.type === "text") {
+								if (output.content.some((b) => b.type === "toolCall")) {
-											stream.push({
+									output.stopReason = "toolUse";
 												type: "text_end",
 												contentIndex: blockIndex(),
 												content: currentBlock.text,
 												partial: output,
 											});
 										} else {
 											stream.push({
 												type: "thinking_end",
 												contentIndex: blockIndex(),
 												content: currentBlock.thinking,
 												partial: output,
 											});
 										}
 										currentBlock = null;
 									}
 									const providedId = part.functionCall.id;
 									const needsNewId =
 										!providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
 									const toolCallId = needsNewId
 										? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
 										: providedId;
 									const toolCall: ToolCall = {
 										type: "toolCall",
 										id: toolCallId,
 										name: part.functionCall.name || "",
 										arguments: part.functionCall.args as Record<string, unknown>,
 										...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
 									};
 									output.content.push(toolCall);
 									stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
 									stream.push({
 										type: "toolcall_delta",
 										contentIndex: blockIndex(),
 										delta: JSON.stringify(toolCall.arguments),
 										partial: output,
 									});
 									stream.push({ type: "toolcall_end", contentIndex: blockIndex(), toolCall, partial: output });
 								}
 							}
 						}
-						if (candidate?.finishReason) {
+							if (responseData.usageMetadata) {
-							output.stopReason = mapStopReasonString(candidate.finishReason);
+								// promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
-							if (output.content.some((b) => b.type === "toolCall")) {
+								const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
-								output.stopReason = "toolUse";
+								const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
-							}
+								output.usage = {
-						}
+									input: promptTokens - cacheReadTokens,
-
+									output:
-						if (responseData.usageMetadata) {
+										(responseData.usageMetadata.candidatesTokenCount || 0) +
-							// promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
+										(responseData.usageMetadata.thoughtsTokenCount || 0),
-							const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
+									cacheRead: cacheReadTokens,
 							const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
 							output.usage = {
 								input: promptTokens - cacheReadTokens,
 								output:
 									(responseData.usageMetadata.candidatesTokenCount || 0) +
 									(responseData.usageMetadata.thoughtsTokenCount || 0),
 								cacheRead: cacheReadTokens,
 								cacheWrite: 0,
 								totalTokens: responseData.usageMetadata.totalTokenCount || 0,
 								cost: {
 									input: 0,
 									output: 0,
 									cacheRead: 0,
 									cacheWrite: 0,
-									total: 0,
+									totalTokens: responseData.usageMetadata.totalTokenCount || 0,
-								},
+									cost: {
-							};
+										input: 0,
-							calculateCost(model, output.usage);
+										output: 0,
 										cacheRead: 0,
 										cacheWrite: 0,
 										total: 0,
 									},
 								};
 								calculateCost(model, output.usage);
 							}
 						}
 					}
 				} finally {
 					options?.signal?.removeEventListener("abort", abortHandler);
 				}
 				if (currentBlock) {
 					if (currentBlock.type === "text") {
 						stream.push({
 							type: "text_end",
 							contentIndex: blockIndex(),
 							content: currentBlock.text,
 							partial: output,
 						});
 					} else {
 						stream.push({
 							type: "thinking_end",
 							contentIndex: blockIndex(),
 							content: currentBlock.thinking,
 							partial: output,
 						});
 					}
 				}
 				return hasContent;
 			};
 			let receivedContent = false;
 			let currentResponse = response;
 			for (let emptyAttempt = 0; emptyAttempt <= MAX_EMPTY_STREAM_RETRIES; emptyAttempt++) {
 				if (options?.signal?.aborted) {
 					throw new Error("Request was aborted");
 				}
 				if (emptyAttempt > 0) {
 					const backoffMs = EMPTY_STREAM_BASE_DELAY_MS * 2 ** (emptyAttempt - 1);
 					await sleep(backoffMs, options?.signal);
 					if (!requestUrl) {
 						throw new Error("Missing request URL");
 					}
 					currentResponse = await fetch(requestUrl, {
 						method: "POST",
 						headers: requestHeaders,
 						body: requestBodyJson,
 						signal: options?.signal,
 					});
 					if (!currentResponse.ok) {
 						const retryErrorText = await currentResponse.text();
 						throw new Error(`Cloud Code Assist API error (${currentResponse.status}): ${retryErrorText}`);
 					}
 				}
 				const streamed = await streamResponse(currentResponse);
 				if (streamed) {
 					receivedContent = true;
 					break;
 				}
 				if (emptyAttempt < MAX_EMPTY_STREAM_RETRIES) {
 					resetOutput();
 				}
 			} finally {
 				options?.signal?.removeEventListener("abort", abortHandler);
 			}
-			if (currentBlock) {
+			if (!receivedContent) {
-				if (currentBlock.type === "text") {
+				throw new Error("Cloud Code Assist API returned an empty response");
 					stream.push({
 						type: "text_end",
 						contentIndex: blockIndex(),
 						content: currentBlock.text,
 						partial: output,
 					});
 				} else {
 					stream.push({
 						type: "thinking_end",
 						contentIndex: blockIndex(),
 						content: currentBlock.thinking,
 						partial: output,
 					});
 				}
 			}
 			if (options?.signal?.aborted) {
@ -671,7 +829,34 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
 	return stream;
 };
-function buildRequest(
+function deriveSessionId(context: Context): string | undefined {
 	// Derives a session id from the FIRST message whose role is "user":
 	// the message text is hashed with SHA-256 and the first 32 hex characters are returned.
 	// Returns undefined when there is no user message, or when that first user message
 	// has no non-whitespace text (later user messages are deliberately not consulted).
 	for (const message of context.messages) {
 		if (message.role !== "user") {
 			continue;
 		}
 		let text = "";
 		if (typeof message.content === "string") {
 			text = message.content;
 		} else if (Array.isArray(message.content)) {
 			// Only "text" items contribute; other item types (e.g. images) are ignored.
 			text = message.content
 				.filter((item): item is TextContent => item.type === "text")
 				.map((item) => item.text)
 				.join("\n");
 		}
 		if (!text || text.trim().length === 0) {
 			// First user message carries no usable text: give up rather than scan further.
 			return undefined;
 		}
 		// The untrimmed text is hashed, so surrounding whitespace affects the id.
 		const hash = createHash("sha256").update(text).digest("hex");
 		return hash.slice(0, 32);
 	}
 	return undefined;
 }
 export function buildRequest(
 	model: Model<"google-gemini-cli">,
 	context: Context,
 	projectId: string,
@ -706,6 +891,11 @@ function buildRequest(
 		contents,
 	};
 	const sessionId = deriveSessionId(context);
 	if (sessionId) {
 		request.sessionId = sessionId;
 	}
 	// System instruction must be object with parts, not plain string
 	if (context.systemPrompt) {
 		request.systemInstruction = {
--- a/packages/ai/test/google-gemini-cli-claude-thinking-header.test.ts
+++ b/packages/ai/test/google-gemini-cli-claude-thinking-header.test.ts
@ -0,0 +1,103 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { streamGoogleGeminiCli } from "../src/providers/google-gemini-cli.js";
 import type { Context, Model } from "../src/types.js";
 const originalFetch = global.fetch;
 const apiKey = JSON.stringify({ token: "token", projectId: "project" });
 /**
  * Builds a fresh 200 `text/event-stream` Response whose body is one SSE
  * `data:` event followed by a blank line. The event carries a single
  * candidate with the text part "Hello" and finishReason "STOP".
  * A new Response (and body stream) is created on every call.
  */
 const createSseResponse = () => {
 	const payload = {
 		response: {
 			candidates: [
 				{
 					content: { role: "model", parts: [{ text: "Hello" }] },
 					finishReason: "STOP",
 				},
 			],
 		},
 	};
 	const bytes = new TextEncoder().encode(`data: ${JSON.stringify(payload)}\n\n`);
 	const body = new ReadableStream<Uint8Array>({
 		start(controller) {
 			controller.enqueue(bytes);
 			controller.close();
 		},
 	});
 	const init = { status: 200, headers: { "content-type": "text/event-stream" } };
 	return new Response(body, init);
 };
 // Put back the real fetch and restore vitest mocks after every test.
 afterEach(() => {
 	global.fetch = originalFetch;
 	vi.restoreAllMocks();
 });
 // Verifies which requests carry the "anthropic-beta" header.
 // Headers are recorded by the fetch stub and asserted AFTER the stream has finished:
 // an expect() that throws inside the stubbed fetch runs inside the provider's request
 // try-block, where it may be treated as a request failure instead of failing the test.
 describe("google-gemini-cli Claude thinking header", () => {
 	const context: Context = {
 		messages: [{ role: "user", content: "Say hello", timestamp: Date.now() }],
 	};
 	/**
 	 * Runs the provider against a stubbed fetch, drains the event stream, and
 	 * returns the headers of every request that was sent.
 	 */
 	const collectRequestHeaders = async (model: Model<"google-gemini-cli">): Promise<Headers[]> => {
 		const fetchMock = vi.fn(async (_input: string | URL, _init?: RequestInit) => createSseResponse());
 		global.fetch = fetchMock as typeof fetch;
 		const stream = streamGoogleGeminiCli(model, context, { apiKey });
 		for await (const _event of stream) {
 			// exhaust stream
 		}
 		await stream.result();
 		return fetchMock.mock.calls.map(([, init]) => new Headers(init?.headers));
 	};
 	it("adds anthropic-beta for Claude thinking models", async () => {
 		const model: Model<"google-gemini-cli"> = {
 			id: "claude-opus-4-5-thinking",
 			name: "Claude Opus 4.5 Thinking",
 			api: "google-gemini-cli",
 			provider: "google-antigravity",
 			baseUrl: "https://cloudcode-pa.googleapis.com",
 			reasoning: true,
 			input: ["text"],
 			cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
 			contextWindow: 128000,
 			maxTokens: 8192,
 		};
 		const sentHeaders = await collectRequestHeaders(model);
 		// Guard against a vacuous pass when no request was issued at all.
 		expect(sentHeaders.length).toBeGreaterThan(0);
 		for (const headers of sentHeaders) {
 			expect(headers.get("anthropic-beta")).toBe("interleaved-thinking-2025-05-14");
 		}
 	});
 	it("does not add anthropic-beta for Gemini models", async () => {
 		const model: Model<"google-gemini-cli"> = {
 			id: "gemini-2.5-flash",
 			name: "Gemini 2.5 Flash",
 			api: "google-gemini-cli",
 			provider: "google-gemini-cli",
 			baseUrl: "https://cloudcode-pa.googleapis.com",
 			reasoning: false,
 			input: ["text"],
 			cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
 			contextWindow: 128000,
 			maxTokens: 8192,
 		};
 		const sentHeaders = await collectRequestHeaders(model);
 		// Guard against a vacuous pass when no request was issued at all.
 		expect(sentHeaders.length).toBeGreaterThan(0);
 		for (const headers of sentHeaders) {
 			expect(headers.has("anthropic-beta")).toBe(false);
 		}
 	});
 });
--- a/packages/ai/test/google-gemini-cli-empty-stream.test.ts
+++ b/packages/ai/test/google-gemini-cli-empty-stream.test.ts
@ -0,0 +1,108 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { streamGoogleGeminiCli } from "../src/providers/google-gemini-cli.js";
 import type { Context, Model } from "../src/types.js";
 const originalFetch = global.fetch;
 // Put back the real fetch and restore vitest mocks after every test.
 afterEach(() => {
 	global.fetch = originalFetch;
 	vi.restoreAllMocks();
 });
 // Checks that a 200 response with an empty body triggers a second request, and that the
 // consumer still observes exactly one "start" and one "done" event.
 describe("google-gemini-cli empty stream retry", () => {
 	it("retries empty SSE responses without duplicate start", async () => {
 		// Body that closes immediately without emitting any bytes.
 		const emptyStream = new ReadableStream<Uint8Array>({
 			start(controller) {
 				controller.close();
 			},
 		});
 		// One SSE event with a text part, a finish reason, and usage metadata.
 		const sse = `${[
 			`data: ${JSON.stringify({
 				response: {
 					candidates: [
 						{
 							content: { role: "model", parts: [{ text: "Hello" }] },
 							finishReason: "STOP",
 						},
 					],
 					usageMetadata: {
 						promptTokenCount: 1,
 						candidatesTokenCount: 1,
 						totalTokenCount: 2,
 					},
 				},
 			})}`,
 		].join("\n\n")}\n\n`;
 		const encoder = new TextEncoder();
 		const dataStream = new ReadableStream<Uint8Array>({
 			start(controller) {
 				controller.enqueue(encoder.encode(sse));
 				controller.close();
 			},
 		});
 		let callCount = 0;
 		// First call answers with the empty body; every later call answers with dataStream.
 		// dataStream is created once, so the test depends on fetch being called exactly twice
 		// (asserted at the end).
 		const fetchMock = vi.fn(async () => {
 			callCount += 1;
 			if (callCount === 1) {
 				return new Response(emptyStream, {
 					status: 200,
 					headers: { "content-type": "text/event-stream" },
 				});
 			}
 			return new Response(dataStream, {
 				status: 200,
 				headers: { "content-type": "text/event-stream" },
 			});
 		});
 		global.fetch = fetchMock as typeof fetch;
 		const model: Model<"google-gemini-cli"> = {
 			id: "gemini-2.5-flash",
 			name: "Gemini 2.5 Flash",
 			api: "google-gemini-cli",
 			provider: "google-gemini-cli",
 			baseUrl: "https://cloudcode-pa.googleapis.com",
 			reasoning: false,
 			input: ["text"],
 			cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
 			contextWindow: 128000,
 			maxTokens: 8192,
 		};
 		const context: Context = {
 			messages: [{ role: "user", content: "Say hello", timestamp: Date.now() }],
 		};
 		const stream = streamGoogleGeminiCli(model, context, {
 			apiKey: JSON.stringify({ token: "token", projectId: "project" }),
 		});
 		// Tally lifecycle events and accumulate streamed text while draining the stream.
 		let startCount = 0;
 		let doneCount = 0;
 		let text = "";
 		for await (const event of stream) {
 			if (event.type === "start") {
 				startCount += 1;
 			}
 			if (event.type === "done") {
 				doneCount += 1;
 			}
 			if (event.type === "text_delta") {
 				text += event.delta;
 			}
 		}
 		const result = await stream.result();
 		expect(text).toBe("Hello");
 		expect(result.stopReason).toBe("stop");
 		expect(startCount).toBe(1);
 		expect(doneCount).toBe(1);
 		// One request for the empty response plus one retry.
 		expect(fetchMock).toHaveBeenCalledTimes(2);
 	});
 });
--- a/packages/ai/test/google-gemini-cli-retry-delay.test.ts
+++ b/packages/ai/test/google-gemini-cli-retry-delay.test.ts
@ -0,0 +1,53 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
 import { extractRetryDelay } from "../src/providers/google-gemini-cli.js";
 // Header-based delay extraction. Every expected value is the header-derived delay plus the
 // 1000 ms buffer that extractRetryDelay adds to positive delays.
 describe("extractRetryDelay header parsing", () => {
 	// Each test freezes the clock; restore real timers afterwards.
 	afterEach(() => {
 		vi.useRealTimers();
 	});
 	it("prefers Retry-After seconds header", () => {
 		vi.useFakeTimers();
 		vi.setSystemTime(new Date("2025-01-01T00:00:00Z"));
 		const response = new Response("", { headers: { "Retry-After": "5" } });
 		// The body text suggests 1s, but the header (5s) must take precedence.
 		const delay = extractRetryDelay("Please retry in 1s", response);
 		expect(delay).toBe(6000);
 	});
 	it("parses Retry-After HTTP date header", () => {
 		vi.useFakeTimers();
 		const now = new Date("2025-01-01T00:00:00Z");
 		vi.setSystemTime(now);
 		// HTTP-date form: an absolute time 12s after the frozen clock.
 		const retryAt = new Date(now.getTime() + 12000).toUTCString();
 		const response = new Response("", { headers: { "Retry-After": retryAt } });
 		const delay = extractRetryDelay("", response);
 		expect(delay).toBe(13000);
 	});
 	it("parses x-ratelimit-reset header", () => {
 		vi.useFakeTimers();
 		const now = new Date("2025-01-01T00:00:00Z");
 		vi.setSystemTime(now);
 		// Header value is an absolute epoch timestamp in seconds, 20s after the frozen clock.
 		const resetAtMs = now.getTime() + 20000;
 		const resetSeconds = Math.floor(resetAtMs / 1000).toString();
 		const response = new Response("", { headers: { "x-ratelimit-reset": resetSeconds } });
 		const delay = extractRetryDelay("", response);
 		expect(delay).toBe(21000);
 	});
 	it("parses x-ratelimit-reset-after header", () => {
 		vi.useFakeTimers();
 		vi.setSystemTime(new Date("2025-01-01T00:00:00Z"));
 		// Header value is a relative number of seconds.
 		const response = new Response("", { headers: { "x-ratelimit-reset-after": "30" } });
 		const delay = extractRetryDelay("", response);
 		expect(delay).toBe(31000);
 	});
 });
--- a/packages/ai/test/google-gemini-cli-session-id.test.ts
+++ b/packages/ai/test/google-gemini-cli-session-id.test.ts
@ -0,0 +1,50 @@
 import { createHash } from "node:crypto";
 import { describe, expect, it } from "vitest";
 import { buildRequest } from "../src/providers/google-gemini-cli.js";
 import type { Context, Model } from "../src/types.js";
 // Model fixture shared by the sessionId tests below; passed as the first argument to buildRequest.
 const model: Model<"google-gemini-cli"> = {
 	id: "gemini-2.5-flash",
 	name: "Gemini 2.5 Flash",
 	api: "google-gemini-cli",
 	provider: "google-gemini-cli",
 	baseUrl: "https://cloudcode-pa.googleapis.com",
 	reasoning: false,
 	input: ["text"],
 	cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
 	contextWindow: 128000,
 	maxTokens: 8192,
 };
 // Pins how buildRequest populates request.sessionId from the conversation.
 describe("buildRequest sessionId", () => {
 	it("derives sessionId from the first user message", () => {
 		const context: Context = {
 			messages: [
 				{ role: "user", content: "First message", timestamp: Date.now() },
 				{ role: "user", content: "Second message", timestamp: Date.now() },
 			],
 		};
 		const result = buildRequest(model, context, "project-id");
 		// Expected id: first 32 hex characters of SHA-256 over the first user message's text.
 		const expected = createHash("sha256").update("First message").digest("hex").slice(0, 32);
 		expect(result.request.sessionId).toBe(expected);
 	});
 	it("omits sessionId when the first user message has no text", () => {
 		const context: Context = {
 			messages: [
 				{
 					role: "user",
 					// Image-only content: contributes no text to hash.
 					content: [{ type: "image", data: "Zm9v", mimeType: "image/png" }],
 					timestamp: Date.now(),
 				},
 				// A later text message must NOT be used as a fallback.
 				{ role: "user", content: "Later text", timestamp: Date.now() },
 			],
 		};
 		const result = buildRequest(model, context, "project-id");
 		expect(result.request.sessionId).toBeUndefined();
 	});
 });