mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-15 14:03:49 +00:00
Improve Gemini CLI provider retries and headers (#670)
Improve Gemini CLI provider retries and headers - Add Antigravity endpoint fallback (tries daily sandbox then prod when baseUrl is unset) - Parse retry delays from headers (Retry-After, x-ratelimit-reset, x-ratelimit-reset-after) before body parsing - Derive stable sessionId from first user message for cache affinity - Retry empty SSE streams with backoff without duplicate start/done events - Add anthropic-beta header for Claude thinking models only
This commit is contained in:
parent
9e4ae98358
commit
ff15414258
5 changed files with 693 additions and 189 deletions
|
|
@ -4,6 +4,7 @@
|
|||
* Uses the Cloud Code Assist API endpoint to access Gemini and Claude models.
|
||||
*/
|
||||
|
||||
import { createHash } from "node:crypto";
|
||||
import type { Content, ThinkingConfig } from "@google/genai";
|
||||
import { calculateCost } from "../models.js";
|
||||
import type {
|
||||
|
|
@ -54,6 +55,8 @@ export interface GoogleGeminiCliOptions extends StreamOptions {
|
|||
}
|
||||
|
||||
const DEFAULT_ENDPOINT = "https://cloudcode-pa.googleapis.com";
|
||||
const ANTIGRAVITY_DAILY_ENDPOINT = "https://daily-cloudcode-pa.sandbox.googleapis.com";
|
||||
const ANTIGRAVITY_ENDPOINT_FALLBACKS = [ANTIGRAVITY_DAILY_ENDPOINT, DEFAULT_ENDPOINT] as const;
|
||||
// Headers for Gemini CLI (prod endpoint)
|
||||
const GEMINI_CLI_HEADERS = {
|
||||
"User-Agent": "google-cloud-sdk vscode_cloudshelleditor/0.1",
|
||||
|
|
@ -163,16 +166,66 @@ let toolCallCounter = 0;
|
|||
// Retry configuration
|
||||
const MAX_RETRIES = 3;
|
||||
const BASE_DELAY_MS = 1000;
|
||||
const MAX_EMPTY_STREAM_RETRIES = 2;
|
||||
const EMPTY_STREAM_BASE_DELAY_MS = 500;
|
||||
const CLAUDE_THINKING_BETA_HEADER = "interleaved-thinking-2025-05-14";
|
||||
|
||||
/**
|
||||
* Extract retry delay from Gemini error response (in milliseconds).
|
||||
* Parses patterns like:
|
||||
* Checks headers first (Retry-After, x-ratelimit-reset, x-ratelimit-reset-after),
|
||||
* then parses body patterns like:
|
||||
* - "Your quota will reset after 39s"
|
||||
* - "Your quota will reset after 18h31m10s"
|
||||
* - "Please retry in Xs" or "Please retry in Xms"
|
||||
* - "retryDelay": "34.074824224s" (JSON field)
|
||||
*/
|
||||
function extractRetryDelay(errorText: string): number | undefined {
|
||||
export function extractRetryDelay(errorText: string, response?: Response | Headers): number | undefined {
|
||||
const normalizeDelay = (ms: number): number | undefined => (ms > 0 ? Math.ceil(ms + 1000) : undefined);
|
||||
|
||||
const headers = response instanceof Headers ? response : response?.headers;
|
||||
if (headers) {
|
||||
const retryAfter = headers.get("retry-after");
|
||||
if (retryAfter) {
|
||||
const retryAfterSeconds = Number(retryAfter);
|
||||
if (Number.isFinite(retryAfterSeconds)) {
|
||||
const delay = normalizeDelay(retryAfterSeconds * 1000);
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
const retryAfterDate = new Date(retryAfter);
|
||||
const retryAfterMs = retryAfterDate.getTime();
|
||||
if (!Number.isNaN(retryAfterMs)) {
|
||||
const delay = normalizeDelay(retryAfterMs - Date.now());
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const rateLimitReset = headers.get("x-ratelimit-reset");
|
||||
if (rateLimitReset) {
|
||||
const resetSeconds = Number.parseInt(rateLimitReset, 10);
|
||||
if (!Number.isNaN(resetSeconds)) {
|
||||
const delay = normalizeDelay(resetSeconds * 1000 - Date.now());
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const rateLimitResetAfter = headers.get("x-ratelimit-reset-after");
|
||||
if (rateLimitResetAfter) {
|
||||
const resetAfterSeconds = Number(rateLimitResetAfter);
|
||||
if (Number.isFinite(resetAfterSeconds)) {
|
||||
const delay = normalizeDelay(resetAfterSeconds * 1000);
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pattern 1: "Your quota will reset after ..." (formats: "18h31m10s", "10m15s", "6s", "39s")
|
||||
const durationMatch = errorText.match(/reset after (?:(\d+)h)?(?:(\d+)m)?(\d+(?:\.\d+)?)s/i);
|
||||
if (durationMatch) {
|
||||
|
|
@ -181,8 +234,9 @@ function extractRetryDelay(errorText: string): number | undefined {
|
|||
const seconds = parseFloat(durationMatch[3]);
|
||||
if (!Number.isNaN(seconds)) {
|
||||
const totalMs = ((hours * 60 + minutes) * 60 + seconds) * 1000;
|
||||
if (totalMs > 0) {
|
||||
return Math.ceil(totalMs + 1000); // Add 1s buffer
|
||||
const delay = normalizeDelay(totalMs);
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -193,7 +247,10 @@ function extractRetryDelay(errorText: string): number | undefined {
|
|||
const value = parseFloat(retryInMatch[1]);
|
||||
if (!Number.isNaN(value) && value > 0) {
|
||||
const ms = retryInMatch[2].toLowerCase() === "ms" ? value : value * 1000;
|
||||
return Math.ceil(ms + 1000);
|
||||
const delay = normalizeDelay(ms);
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -203,13 +260,21 @@ function extractRetryDelay(errorText: string): number | undefined {
|
|||
const value = parseFloat(retryDelayMatch[1]);
|
||||
if (!Number.isNaN(value) && value > 0) {
|
||||
const ms = retryDelayMatch[2].toLowerCase() === "ms" ? value : value * 1000;
|
||||
return Math.ceil(ms + 1000);
|
||||
const delay = normalizeDelay(ms);
|
||||
if (delay !== undefined) {
|
||||
return delay;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function isClaudeThinkingModel(modelId: string): boolean {
|
||||
const normalized = modelId.toLowerCase();
|
||||
return normalized.includes("claude") && normalized.includes("thinking");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if an error is retryable (rate limit, server error, network error, etc.)
|
||||
*/
|
||||
|
|
@ -258,6 +323,7 @@ interface CloudCodeAssistRequest {
|
|||
model: string;
|
||||
request: {
|
||||
contents: Content[];
|
||||
sessionId?: string;
|
||||
systemInstruction?: { role?: string; parts: { text: string }[] };
|
||||
generationConfig?: {
|
||||
maxOutputTokens?: number;
|
||||
|
|
@ -355,17 +421,26 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
throw new Error("Missing token or projectId in Google Cloud credentials. Use /login to re-authenticate.");
|
||||
}
|
||||
|
||||
const endpoint = model.baseUrl || DEFAULT_ENDPOINT;
|
||||
const url = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;
|
||||
const isAntigravity = model.provider === "google-antigravity";
|
||||
const baseUrl = model.baseUrl?.trim();
|
||||
const endpoints = baseUrl ? [baseUrl] : isAntigravity ? ANTIGRAVITY_ENDPOINT_FALLBACKS : [DEFAULT_ENDPOINT];
|
||||
|
||||
// Use Antigravity headers for sandbox endpoint, otherwise Gemini CLI headers
|
||||
const isAntigravity = endpoint.includes("sandbox.googleapis.com");
|
||||
const requestBody = buildRequest(model, context, projectId, options, isAntigravity);
|
||||
const headers = isAntigravity ? ANTIGRAVITY_HEADERS : GEMINI_CLI_HEADERS;
|
||||
|
||||
const requestHeaders = {
|
||||
Authorization: `Bearer ${accessToken}`,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "text/event-stream",
|
||||
...headers,
|
||||
...(isClaudeThinkingModel(model.id) ? { "anthropic-beta": CLAUDE_THINKING_BETA_HEADER } : {}),
|
||||
};
|
||||
const requestBodyJson = JSON.stringify(requestBody);
|
||||
|
||||
// Fetch with retry logic for rate limits and transient errors
|
||||
let response: Response | undefined;
|
||||
let lastError: Error | undefined;
|
||||
let requestUrl: string | undefined;
|
||||
|
||||
for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
|
||||
if (options?.signal?.aborted) {
|
||||
|
|
@ -373,15 +448,12 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
}
|
||||
|
||||
try {
|
||||
response = await fetch(url, {
|
||||
const endpoint = endpoints[Math.min(attempt, endpoints.length - 1)];
|
||||
requestUrl = `${endpoint}/v1internal:streamGenerateContent?alt=sse`;
|
||||
response = await fetch(requestUrl, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${accessToken}`,
|
||||
"Content-Type": "application/json",
|
||||
Accept: "text/event-stream",
|
||||
...headers,
|
||||
},
|
||||
body: JSON.stringify(requestBody),
|
||||
headers: requestHeaders,
|
||||
body: requestBodyJson,
|
||||
signal: options?.signal,
|
||||
});
|
||||
|
||||
|
|
@ -394,7 +466,7 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
// Check if retryable
|
||||
if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
|
||||
// Use server-provided delay or exponential backoff
|
||||
const serverDelay = extractRetryDelay(errorText);
|
||||
const serverDelay = extractRetryDelay(errorText, response);
|
||||
const delayMs = serverDelay ?? BASE_DELAY_MS * 2 ** attempt;
|
||||
await sleep(delayMs, options?.signal);
|
||||
continue;
|
||||
|
|
@ -428,73 +500,160 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
throw lastError ?? new Error("Failed to get response after retries");
|
||||
}
|
||||
|
||||
if (!response.body) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
stream.push({ type: "start", partial: output });
|
||||
|
||||
let currentBlock: TextContent | ThinkingContent | null = null;
|
||||
const blocks = output.content;
|
||||
const blockIndex = () => blocks.length - 1;
|
||||
|
||||
// Read SSE stream
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = "";
|
||||
|
||||
// Set up abort handler to cancel reader when signal fires
|
||||
const abortHandler = () => {
|
||||
void reader.cancel().catch(() => {});
|
||||
let started = false;
|
||||
const ensureStarted = () => {
|
||||
if (!started) {
|
||||
stream.push({ type: "start", partial: output });
|
||||
started = true;
|
||||
}
|
||||
};
|
||||
options?.signal?.addEventListener("abort", abortHandler);
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
// Check abort signal before each read
|
||||
if (options?.signal?.aborted) {
|
||||
throw new Error("Request was aborted");
|
||||
}
|
||||
const resetOutput = () => {
|
||||
output.content = [];
|
||||
output.usage = {
|
||||
input: 0,
|
||||
output: 0,
|
||||
cacheRead: 0,
|
||||
cacheWrite: 0,
|
||||
totalTokens: 0,
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
|
||||
};
|
||||
output.stopReason = "stop";
|
||||
output.errorMessage = undefined;
|
||||
output.timestamp = Date.now();
|
||||
started = false;
|
||||
};
|
||||
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
const streamResponse = async (activeResponse: Response): Promise<boolean> => {
|
||||
if (!activeResponse.body) {
|
||||
throw new Error("No response body");
|
||||
}
|
||||
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
const lines = buffer.split("\n");
|
||||
buffer = lines.pop() || "";
|
||||
let hasContent = false;
|
||||
let currentBlock: TextContent | ThinkingContent | null = null;
|
||||
const blocks = output.content;
|
||||
const blockIndex = () => blocks.length - 1;
|
||||
|
||||
for (const line of lines) {
|
||||
if (!line.startsWith("data:")) continue;
|
||||
// Read SSE stream
|
||||
const reader = activeResponse.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = "";
|
||||
|
||||
const jsonStr = line.slice(5).trim();
|
||||
if (!jsonStr) continue;
|
||||
// Set up abort handler to cancel reader when signal fires
|
||||
const abortHandler = () => {
|
||||
void reader.cancel().catch(() => {});
|
||||
};
|
||||
options?.signal?.addEventListener("abort", abortHandler);
|
||||
|
||||
let chunk: CloudCodeAssistResponseChunk;
|
||||
try {
|
||||
chunk = JSON.parse(jsonStr);
|
||||
} catch {
|
||||
continue;
|
||||
try {
|
||||
while (true) {
|
||||
// Check abort signal before each read
|
||||
if (options?.signal?.aborted) {
|
||||
throw new Error("Request was aborted");
|
||||
}
|
||||
|
||||
// Unwrap the response
|
||||
const responseData = chunk.response;
|
||||
if (!responseData) continue;
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
|
||||
const candidate = responseData.candidates?.[0];
|
||||
if (candidate?.content?.parts) {
|
||||
for (const part of candidate.content.parts) {
|
||||
if (part.text !== undefined) {
|
||||
const isThinking = isThinkingPart(part);
|
||||
if (
|
||||
!currentBlock ||
|
||||
(isThinking && currentBlock.type !== "thinking") ||
|
||||
(!isThinking && currentBlock.type !== "text")
|
||||
) {
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
const lines = buffer.split("\n");
|
||||
buffer = lines.pop() || "";
|
||||
|
||||
for (const line of lines) {
|
||||
if (!line.startsWith("data:")) continue;
|
||||
|
||||
const jsonStr = line.slice(5).trim();
|
||||
if (!jsonStr) continue;
|
||||
|
||||
let chunk: CloudCodeAssistResponseChunk;
|
||||
try {
|
||||
chunk = JSON.parse(jsonStr);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Unwrap the response
|
||||
const responseData = chunk.response;
|
||||
if (!responseData) continue;
|
||||
|
||||
const candidate = responseData.candidates?.[0];
|
||||
if (candidate?.content?.parts) {
|
||||
for (const part of candidate.content.parts) {
|
||||
if (part.text !== undefined) {
|
||||
hasContent = true;
|
||||
const isThinking = isThinkingPart(part);
|
||||
if (
|
||||
!currentBlock ||
|
||||
(isThinking && currentBlock.type !== "thinking") ||
|
||||
(!isThinking && currentBlock.type !== "text")
|
||||
) {
|
||||
if (currentBlock) {
|
||||
if (currentBlock.type === "text") {
|
||||
stream.push({
|
||||
type: "text_end",
|
||||
contentIndex: blocks.length - 1,
|
||||
content: currentBlock.text,
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
stream.push({
|
||||
type: "thinking_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.thinking,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
}
|
||||
if (isThinking) {
|
||||
currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
|
||||
output.content.push(currentBlock);
|
||||
ensureStarted();
|
||||
stream.push({
|
||||
type: "thinking_start",
|
||||
contentIndex: blockIndex(),
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
currentBlock = { type: "text", text: "" };
|
||||
output.content.push(currentBlock);
|
||||
ensureStarted();
|
||||
stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
|
||||
}
|
||||
}
|
||||
if (currentBlock.type === "thinking") {
|
||||
currentBlock.thinking += part.text;
|
||||
currentBlock.thinkingSignature = retainThoughtSignature(
|
||||
currentBlock.thinkingSignature,
|
||||
part.thoughtSignature,
|
||||
);
|
||||
stream.push({
|
||||
type: "thinking_delta",
|
||||
contentIndex: blockIndex(),
|
||||
delta: part.text,
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
currentBlock.text += part.text;
|
||||
currentBlock.textSignature = retainThoughtSignature(
|
||||
currentBlock.textSignature,
|
||||
part.thoughtSignature,
|
||||
);
|
||||
stream.push({
|
||||
type: "text_delta",
|
||||
contentIndex: blockIndex(),
|
||||
delta: part.text,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (part.functionCall) {
|
||||
hasContent = true;
|
||||
if (currentBlock) {
|
||||
if (currentBlock.type === "text") {
|
||||
stream.push({
|
||||
type: "text_end",
|
||||
contentIndex: blocks.length - 1,
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.text,
|
||||
partial: output,
|
||||
});
|
||||
|
|
@ -506,143 +665,142 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
partial: output,
|
||||
});
|
||||
}
|
||||
currentBlock = null;
|
||||
}
|
||||
if (isThinking) {
|
||||
currentBlock = { type: "thinking", thinking: "", thinkingSignature: undefined };
|
||||
output.content.push(currentBlock);
|
||||
stream.push({ type: "thinking_start", contentIndex: blockIndex(), partial: output });
|
||||
} else {
|
||||
currentBlock = { type: "text", text: "" };
|
||||
output.content.push(currentBlock);
|
||||
stream.push({ type: "text_start", contentIndex: blockIndex(), partial: output });
|
||||
}
|
||||
}
|
||||
if (currentBlock.type === "thinking") {
|
||||
currentBlock.thinking += part.text;
|
||||
currentBlock.thinkingSignature = retainThoughtSignature(
|
||||
currentBlock.thinkingSignature,
|
||||
part.thoughtSignature,
|
||||
);
|
||||
|
||||
const providedId = part.functionCall.id;
|
||||
const needsNewId =
|
||||
!providedId ||
|
||||
output.content.some((b) => b.type === "toolCall" && b.id === providedId);
|
||||
const toolCallId = needsNewId
|
||||
? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
|
||||
: providedId;
|
||||
|
||||
const toolCall: ToolCall = {
|
||||
type: "toolCall",
|
||||
id: toolCallId,
|
||||
name: part.functionCall.name || "",
|
||||
arguments: part.functionCall.args as Record<string, unknown>,
|
||||
...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
|
||||
};
|
||||
|
||||
output.content.push(toolCall);
|
||||
ensureStarted();
|
||||
stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
|
||||
stream.push({
|
||||
type: "thinking_delta",
|
||||
type: "toolcall_delta",
|
||||
contentIndex: blockIndex(),
|
||||
delta: part.text,
|
||||
delta: JSON.stringify(toolCall.arguments),
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
currentBlock.text += part.text;
|
||||
currentBlock.textSignature = retainThoughtSignature(
|
||||
currentBlock.textSignature,
|
||||
part.thoughtSignature,
|
||||
);
|
||||
stream.push({
|
||||
type: "text_delta",
|
||||
type: "toolcall_end",
|
||||
contentIndex: blockIndex(),
|
||||
delta: part.text,
|
||||
toolCall,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (part.functionCall) {
|
||||
if (currentBlock) {
|
||||
if (currentBlock.type === "text") {
|
||||
stream.push({
|
||||
type: "text_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.text,
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
stream.push({
|
||||
type: "thinking_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.thinking,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
currentBlock = null;
|
||||
}
|
||||
|
||||
const providedId = part.functionCall.id;
|
||||
const needsNewId =
|
||||
!providedId || output.content.some((b) => b.type === "toolCall" && b.id === providedId);
|
||||
const toolCallId = needsNewId
|
||||
? `${part.functionCall.name}_${Date.now()}_${++toolCallCounter}`
|
||||
: providedId;
|
||||
|
||||
const toolCall: ToolCall = {
|
||||
type: "toolCall",
|
||||
id: toolCallId,
|
||||
name: part.functionCall.name || "",
|
||||
arguments: part.functionCall.args as Record<string, unknown>,
|
||||
...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
|
||||
};
|
||||
|
||||
output.content.push(toolCall);
|
||||
stream.push({ type: "toolcall_start", contentIndex: blockIndex(), partial: output });
|
||||
stream.push({
|
||||
type: "toolcall_delta",
|
||||
contentIndex: blockIndex(),
|
||||
delta: JSON.stringify(toolCall.arguments),
|
||||
partial: output,
|
||||
});
|
||||
stream.push({ type: "toolcall_end", contentIndex: blockIndex(), toolCall, partial: output });
|
||||
if (candidate?.finishReason) {
|
||||
output.stopReason = mapStopReasonString(candidate.finishReason);
|
||||
if (output.content.some((b) => b.type === "toolCall")) {
|
||||
output.stopReason = "toolUse";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (candidate?.finishReason) {
|
||||
output.stopReason = mapStopReasonString(candidate.finishReason);
|
||||
if (output.content.some((b) => b.type === "toolCall")) {
|
||||
output.stopReason = "toolUse";
|
||||
}
|
||||
}
|
||||
|
||||
if (responseData.usageMetadata) {
|
||||
// promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
|
||||
const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
|
||||
const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
|
||||
output.usage = {
|
||||
input: promptTokens - cacheReadTokens,
|
||||
output:
|
||||
(responseData.usageMetadata.candidatesTokenCount || 0) +
|
||||
(responseData.usageMetadata.thoughtsTokenCount || 0),
|
||||
cacheRead: cacheReadTokens,
|
||||
cacheWrite: 0,
|
||||
totalTokens: responseData.usageMetadata.totalTokenCount || 0,
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
cacheRead: 0,
|
||||
if (responseData.usageMetadata) {
|
||||
// promptTokenCount includes cachedContentTokenCount, so subtract to get fresh input
|
||||
const promptTokens = responseData.usageMetadata.promptTokenCount || 0;
|
||||
const cacheReadTokens = responseData.usageMetadata.cachedContentTokenCount || 0;
|
||||
output.usage = {
|
||||
input: promptTokens - cacheReadTokens,
|
||||
output:
|
||||
(responseData.usageMetadata.candidatesTokenCount || 0) +
|
||||
(responseData.usageMetadata.thoughtsTokenCount || 0),
|
||||
cacheRead: cacheReadTokens,
|
||||
cacheWrite: 0,
|
||||
total: 0,
|
||||
},
|
||||
};
|
||||
calculateCost(model, output.usage);
|
||||
totalTokens: responseData.usageMetadata.totalTokenCount || 0,
|
||||
cost: {
|
||||
input: 0,
|
||||
output: 0,
|
||||
cacheRead: 0,
|
||||
cacheWrite: 0,
|
||||
total: 0,
|
||||
},
|
||||
};
|
||||
calculateCost(model, output.usage);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
options?.signal?.removeEventListener("abort", abortHandler);
|
||||
}
|
||||
|
||||
if (currentBlock) {
|
||||
if (currentBlock.type === "text") {
|
||||
stream.push({
|
||||
type: "text_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.text,
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
stream.push({
|
||||
type: "thinking_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.thinking,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return hasContent;
|
||||
};
|
||||
|
||||
let receivedContent = false;
|
||||
let currentResponse = response;
|
||||
|
||||
for (let emptyAttempt = 0; emptyAttempt <= MAX_EMPTY_STREAM_RETRIES; emptyAttempt++) {
|
||||
if (options?.signal?.aborted) {
|
||||
throw new Error("Request was aborted");
|
||||
}
|
||||
|
||||
if (emptyAttempt > 0) {
|
||||
const backoffMs = EMPTY_STREAM_BASE_DELAY_MS * 2 ** (emptyAttempt - 1);
|
||||
await sleep(backoffMs, options?.signal);
|
||||
|
||||
if (!requestUrl) {
|
||||
throw new Error("Missing request URL");
|
||||
}
|
||||
|
||||
currentResponse = await fetch(requestUrl, {
|
||||
method: "POST",
|
||||
headers: requestHeaders,
|
||||
body: requestBodyJson,
|
||||
signal: options?.signal,
|
||||
});
|
||||
|
||||
if (!currentResponse.ok) {
|
||||
const retryErrorText = await currentResponse.text();
|
||||
throw new Error(`Cloud Code Assist API error (${currentResponse.status}): ${retryErrorText}`);
|
||||
}
|
||||
}
|
||||
|
||||
const streamed = await streamResponse(currentResponse);
|
||||
if (streamed) {
|
||||
receivedContent = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (emptyAttempt < MAX_EMPTY_STREAM_RETRIES) {
|
||||
resetOutput();
|
||||
}
|
||||
} finally {
|
||||
options?.signal?.removeEventListener("abort", abortHandler);
|
||||
}
|
||||
|
||||
if (currentBlock) {
|
||||
if (currentBlock.type === "text") {
|
||||
stream.push({
|
||||
type: "text_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.text,
|
||||
partial: output,
|
||||
});
|
||||
} else {
|
||||
stream.push({
|
||||
type: "thinking_end",
|
||||
contentIndex: blockIndex(),
|
||||
content: currentBlock.thinking,
|
||||
partial: output,
|
||||
});
|
||||
}
|
||||
if (!receivedContent) {
|
||||
throw new Error("Cloud Code Assist API returned an empty response");
|
||||
}
|
||||
|
||||
if (options?.signal?.aborted) {
|
||||
|
|
@ -671,7 +829,34 @@ export const streamGoogleGeminiCli: StreamFunction<"google-gemini-cli"> = (
|
|||
return stream;
|
||||
};
|
||||
|
||||
function buildRequest(
|
||||
function deriveSessionId(context: Context): string | undefined {
|
||||
for (const message of context.messages) {
|
||||
if (message.role !== "user") {
|
||||
continue;
|
||||
}
|
||||
|
||||
let text = "";
|
||||
if (typeof message.content === "string") {
|
||||
text = message.content;
|
||||
} else if (Array.isArray(message.content)) {
|
||||
text = message.content
|
||||
.filter((item): item is TextContent => item.type === "text")
|
||||
.map((item) => item.text)
|
||||
.join("\n");
|
||||
}
|
||||
|
||||
if (!text || text.trim().length === 0) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const hash = createHash("sha256").update(text).digest("hex");
|
||||
return hash.slice(0, 32);
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
export function buildRequest(
|
||||
model: Model<"google-gemini-cli">,
|
||||
context: Context,
|
||||
projectId: string,
|
||||
|
|
@ -706,6 +891,11 @@ function buildRequest(
|
|||
contents,
|
||||
};
|
||||
|
||||
const sessionId = deriveSessionId(context);
|
||||
if (sessionId) {
|
||||
request.sessionId = sessionId;
|
||||
}
|
||||
|
||||
// System instruction must be object with parts, not plain string
|
||||
if (context.systemPrompt) {
|
||||
request.systemInstruction = {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,103 @@
|
|||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { streamGoogleGeminiCli } from "../src/providers/google-gemini-cli.js";
|
||||
import type { Context, Model } from "../src/types.js";
|
||||
|
||||
const originalFetch = global.fetch;
|
||||
const apiKey = JSON.stringify({ token: "token", projectId: "project" });
|
||||
|
||||
const createSseResponse = () => {
|
||||
const sse = `${[
|
||||
`data: ${JSON.stringify({
|
||||
response: {
|
||||
candidates: [
|
||||
{
|
||||
content: { role: "model", parts: [{ text: "Hello" }] },
|
||||
finishReason: "STOP",
|
||||
},
|
||||
],
|
||||
},
|
||||
})}`,
|
||||
].join("\n\n")}\n\n`;
|
||||
|
||||
const encoder = new TextEncoder();
|
||||
const stream = new ReadableStream<Uint8Array>({
|
||||
start(controller) {
|
||||
controller.enqueue(encoder.encode(sse));
|
||||
controller.close();
|
||||
},
|
||||
});
|
||||
|
||||
return new Response(stream, {
|
||||
status: 200,
|
||||
headers: { "content-type": "text/event-stream" },
|
||||
});
|
||||
};
|
||||
|
||||
afterEach(() => {
|
||||
global.fetch = originalFetch;
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
describe("google-gemini-cli Claude thinking header", () => {
|
||||
const context: Context = {
|
||||
messages: [{ role: "user", content: "Say hello", timestamp: Date.now() }],
|
||||
};
|
||||
|
||||
it("adds anthropic-beta for Claude thinking models", async () => {
|
||||
const fetchMock = vi.fn(async (_input: string | URL, init?: RequestInit) => {
|
||||
const headers = new Headers(init?.headers);
|
||||
expect(headers.get("anthropic-beta")).toBe("interleaved-thinking-2025-05-14");
|
||||
return createSseResponse();
|
||||
});
|
||||
|
||||
global.fetch = fetchMock as typeof fetch;
|
||||
|
||||
const model: Model<"google-gemini-cli"> = {
|
||||
id: "claude-opus-4-5-thinking",
|
||||
name: "Claude Opus 4.5 Thinking",
|
||||
api: "google-gemini-cli",
|
||||
provider: "google-antigravity",
|
||||
baseUrl: "https://cloudcode-pa.googleapis.com",
|
||||
reasoning: true,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 8192,
|
||||
};
|
||||
|
||||
const stream = streamGoogleGeminiCli(model, context, { apiKey });
|
||||
for await (const _event of stream) {
|
||||
// exhaust stream
|
||||
}
|
||||
await stream.result();
|
||||
});
|
||||
|
||||
it("does not add anthropic-beta for Gemini models", async () => {
|
||||
const fetchMock = vi.fn(async (_input: string | URL, init?: RequestInit) => {
|
||||
const headers = new Headers(init?.headers);
|
||||
expect(headers.has("anthropic-beta")).toBe(false);
|
||||
return createSseResponse();
|
||||
});
|
||||
|
||||
global.fetch = fetchMock as typeof fetch;
|
||||
|
||||
const model: Model<"google-gemini-cli"> = {
|
||||
id: "gemini-2.5-flash",
|
||||
name: "Gemini 2.5 Flash",
|
||||
api: "google-gemini-cli",
|
||||
provider: "google-gemini-cli",
|
||||
baseUrl: "https://cloudcode-pa.googleapis.com",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 8192,
|
||||
};
|
||||
|
||||
const stream = streamGoogleGeminiCli(model, context, { apiKey });
|
||||
for await (const _event of stream) {
|
||||
// exhaust stream
|
||||
}
|
||||
await stream.result();
|
||||
});
|
||||
});
|
||||
108
packages/ai/test/google-gemini-cli-empty-stream.test.ts
Normal file
108
packages/ai/test/google-gemini-cli-empty-stream.test.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { streamGoogleGeminiCli } from "../src/providers/google-gemini-cli.js";
|
||||
import type { Context, Model } from "../src/types.js";
|
||||
|
||||
const originalFetch = global.fetch;
|
||||
|
||||
afterEach(() => {
|
||||
global.fetch = originalFetch;
|
||||
vi.restoreAllMocks();
|
||||
});
|
||||
|
||||
describe("google-gemini-cli empty stream retry", () => {
|
||||
it("retries empty SSE responses without duplicate start", async () => {
|
||||
const emptyStream = new ReadableStream<Uint8Array>({
|
||||
start(controller) {
|
||||
controller.close();
|
||||
},
|
||||
});
|
||||
|
||||
const sse = `${[
|
||||
`data: ${JSON.stringify({
|
||||
response: {
|
||||
candidates: [
|
||||
{
|
||||
content: { role: "model", parts: [{ text: "Hello" }] },
|
||||
finishReason: "STOP",
|
||||
},
|
||||
],
|
||||
usageMetadata: {
|
||||
promptTokenCount: 1,
|
||||
candidatesTokenCount: 1,
|
||||
totalTokenCount: 2,
|
||||
},
|
||||
},
|
||||
})}`,
|
||||
].join("\n\n")}\n\n`;
|
||||
|
||||
const encoder = new TextEncoder();
|
||||
const dataStream = new ReadableStream<Uint8Array>({
|
||||
start(controller) {
|
||||
controller.enqueue(encoder.encode(sse));
|
||||
controller.close();
|
||||
},
|
||||
});
|
||||
|
||||
let callCount = 0;
|
||||
const fetchMock = vi.fn(async () => {
|
||||
callCount += 1;
|
||||
if (callCount === 1) {
|
||||
return new Response(emptyStream, {
|
||||
status: 200,
|
||||
headers: { "content-type": "text/event-stream" },
|
||||
});
|
||||
}
|
||||
return new Response(dataStream, {
|
||||
status: 200,
|
||||
headers: { "content-type": "text/event-stream" },
|
||||
});
|
||||
});
|
||||
|
||||
global.fetch = fetchMock as typeof fetch;
|
||||
|
||||
const model: Model<"google-gemini-cli"> = {
|
||||
id: "gemini-2.5-flash",
|
||||
name: "Gemini 2.5 Flash",
|
||||
api: "google-gemini-cli",
|
||||
provider: "google-gemini-cli",
|
||||
baseUrl: "https://cloudcode-pa.googleapis.com",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 8192,
|
||||
};
|
||||
|
||||
const context: Context = {
|
||||
messages: [{ role: "user", content: "Say hello", timestamp: Date.now() }],
|
||||
};
|
||||
|
||||
const stream = streamGoogleGeminiCli(model, context, {
|
||||
apiKey: JSON.stringify({ token: "token", projectId: "project" }),
|
||||
});
|
||||
|
||||
let startCount = 0;
|
||||
let doneCount = 0;
|
||||
let text = "";
|
||||
|
||||
for await (const event of stream) {
|
||||
if (event.type === "start") {
|
||||
startCount += 1;
|
||||
}
|
||||
if (event.type === "done") {
|
||||
doneCount += 1;
|
||||
}
|
||||
if (event.type === "text_delta") {
|
||||
text += event.delta;
|
||||
}
|
||||
}
|
||||
|
||||
const result = await stream.result();
|
||||
|
||||
expect(text).toBe("Hello");
|
||||
expect(result.stopReason).toBe("stop");
|
||||
expect(startCount).toBe(1);
|
||||
expect(doneCount).toBe(1);
|
||||
expect(fetchMock).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
});
|
||||
packages/ai/test/google-gemini-cli-retry-delay.test.ts — new file, 53 lines
@ -0,0 +1,53 @@
|
|||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { extractRetryDelay } from "../src/providers/google-gemini-cli.js";
|
||||
|
||||
describe("extractRetryDelay header parsing", () => {
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("prefers Retry-After seconds header", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2025-01-01T00:00:00Z"));
|
||||
|
||||
const response = new Response("", { headers: { "Retry-After": "5" } });
|
||||
const delay = extractRetryDelay("Please retry in 1s", response);
|
||||
|
||||
expect(delay).toBe(6000);
|
||||
});
|
||||
|
||||
it("parses Retry-After HTTP date header", () => {
|
||||
vi.useFakeTimers();
|
||||
const now = new Date("2025-01-01T00:00:00Z");
|
||||
vi.setSystemTime(now);
|
||||
|
||||
const retryAt = new Date(now.getTime() + 12000).toUTCString();
|
||||
const response = new Response("", { headers: { "Retry-After": retryAt } });
|
||||
const delay = extractRetryDelay("", response);
|
||||
|
||||
expect(delay).toBe(13000);
|
||||
});
|
||||
|
||||
it("parses x-ratelimit-reset header", () => {
|
||||
vi.useFakeTimers();
|
||||
const now = new Date("2025-01-01T00:00:00Z");
|
||||
vi.setSystemTime(now);
|
||||
|
||||
const resetAtMs = now.getTime() + 20000;
|
||||
const resetSeconds = Math.floor(resetAtMs / 1000).toString();
|
||||
const response = new Response("", { headers: { "x-ratelimit-reset": resetSeconds } });
|
||||
const delay = extractRetryDelay("", response);
|
||||
|
||||
expect(delay).toBe(21000);
|
||||
});
|
||||
|
||||
it("parses x-ratelimit-reset-after header", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2025-01-01T00:00:00Z"));
|
||||
|
||||
const response = new Response("", { headers: { "x-ratelimit-reset-after": "30" } });
|
||||
const delay = extractRetryDelay("", response);
|
||||
|
||||
expect(delay).toBe(31000);
|
||||
});
|
||||
});
|
||||
packages/ai/test/google-gemini-cli-session-id.test.ts — new file, 50 lines
@ -0,0 +1,50 @@
|
|||
import { createHash } from "node:crypto";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { buildRequest } from "../src/providers/google-gemini-cli.js";
|
||||
import type { Context, Model } from "../src/types.js";
|
||||
|
||||
const model: Model<"google-gemini-cli"> = {
|
||||
id: "gemini-2.5-flash",
|
||||
name: "Gemini 2.5 Flash",
|
||||
api: "google-gemini-cli",
|
||||
provider: "google-gemini-cli",
|
||||
baseUrl: "https://cloudcode-pa.googleapis.com",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 8192,
|
||||
};
|
||||
|
||||
describe("buildRequest sessionId", () => {
|
||||
it("derives sessionId from the first user message", () => {
|
||||
const context: Context = {
|
||||
messages: [
|
||||
{ role: "user", content: "First message", timestamp: Date.now() },
|
||||
{ role: "user", content: "Second message", timestamp: Date.now() },
|
||||
],
|
||||
};
|
||||
|
||||
const result = buildRequest(model, context, "project-id");
|
||||
const expected = createHash("sha256").update("First message").digest("hex").slice(0, 32);
|
||||
|
||||
expect(result.request.sessionId).toBe(expected);
|
||||
});
|
||||
|
||||
it("omits sessionId when the first user message has no text", () => {
|
||||
const context: Context = {
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
content: [{ type: "image", data: "Zm9v", mimeType: "image/png" }],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
{ role: "user", content: "Later text", timestamp: Date.now() },
|
||||
],
|
||||
};
|
||||
|
||||
const result = buildRequest(model, context, "project-id");
|
||||
|
||||
expect(result.request.sessionId).toBeUndefined();
|
||||
});
|
||||
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue