Fix Gemini multimodal tool results causing flaky responses

For Gemini 3, images are now nested inside functionResponse.parts per the docs. Older models don't support multimodal function responses, so for them images are sent in a separate user message after the function response. See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal
2026-04-15 23:01:30 +00:00 · 2025-12-21 02:41:27 +01:00 · 2025-12-21 02:41:27 +01:00 · bf51dd4126
commit bf51dd4126
parent ce950ae96e
2 changed files with 29 additions and 19 deletions
--- a/packages/ai/CHANGELOG.md
+++ b/packages/ai/CHANGELOG.md
@ -1,6 +1,6 @@
 # Changelog

-## [Unreleased]
+## [0.25.1] - 2025-12-21

 ### Added

@ -8,6 +8,8 @@

 ### Fixed

+- **Gemini multimodal tool results**: Fixed images in tool results causing flaky/broken responses with Gemini models. For Gemini 3, images are now nested inside `functionResponse.parts` per the [docs](https://ai.google.dev/gemini-api/docs/function-calling#multimodal). For older models (which don't support multimodal function responses), images are sent in a separate user message.
+
 - **Queued message steering**: When `getQueuedMessages` is provided, the agent loop now checks for queued user messages after each tool call and skips remaining tool calls in the current assistant message when a queued message arrives (emitting error tool results). 

 - **Double API version path in Google provider URL**: Fixed Gemini API calls returning 404 after baseUrl support was added. The SDK was appending its default apiVersion to baseUrl which already included the version path. ([#251](https://github.com/badlogic/pi-mono/pull/251) by [@shellfyred](https://github.com/shellfyred))
--- a/packages/ai/src/providers/google-shared.ts
+++ b/packages/ai/src/providers/google-shared.ts
@ -86,9 +86,6 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
 				parts,
 			});
 		} else if (msg.role === "toolResult") {
-			// Build parts array with functionResponse and/or images
-			const parts: Part[] = [];
-
 			// Extract text and image content
 			const textContent = msg.content.filter((c): c is TextContent => c.type === "text");
 			const textResult = textContent.map((c) => c.text).join("\n");
@ -96,40 +93,51 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
 				? msg.content.filter((c): c is ImageContent => c.type === "image")
 				: [];

-			// Always add functionResponse with text result (or placeholder if only images)
 			const hasText = textResult.length > 0;
 			const hasImages = imageContent.length > 0;

+			// Gemini 3 supports multimodal function responses with images nested inside functionResponse.parts
+			// See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal
+			// Older models don't support this, so we put images in a separate user message.
+			const supportsMultimodalFunctionResponse = model.id.includes("gemini-3");
+
 			// Use "output" key for success, "error" key for errors as per SDK documentation
 			const responseValue = hasText ? sanitizeSurrogates(textResult) : hasImages ? "(see attached image)" : "";

-			parts.push({
+			const imageParts: Part[] = imageContent.map((imageBlock) => ({
+				inlineData: {
+					mimeType: imageBlock.mimeType,
+					data: imageBlock.data,
+				},
+			}));
+
+			const functionResponsePart: Part = {
 				functionResponse: {
 					id: msg.toolCallId,
 					name: msg.toolName,
 					response: msg.isError ? { error: responseValue } : { output: responseValue },
+					// Nest images inside functionResponse.parts for Gemini 3
+					...(hasImages && supportsMultimodalFunctionResponse && { parts: imageParts }),
 				},
-			});
-
-			// Add any images as inlineData parts
-			for (const imageBlock of imageContent) {
-				parts.push({
-					inlineData: {
-						mimeType: imageBlock.mimeType,
-						data: imageBlock.data,
-					},
-				});
-			}
+			};

 			// Cloud Code Assist API requires all function responses to be in a single user turn.
 			// Check if the last content is already a user turn with function responses and merge.
 			const lastContent = contents[contents.length - 1];
 			if (lastContent?.role === "user" && lastContent.parts?.some((p) => p.functionResponse)) {
-				lastContent.parts.push(...parts);
+				lastContent.parts.push(functionResponsePart);
 			} else {
 				contents.push({
 					role: "user",
-					parts,
+					parts: [functionResponsePart],
+				});
+			}
+
+			// For older models, add images in a separate user message
+			if (hasImages && !supportsMultimodalFunctionResponse) {
+				contents.push({
+					role: "user",
+					parts: [{ text: "Tool result image:" }, ...imageParts],
 				});
 			}
 		}