Fix Gemini multimodal tool results causing flaky responses

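For Gemini 3, images in tool results are now nested inside functionResponse.parts, as the Gemini API docs specify. Older models do not support multimodal function responses, so their images are sent in a separate user message that follows the function response. See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal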
2026-04-21 02:04:32 +00:00 · 2025-12-21 02:41:27 +01:00 · 2025-12-21 02:41:27 +01:00 · bf51dd4126
commit bf51dd4126
parent ce950ae96e
2 changed files with 29 additions and 19 deletions
--- a/packages/ai/CHANGELOG.md
+++ b/packages/ai/CHANGELOG.md
@ -1,6 +1,6 @@
 # Changelog
-## [Unreleased]
+## [0.25.1] - 2025-12-21
 ### Added
@ -8,6 +8,8 @@
 ### Fixed
 - **Gemini multimodal tool results**: Fixed images in tool results causing flaky/broken responses with Gemini models. For Gemini 3, images are now nested inside `functionResponse.parts` per the [docs](https://ai.google.dev/gemini-api/docs/function-calling#multimodal). For older models (which don't support multimodal function responses), images are sent in a separate user message.
 - **Queued message steering**: When `getQueuedMessages` is provided, the agent loop now checks for queued user messages after each tool call and skips remaining tool calls in the current assistant message when a queued message arrives (emitting error tool results). 
 - **Double API version path in Google provider URL**: Fixed Gemini API calls returning 404 after baseUrl support was added. The SDK was appending its default apiVersion to baseUrl which already included the version path. ([#251](https://github.com/badlogic/pi-mono/pull/251) by [@shellfyred](https://github.com/shellfyred))
--- a/packages/ai/src/providers/google-shared.ts
+++ b/packages/ai/src/providers/google-shared.ts
@ -86,9 +86,6 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
 				parts,
 			});
 		} else if (msg.role === "toolResult") {
 			// Build parts array with functionResponse and/or images
 			const parts: Part[] = [];
 			// Extract text and image content
 			const textContent = msg.content.filter((c): c is TextContent => c.type === "text");
 			const textResult = textContent.map((c) => c.text).join("\n");
@ -96,40 +93,51 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
 				? msg.content.filter((c): c is ImageContent => c.type === "image")
 				: [];
 			// Always add functionResponse with text result (or placeholder if only images)
 			const hasText = textResult.length > 0;
 			const hasImages = imageContent.length > 0;
 			// Gemini 3 supports multimodal function responses with images nested inside functionResponse.parts
 			// See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal
 			// Older models don't support this, so we put images in a separate user message.
 			const supportsMultimodalFunctionResponse = model.id.includes("gemini-3");
 			// Use "output" key for success, "error" key for errors as per SDK documentation
 			const responseValue = hasText ? sanitizeSurrogates(textResult) : hasImages ? "(see attached image)" : "";
-			parts.push({
+			const imageParts: Part[] = imageContent.map((imageBlock) => ({
 				inlineData: {
 					mimeType: imageBlock.mimeType,
 					data: imageBlock.data,
 				},
 			}));
 			const functionResponsePart: Part = {
 				functionResponse: {
 					id: msg.toolCallId,
 					name: msg.toolName,
 					response: msg.isError ? { error: responseValue } : { output: responseValue },
 					// Nest images inside functionResponse.parts for Gemini 3
 					...(hasImages && supportsMultimodalFunctionResponse && { parts: imageParts }),
 				},
-			});
+			};
 			// Add any images as inlineData parts
 			for (const imageBlock of imageContent) {
 				parts.push({
 					inlineData: {
 						mimeType: imageBlock.mimeType,
 						data: imageBlock.data,
 					},
 				});
 			}
 			// Cloud Code Assist API requires all function responses to be in a single user turn.
 			// Check if the last content is already a user turn with function responses and merge.
 			const lastContent = contents[contents.length - 1];
 			if (lastContent?.role === "user" && lastContent.parts?.some((p) => p.functionResponse)) {
-				lastContent.parts.push(...parts);
+				lastContent.parts.push(functionResponsePart);
 			} else {
 				contents.push({
 					role: "user",
-					parts,
+					parts: [functionResponsePart],
 				});
 			}
 			// For older models, add images in a separate user message
 			if (hasImages && !supportsMultimodalFunctionResponse) {
 				contents.push({
 					role: "user",
 					parts: [{ text: "Tool result image:" }, ...imageParts],
 				});
 			}
 		}