mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-15 23:01:30 +00:00
Fix Gemini multimodal tool results causing flaky responses
For Gemini 3, images are now nested inside functionResponse.parts per the docs. For older models, images are sent in a separate user message. See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal
This commit is contained in:
parent
ce950ae96e
commit
bf51dd4126
2 changed files with 29 additions and 19 deletions
|
|
@ -1,6 +1,6 @@
|
|||
# Changelog
|
||||
|
||||
## [Unreleased]
|
||||
## [0.25.1] - 2025-12-21
|
||||
|
||||
### Added
|
||||
|
||||
|
|
@ -8,6 +8,8 @@
|
|||
|
||||
### Fixed
|
||||
|
||||
- **Gemini multimodal tool results**: Fixed images in tool results causing flaky/broken responses with Gemini models. For Gemini 3, images are now nested inside `functionResponse.parts` per the [docs](https://ai.google.dev/gemini-api/docs/function-calling#multimodal). For older models (which don't support multimodal function responses), images are sent in a separate user message.
|
||||
|
||||
- **Queued message steering**: When `getQueuedMessages` is provided, the agent loop now checks for queued user messages after each tool call and skips remaining tool calls in the current assistant message when a queued message arrives (emitting error tool results).
|
||||
|
||||
- **Double API version path in Google provider URL**: Fixed Gemini API calls returning 404 after baseUrl support was added. The SDK was appending its default apiVersion to baseUrl which already included the version path. ([#251](https://github.com/badlogic/pi-mono/pull/251) by [@shellfyred](https://github.com/shellfyred))
|
||||
|
|
|
|||
|
|
@ -86,9 +86,6 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
|
|||
parts,
|
||||
});
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Build parts array with functionResponse and/or images
|
||||
const parts: Part[] = [];
|
||||
|
||||
// Extract text and image content
|
||||
const textContent = msg.content.filter((c): c is TextContent => c.type === "text");
|
||||
const textResult = textContent.map((c) => c.text).join("\n");
|
||||
|
|
@ -96,40 +93,51 @@ export function convertMessages<T extends GoogleApiType>(model: Model<T>, contex
|
|||
? msg.content.filter((c): c is ImageContent => c.type === "image")
|
||||
: [];
|
||||
|
||||
// Always add functionResponse with text result (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
const hasImages = imageContent.length > 0;
|
||||
|
||||
// Gemini 3 supports multimodal function responses with images nested inside functionResponse.parts
|
||||
// See: https://ai.google.dev/gemini-api/docs/function-calling#multimodal
|
||||
// Older models don't support this, so we put images in a separate user message.
|
||||
const supportsMultimodalFunctionResponse = model.id.includes("gemini-3");
|
||||
|
||||
// Use "output" key for success, "error" key for errors as per SDK documentation
|
||||
const responseValue = hasText ? sanitizeSurrogates(textResult) : hasImages ? "(see attached image)" : "";
|
||||
|
||||
parts.push({
|
||||
const imageParts: Part[] = imageContent.map((imageBlock) => ({
|
||||
inlineData: {
|
||||
mimeType: imageBlock.mimeType,
|
||||
data: imageBlock.data,
|
||||
},
|
||||
}));
|
||||
|
||||
const functionResponsePart: Part = {
|
||||
functionResponse: {
|
||||
id: msg.toolCallId,
|
||||
name: msg.toolName,
|
||||
response: msg.isError ? { error: responseValue } : { output: responseValue },
|
||||
// Nest images inside functionResponse.parts for Gemini 3
|
||||
...(hasImages && supportsMultimodalFunctionResponse && { parts: imageParts }),
|
||||
},
|
||||
});
|
||||
|
||||
// Add any images as inlineData parts
|
||||
for (const imageBlock of imageContent) {
|
||||
parts.push({
|
||||
inlineData: {
|
||||
mimeType: imageBlock.mimeType,
|
||||
data: imageBlock.data,
|
||||
},
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Cloud Code Assist API requires all function responses to be in a single user turn.
|
||||
// Check if the last content is already a user turn with function responses and merge.
|
||||
const lastContent = contents[contents.length - 1];
|
||||
if (lastContent?.role === "user" && lastContent.parts?.some((p) => p.functionResponse)) {
|
||||
lastContent.parts.push(...parts);
|
||||
lastContent.parts.push(functionResponsePart);
|
||||
} else {
|
||||
contents.push({
|
||||
role: "user",
|
||||
parts,
|
||||
parts: [functionResponsePart],
|
||||
});
|
||||
}
|
||||
|
||||
// For older models, add images in a separate user message
|
||||
if (hasImages && !supportsMultimodalFunctionResponse) {
|
||||
contents.push({
|
||||
role: "user",
|
||||
parts: [{ text: "Tool result image:" }, ...imageParts],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue