Add image support in tool results across all providers

Tool results now use content blocks and can include both text and images.
All providers (Anthropic, Google, OpenAI Completions, OpenAI Responses)
correctly pass images from tool results to LLMs.

- Update ToolResultMessage type to use content blocks
- Add placeholder text for image-only tool results in Google/Anthropic
- OpenAI providers send tool result + follow-up user message with images
- Fix Anthropic JSON parsing for empty tool arguments
- Add comprehensive tests for image-only and text+image tool results
- Update README with tool result content blocks API
This commit is contained in:
Mario Zechner 2025-11-12 10:45:56 +01:00
parent 9dac37d836
commit 84dcab219b
37 changed files with 720 additions and 544 deletions

View file

@ -60,14 +60,18 @@ async function calculateTest<TApi extends Api>(model: Model<TApi>, options: Opti
break;
case "tool_execution_end":
if (!event.isError && typeof event.result === "object" && event.result.output) {
if (!event.isError && typeof event.result === "object" && event.result.content) {
const textOutput = event.result.content
.filter((c: any) => c.type === "text")
.map((c: any) => c.text)
.join("\n");
toolCallCount++;
// Extract number from output like "expression = result"
const match = event.result.output.match(/=\s*([\d.]+)/);
const match = textOutput.match(/=\s*([\d.]+)/);
if (match) {
const value = parseFloat(match[1]);
toolResults.push(value);
console.log(`Tool ${toolCallCount}: ${event.result.output}`);
console.log(`Tool ${toolCallCount}: ${textOutput}`);
}
}
break;

View file

@ -55,7 +55,7 @@ const providerContexts = {
role: "toolResult" as const,
toolCallId: "toolu_01abc123",
toolName: "get_weather",
output: "Weather in Tokyo: 18°C, partly cloudy",
content: [{ type: "text", text: "Weather in Tokyo: 18°C, partly cloudy" }],
isError: false,
timestamp: Date.now(),
} satisfies ToolResultMessage,
@ -106,7 +106,7 @@ const providerContexts = {
role: "toolResult" as const,
toolCallId: "call_gemini_123",
toolName: "get_weather",
output: "Weather in Berlin: 22°C, sunny",
content: [{ type: "text", text: "Weather in Berlin: 22°C, sunny" }],
isError: false,
timestamp: Date.now(),
} satisfies ToolResultMessage,
@ -156,7 +156,7 @@ const providerContexts = {
role: "toolResult" as const,
toolCallId: "call_abc123",
toolName: "get_weather",
output: "Weather in London: 15°C, rainy",
content: [{ type: "text", text: "Weather in London: 15°C, rainy" }],
isError: false,
timestamp: Date.now(),
} satisfies ToolResultMessage,
@ -208,7 +208,7 @@ const providerContexts = {
role: "toolResult" as const,
toolCallId: "call_789_item_012", // Match the updated ID format
toolName: "get_weather",
output: "Weather in Sydney: 25°C, clear",
content: [{ type: "text", text: "Weather in Sydney: 25°C, clear" }],
isError: false,
timestamp: Date.now(),
} satisfies ToolResultMessage,

View file

@ -0,0 +1,263 @@
import { readFileSync } from "node:fs";
import { join } from "node:path";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import type { Api, Context, Model, Tool, ToolResultMessage } from "../src/index.js";
import { complete, getModel } from "../src/index.js";
import type { OptionsForApi } from "../src/types.js";
/**
 * Round-trips a tool call whose result consists of a single image block and no text.
 *
 * Steps:
 * 1. Ask the model to call the `get_circle` tool.
 * 2. Answer that call with a base64-encoded PNG read from `data/red-circle.png`.
 * 3. Request a second completion and assert that the first text block of the
 *    reply mentions both "red" and "circle".
 *
 * Logs a message and returns without asserting anything when `model.input`
 * does not list "image".
 *
 * @param model - Model under test.
 * @param options - Optional provider options forwarded to both `complete` calls.
 */
async function handleToolWithImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
	const supportsImages = model.input.includes("image");
	if (!supportsImages) {
		console.log(`Skipping tool image result test - model ${model.id} doesn't support images`);
		return;
	}

	// Load the fixture image as a base64 string.
	const base64Image = readFileSync(join(__dirname, "data", "red-circle.png")).toString("base64");

	// The tool takes no arguments; its result (supplied below) is image-only.
	const emptyParams = Type.Object({});
	const circleTool: Tool<typeof emptyParams> = {
		name: "get_circle",
		description: "Returns a circle image for visualization",
		parameters: emptyParams,
	};

	const context: Context = {
		systemPrompt: "You are a helpful assistant that uses tools when asked.",
		messages: [
			{
				role: "user",
				content: "Use the get_circle tool to get an image, and describe what you see, shapes, colors, etc.",
				timestamp: Date.now(),
			},
		],
		tools: [circleTool],
	};

	// Turn 1: the model is expected to stop in order to call the tool.
	const toolRequest = await complete(model, context, options);
	expect(toolRequest.stopReason).toBe("toolUse");

	const call = toolRequest.content.find((block) => block.type === "toolCall");
	expect(call).toBeTruthy();
	if (call === undefined || call.type !== "toolCall") {
		throw new Error("Expected tool call");
	}
	expect(call.name).toBe("get_circle");

	// Tool result with ONLY an image block (no text).
	const imageOnlyResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: call.id,
		toolName: call.name,
		content: [
			{
				type: "image",
				data: base64Image,
				mimeType: "image/png",
			},
		],
		isError: false,
		timestamp: Date.now(),
	};

	// Extend the conversation with the assistant's tool call followed by its result.
	context.messages.push(toolRequest, imageOnlyResult);

	// Turn 2: the model should finish normally and describe the image.
	const followUp = await complete(model, context, options);
	expect(followUp.stopReason).toBe("stop");
	expect(followUp.errorMessage).toBeFalsy();

	const firstText = followUp.content.find((block) => block.type === "text");
	expect(firstText).toBeTruthy();
	if (firstText === undefined || firstText.type !== "text") {
		return;
	}

	// The fixture shows a red circle, so both words should appear in the description.
	const description = firstText.text.toLowerCase();
	for (const keyword of ["red", "circle"]) {
		expect(description).toContain(keyword);
	}
}
/**
 * Round-trips a tool call whose result mixes a text block with an image block.
 *
 * Steps:
 * 1. Ask the model to call the `get_circle_with_description` tool.
 * 2. Answer that call with a text block (mentioning a 100 pixel diameter) plus a
 *    base64-encoded PNG read from `data/red-circle.png`.
 * 3. Request a second completion and assert that the first text block of the
 *    reply reflects the text (diameter / 100 / pixel) and the image
 *    ("red" and "circle").
 *
 * Logs a message and returns without asserting anything when `model.input`
 * does not list "image".
 *
 * @param model - Model under test.
 * @param options - Optional provider options forwarded to both `complete` calls.
 */
async function handleToolWithTextAndImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
	const supportsImages = model.input.includes("image");
	if (!supportsImages) {
		console.log(`Skipping tool text+image result test - model ${model.id} doesn't support images`);
		return;
	}

	// Load the fixture image as a base64 string.
	const base64Image = readFileSync(join(__dirname, "data", "red-circle.png")).toString("base64");

	// The tool takes no arguments; its result (supplied below) carries text and an image.
	const emptyParams = Type.Object({});
	const describedCircleTool: Tool<typeof emptyParams> = {
		name: "get_circle_with_description",
		description: "Returns a circle image with a text description",
		parameters: emptyParams,
	};

	const context: Context = {
		systemPrompt: "You are a helpful assistant that uses tools when asked.",
		messages: [
			{
				role: "user",
				content: "Use the get_circle_with_description tool and tell me what you learned.",
				timestamp: Date.now(),
			},
		],
		tools: [describedCircleTool],
	};

	// Turn 1: the model is expected to stop in order to call the tool.
	const toolRequest = await complete(model, context, options);
	expect(toolRequest.stopReason).toBe("toolUse");

	const call = toolRequest.content.find((block) => block.type === "toolCall");
	expect(call).toBeTruthy();
	if (call === undefined || call.type !== "toolCall") {
		throw new Error("Expected tool call");
	}
	expect(call.name).toBe("get_circle_with_description");

	// Tool result with BOTH a text block and an image block.
	const mixedResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: call.id,
		toolName: call.name,
		content: [
			{
				type: "text",
				text: "This is a geometric shape with specific properties: it has a diameter of 100 pixels.",
			},
			{
				type: "image",
				data: base64Image,
				mimeType: "image/png",
			},
		],
		isError: false,
		timestamp: Date.now(),
	};

	// Extend the conversation with the assistant's tool call followed by its result.
	context.messages.push(toolRequest, mixedResult);

	// Turn 2: the model should finish normally and report on both parts of the result.
	const followUp = await complete(model, context, options);
	expect(followUp.stopReason).toBe("stop");
	expect(followUp.errorMessage).toBeFalsy();

	const firstText = followUp.content.find((block) => block.type === "text");
	expect(firstText).toBeTruthy();
	if (firstText === undefined || firstText.type !== "text") {
		return;
	}

	const summary = firstText.text.toLowerCase();
	// Evidence that the textual part of the tool result was seen.
	expect(summary.match(/diameter|100|pixel/)).toBeTruthy();
	// Evidence that the image part of the tool result was seen.
	for (const keyword of ["red", "circle"]) {
		expect(summary).toContain(keyword);
	}
}
// Provider suites for tool results that contain images. Each suite is skipped
// unless the environment variable holding its credential is set.
describe("Tool Results with Images", () => {
	/**
	 * Registers the image-only and text+image cases for a single provider.
	 *
	 * @param title - Suite title shown by the test runner.
	 * @param credential - Value of the provider's credential env var; the suite is skipped when it is empty or unset.
	 * @param createModel - Produces the model under test; invoked inside the suite callback.
	 */
	const registerProviderSuite = <TApi extends Api>(
		title: string,
		credential: string | undefined,
		createModel: () => Model<TApi>,
	): void => {
		describe.skipIf(!credential)(title, () => {
			const model = createModel();

			it("should handle tool result with only image", async () => {
				await handleToolWithImageResult(model);
			});

			it("should handle tool result with text and image", async () => {
				await handleToolWithTextAndImageResult(model);
			});
		});
	};

	registerProviderSuite("Google Provider (gemini-2.5-flash)", process.env.GEMINI_API_KEY, () =>
		getModel("google", "gemini-2.5-flash"),
	);

	// Same OpenAI model entry, but with `api` overridden to "openai-completions".
	registerProviderSuite(
		"OpenAI Completions Provider (gpt-4o-mini)",
		process.env.OPENAI_API_KEY,
		(): Model<"openai-completions"> => ({ ...getModel("openai", "gpt-4o-mini"), api: "openai-completions" }),
	);

	registerProviderSuite("OpenAI Responses Provider (gpt-5-mini)", process.env.OPENAI_API_KEY, () =>
		getModel("openai", "gpt-5-mini"),
	);

	registerProviderSuite("Anthropic Provider (claude-haiku-4-5)", process.env.ANTHROPIC_API_KEY, () =>
		getModel("anthropic", "claude-haiku-4-5"),
	);

	registerProviderSuite("Anthropic Provider (claude-sonnet-4-5)", process.env.ANTHROPIC_OAUTH_TOKEN, () =>
		getModel("anthropic", "claude-sonnet-4-5"),
	);

	registerProviderSuite("OpenRouter Provider (glm-4.5v)", process.env.OPENROUTER_API_KEY, () =>
		getModel("openrouter", "z-ai/glm-4.5v"),
	);
});

View file

@ -305,7 +305,7 @@ async function multiTurn<TApi extends Api>(model: Model<TApi>, options?: Options
role: "toolResult",
toolCallId: block.id,
toolName: block.name,
output: `${result}`,
content: [{ type: "text", text: `${result}` }],
isError: false,
timestamp: Date.now(),
});

View file

@ -27,7 +27,7 @@ describe("Tool Validation with TypeBox and AJV", () => {
parameters: testSchema,
execute: async (_toolCallId, args) => {
return {
output: `Processed: ${args.name}, ${args.age}, ${args.email}`,
content: [{ type: "text", text: `Processed: ${args.name}, ${args.age}, ${args.email}` }],
details: undefined,
};
},
@ -130,7 +130,11 @@ describe("Tool Validation with TypeBox and AJV", () => {
const result = await testTool.execute("test-id", validInput as TestParams);
expect(result.output).toBe("Processed: John Doe, 30, john@example.com");
const textOutput = result.content
.filter((c: any) => c.type === "text")
.map((c: any) => c.text)
.join("\n");
expect(textOutput).toBe("Processed: John Doe, 30, john@example.com");
expect(result.details).toBeUndefined();
});
});

View file

@ -62,7 +62,10 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
role: "toolResult",
toolCallId: "test_1",
toolName: "test_tool",
output: `Test with emoji 🙈 and other characters:
content: [
{
type: "text",
text: `Test with emoji 🙈 and other characters:
- Monkey emoji: 🙈
- Thumbs up: 👍
- Heart:
@ -73,6 +76,8 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
- Chinese: 你好
- Mathematical symbols:
- Special quotes: "curly" 'quotes'`,
},
],
isError: false,
timestamp: Date.now(),
};
@ -141,7 +146,10 @@ async function testRealWorldLinkedInData<TApi extends Api>(llm: Model<TApi>, opt
role: "toolResult",
toolCallId: "linkedin_1",
toolName: "linkedin_skill",
output: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
content: [
{
type: "text",
text: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
Unanswered Comments: 2
=> {
@ -156,6 +164,8 @@ Unanswered Comments: 2
}
]
}`,
},
],
isError: false,
timestamp: Date.now(),
};
@ -226,7 +236,7 @@ async function testUnpairedHighSurrogate<TApi extends Api>(llm: Model<TApi>, opt
role: "toolResult",
toolCallId: "test_2",
toolName: "test_tool",
output: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized`,
content: [{ type: "text", text: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized` }],
isError: false,
timestamp: Date.now(),
};