Add image support in tool results across all providers

Tool results now use content blocks and can include both text and images. All providers (Anthropic, Google, OpenAI Completions, OpenAI Responses) correctly pass images from tool results to LLMs.

- Update ToolResultMessage type to use content blocks
- Add placeholder text for image-only tool results in Google/Anthropic
- OpenAI providers send tool result + follow-up user message with images
- Fix Anthropic JSON parsing for empty tool arguments
- Add comprehensive tests for image-only and text+image tool results
- Update README with tool result content blocks API
2026-04-19 23:01:32 +00:00 · 2025-11-12 10:45:56 +01:00 · 2025-11-12 10:45:56 +01:00 · 84dcab219b
commit 84dcab219b
parent 9dac37d836
37 changed files with 720 additions and 544 deletions
--- a/packages/ai/test/agent.test.ts
+++ b/packages/ai/test/agent.test.ts
@ -60,14 +60,18 @@ async function calculateTest<TApi extends Api>(model: Model<TApi>, options: Opti
 				break;

 			case "tool_execution_end":
-				if (!event.isError && typeof event.result === "object" && event.result.output) {
+				if (!event.isError && typeof event.result === "object" && event.result.content) {
+					const textOutput = event.result.content
+						.filter((c: any) => c.type === "text")
+						.map((c: any) => c.text)
+						.join("\n");
 					toolCallCount++;
 					// Extract number from output like "expression = result"
-					const match = event.result.output.match(/=\s*([\d.]+)/);
+					const match = textOutput.match(/=\s*([\d.]+)/);
 					if (match) {
 						const value = parseFloat(match[1]);
 						toolResults.push(value);
-						console.log(`Tool ${toolCallCount}: ${event.result.output}`);
+						console.log(`Tool ${toolCallCount}: ${textOutput}`);
 					}
 				}
 				break;
--- a/packages/ai/test/handoff.test.ts
+++ b/packages/ai/test/handoff.test.ts
@ -55,7 +55,7 @@ const providerContexts = {
 			role: "toolResult" as const,
 			toolCallId: "toolu_01abc123",
 			toolName: "get_weather",
-			output: "Weather in Tokyo: 18°C, partly cloudy",
+			content: [{ type: "text", text: "Weather in Tokyo: 18°C, partly cloudy" }],
 			isError: false,
 			timestamp: Date.now(),
 		} satisfies ToolResultMessage,
@ -106,7 +106,7 @@ const providerContexts = {
 			role: "toolResult" as const,
 			toolCallId: "call_gemini_123",
 			toolName: "get_weather",
-			output: "Weather in Berlin: 22°C, sunny",
+			content: [{ type: "text", text: "Weather in Berlin: 22°C, sunny" }],
 			isError: false,
 			timestamp: Date.now(),
 		} satisfies ToolResultMessage,
@ -156,7 +156,7 @@ const providerContexts = {
 			role: "toolResult" as const,
 			toolCallId: "call_abc123",
 			toolName: "get_weather",
-			output: "Weather in London: 15°C, rainy",
+			content: [{ type: "text", text: "Weather in London: 15°C, rainy" }],
 			isError: false,
 			timestamp: Date.now(),
 		} satisfies ToolResultMessage,
@ -208,7 +208,7 @@ const providerContexts = {
 			role: "toolResult" as const,
 			toolCallId: "call_789_item_012", // Match the updated ID format
 			toolName: "get_weather",
-			output: "Weather in Sydney: 25°C, clear",
+			content: [{ type: "text", text: "Weather in Sydney: 25°C, clear" }],
 			isError: false,
 			timestamp: Date.now(),
 		} satisfies ToolResultMessage,
--- a/packages/ai/test/image-tool-result.test.ts
+++ b/packages/ai/test/image-tool-result.test.ts
@ -0,0 +1,263 @@
+import { readFileSync } from "node:fs";
+import { join } from "node:path";
+import { Type } from "@sinclair/typebox";
+import { describe, expect, it } from "vitest";
+import type { Api, Context, Model, Tool, ToolResultMessage } from "../src/index.js";
+import { complete, getModel } from "../src/index.js";
+import type { OptionsForApi } from "../src/types.js";
+
+/**
+ * Two-turn check that a tool result containing ONLY an image block reaches the LLM.
+ * Turn 1: the model must stop with "toolUse" and call the parameterless `get_circle` tool.
+ * Turn 2: after an image-only tool result is appended, the model must stop with "stop",
+ * report no errorMessage, and mention "red" and "circle" in its first text block.
+ * Logs a message and returns without asserting when `model.input` lacks "image".
+ */
+async function handleToolWithImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
+	// Text-only models cannot exercise this path: log and return without asserting
+	if (!model.input.includes("image")) {
+		console.log(`Skipping tool image result test - model ${model.id} doesn't support images`);
+		return;
+	}
+
+	// Load the fixture data/red-circle.png (resolved relative to this file) as base64
+	const imagePath = join(__dirname, "data", "red-circle.png");
+	const imageBuffer = readFileSync(imagePath);
+	const base64Image = imageBuffer.toString("base64");
+
+	// Declare a tool with an empty parameter object; its result below holds only an image (no text)
+	const getImageSchema = Type.Object({});
+	const getImageTool: Tool<typeof getImageSchema> = {
+		name: "get_circle",
+		description: "Returns a circle image for visualization",
+		parameters: getImageSchema,
+	};
+
+	const context: Context = {
+		systemPrompt: "You are a helpful assistant that uses tools when asked.",
+		messages: [
+			{
+				role: "user",
+				content: "Use the get_circle tool to get an image, and describe what you see, shapes, colors, etc.",
+				timestamp: Date.now(),
+			},
+		],
+		tools: [getImageTool],
+	};
+
+	// First request - the LLM is expected to stop with a tool call
+	const firstResponse = await complete(model, context, options);
+	expect(firstResponse.stopReason).toBe("toolUse");
+
+	// Find the first toolCall block; the explicit throw also narrows the type for the lines below
+	const toolCall = firstResponse.content.find((b) => b.type === "toolCall");
+	expect(toolCall).toBeTruthy();
+	if (!toolCall || toolCall.type !== "toolCall") {
+		throw new Error("Expected tool call");
+	}
+	expect(toolCall.name).toBe("get_circle");
+
+	// Append the assistant message (which contains the tool call) to the conversation
+	context.messages.push(firstResponse);
+
+	// Build the tool result with ONLY an image block (no text), echoing the call's id and name
+	const toolResult: ToolResultMessage = {
+		role: "toolResult",
+		toolCallId: toolCall.id,
+		toolName: toolCall.name,
+		content: [
+			{
+				type: "image",
+				data: base64Image,
+				mimeType: "image/png",
+			},
+		],
+		isError: false,
+		timestamp: Date.now(),
+	};
+
+	context.messages.push(toolResult);
+
+	// Second request - the LLM should finish normally and describe the image from the tool result
+	const secondResponse = await complete(model, context, options);
+	expect(secondResponse.stopReason).toBe("stop");
+	expect(secondResponse.errorMessage).toBeFalsy();
+
+	// Only the FIRST text block of the reply is inspected
+	const textContent = secondResponse.content.find((b) => b.type === "text");
+	expect(textContent).toBeTruthy();
+	if (textContent && textContent.type === "text") {
+		const lowerContent = textContent.text.toLowerCase();
+		// Case-insensitive substring checks: the fixture shows a red circle
+		expect(lowerContent).toContain("red");
+		expect(lowerContent).toContain("circle");
+	}
+}
+
+/**
+ * Two-turn check that a tool result mixing a text block and an image block reaches the LLM.
+ * Turn 1: the model must stop with "toolUse" and call `get_circle_with_description`.
+ * Turn 2: after the text+image tool result is appended, the model must stop with "stop",
+ * report no errorMessage, and its first text block must match /diameter|100|pixel/ and
+ * contain "red" and "circle". Logs and returns early when `model.input` lacks "image".
+ */
+async function handleToolWithTextAndImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
+	// Text-only models cannot exercise this path: log and return without asserting
+	if (!model.input.includes("image")) {
+		console.log(`Skipping tool text+image result test - model ${model.id} doesn't support images`);
+		return;
+	}
+
+	// Load the fixture data/red-circle.png (resolved relative to this file) as base64
+	const imagePath = join(__dirname, "data", "red-circle.png");
+	const imageBuffer = readFileSync(imagePath);
+	const base64Image = imageBuffer.toString("base64");
+
+	// Declare a tool with an empty parameter object; its result below holds text plus an image
+	const getImageSchema = Type.Object({});
+	const getImageTool: Tool<typeof getImageSchema> = {
+		name: "get_circle_with_description",
+		description: "Returns a circle image with a text description",
+		parameters: getImageSchema,
+	};
+
+	const context: Context = {
+		systemPrompt: "You are a helpful assistant that uses tools when asked.",
+		messages: [
+			{
+				role: "user",
+				content: "Use the get_circle_with_description tool and tell me what you learned.",
+				timestamp: Date.now(),
+			},
+		],
+		tools: [getImageTool],
+	};
+
+	// First request - the LLM is expected to stop with a tool call
+	const firstResponse = await complete(model, context, options);
+	expect(firstResponse.stopReason).toBe("toolUse");
+
+	// Find the first toolCall block; the explicit throw also narrows the type for the lines below
+	const toolCall = firstResponse.content.find((b) => b.type === "toolCall");
+	expect(toolCall).toBeTruthy();
+	if (!toolCall || toolCall.type !== "toolCall") {
+		throw new Error("Expected tool call");
+	}
+	expect(toolCall.name).toBe("get_circle_with_description");
+
+	// Append the assistant message (which contains the tool call) to the conversation
+	context.messages.push(firstResponse);
+
+	// Build the tool result with BOTH a text block and an image block, in that order
+	const toolResult: ToolResultMessage = {
+		role: "toolResult",
+		toolCallId: toolCall.id,
+		toolName: toolCall.name,
+		content: [
+			{
+				type: "text",
+				text: "This is a geometric shape with specific properties: it has a diameter of 100 pixels.",
+			},
+			{
+				type: "image",
+				data: base64Image,
+				mimeType: "image/png",
+			},
+		],
+		isError: false,
+		timestamp: Date.now(),
+	};
+
+	context.messages.push(toolResult);
+
+	// Second request - the LLM should finish normally and report on both the text and the image
+	const secondResponse = await complete(model, context, options);
+	expect(secondResponse.stopReason).toBe("stop");
+	expect(secondResponse.errorMessage).toBeFalsy();
+
+	// Only the FIRST text block of the reply is inspected
+	const textContent = secondResponse.content.find((b) => b.type === "text");
+	expect(textContent).toBeTruthy();
+	if (textContent && textContent.type === "text") {
+		const lowerContent = textContent.text.toLowerCase();
+		// Evidence the text block was seen: any one of "diameter", "100" or "pixel" suffices
+		expect(lowerContent.match(/diameter|100|pixel/)).toBeTruthy();
+		// Evidence the image block was seen: both "red" and "circle" must appear
+		expect(lowerContent).toContain("red");
+		expect(lowerContent).toContain("circle");
+	}
+}
+
+describe("Tool Results with Images", () => { // Each provider suite below is skipped when its credential env var is unset.
+	describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider (gemini-2.5-flash)", () => {
+		const llm = getModel("google", "gemini-2.5-flash");
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(llm);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(llm);
+		});
+	});
+
+	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider (gpt-4o-mini)", () => {
+		const llm: Model<"openai-completions"> = { ...getModel("openai", "gpt-4o-mini"), api: "openai-completions" }; // copy the model, overriding api to "openai-completions"
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(llm);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(llm);
+		});
+	});
+
+	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider (gpt-5-mini)", () => {
+		const llm = getModel("openai", "gpt-5-mini");
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(llm);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(llm);
+		});
+	});
+
+	describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider (claude-haiku-4-5)", () => {
+		const model = getModel("anthropic", "claude-haiku-4-5");
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(model);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(model);
+		});
+	});
+
+	describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider (claude-sonnet-4-5)", () => { // NOTE: gated on ANTHROPIC_OAUTH_TOKEN, not ANTHROPIC_API_KEY
+		const model = getModel("anthropic", "claude-sonnet-4-5");
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(model);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(model);
+		});
+	});
+
+	describe.skipIf(!process.env.OPENROUTER_API_KEY)("OpenRouter Provider (glm-4.5v)", () => {
+		const llm = getModel("openrouter", "z-ai/glm-4.5v");
+
+		it("should handle tool result with only image", async () => {
+			await handleToolWithImageResult(llm);
+		});
+
+		it("should handle tool result with text and image", async () => {
+			await handleToolWithTextAndImageResult(llm);
+		});
+	});
+});
--- a/packages/ai/test/stream.test.ts
+++ b/packages/ai/test/stream.test.ts
@ -305,7 +305,7 @@ async function multiTurn<TApi extends Api>(model: Model<TApi>, options?: Options
 					role: "toolResult",
 					toolCallId: block.id,
 					toolName: block.name,
-					output: `${result}`,
+					content: [{ type: "text", text: `${result}` }],
 					isError: false,
 					timestamp: Date.now(),
 				});
--- a/packages/ai/test/tool-validation.test.ts
+++ b/packages/ai/test/tool-validation.test.ts
@ -27,7 +27,7 @@ describe("Tool Validation with TypeBox and AJV", () => {
 		parameters: testSchema,
 		execute: async (_toolCallId, args) => {
 			return {
-				output: `Processed: ${args.name}, ${args.age}, ${args.email}`,
+				content: [{ type: "text", text: `Processed: ${args.name}, ${args.age}, ${args.email}` }],
 				details: undefined,
 			};
 		},
@ -130,7 +130,11 @@ describe("Tool Validation with TypeBox and AJV", () => {

 		const result = await testTool.execute("test-id", validInput as TestParams);

-		expect(result.output).toBe("Processed: John Doe, 30, john@example.com");
+		const textOutput = result.content
+			.filter((c: any) => c.type === "text")
+			.map((c: any) => c.text)
+			.join("\n");
+		expect(textOutput).toBe("Processed: John Doe, 30, john@example.com");
 		expect(result.details).toBeUndefined();
 	});
 });
--- a/packages/ai/test/unicode-surrogate.test.ts
+++ b/packages/ai/test/unicode-surrogate.test.ts
@ -62,7 +62,10 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
 		role: "toolResult",
 		toolCallId: "test_1",
 		toolName: "test_tool",
-		output: `Test with emoji 🙈 and other characters:
+		content: [
+			{
+				type: "text",
+				text: `Test with emoji 🙈 and other characters:
 - Monkey emoji: 🙈
 - Thumbs up: 👍
 - Heart: ❤️
@ -73,6 +76,8 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
 - Chinese: 你好
 - Mathematical symbols: ∑∫∂√
 - Special quotes: "curly" 'quotes'`,
+			},
+		],
 		isError: false,
 		timestamp: Date.now(),
 	};
@ -141,7 +146,10 @@ async function testRealWorldLinkedInData<TApi extends Api>(llm: Model<TApi>, opt
 		role: "toolResult",
 		toolCallId: "linkedin_1",
 		toolName: "linkedin_skill",
-		output: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
+		content: [
+			{
+				type: "text",
+				text: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
 Unanswered Comments: 2

 => {
@ -156,6 +164,8 @@ Unanswered Comments: 2
    }
  ]
 }`,
+			},
+		],
 		isError: false,
 		timestamp: Date.now(),
 	};
@ -226,7 +236,7 @@ async function testUnpairedHighSurrogate<TApi extends Api>(llm: Model<TApi>, opt
 		role: "toolResult",
 		toolCallId: "test_2",
 		toolName: "test_tool",
-		output: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized`,
+		content: [{ type: "text", text: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized` }],
 		isError: false,
 		timestamp: Date.now(),
 	};