mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-17 03:03:44 +00:00
Add image support in tool results across all providers
Tool results now use content blocks and can include both text and images. All providers (Anthropic, Google, OpenAI Completions, OpenAI Responses) correctly pass images from tool results to LLMs. - Update ToolResultMessage type to use content blocks - Add placeholder text for image-only tool results in Google/Anthropic - OpenAI providers send tool result + follow-up user message with images - Fix Anthropic JSON parsing for empty tool arguments - Add comprehensive tests for image-only and text+image tool results - Update README with tool result content blocks API
This commit is contained in:
parent
9dac37d836
commit
84dcab219b
37 changed files with 720 additions and 544 deletions
|
|
@ -98,7 +98,6 @@ for await (const event of s) {
|
|||
const finalMessage = await s.result();
|
||||
context.messages.push(finalMessage);
|
||||
|
||||
// Handle tool calls if any
|
||||
// Handle tool calls if any
|
||||
const toolCalls = finalMessage.content.filter(b => b.type === 'toolCall');
|
||||
for (const call of toolCalls) {
|
||||
|
|
@ -111,13 +110,14 @@ for (const call of toolCalls) {
|
|||
})
|
||||
: 'Unknown tool';
|
||||
|
||||
// Add tool result to context
|
||||
// Add tool result to context (supports text and images)
|
||||
context.messages.push({
|
||||
role: 'toolResult',
|
||||
toolCallId: call.id,
|
||||
toolName: call.name,
|
||||
output: result,
|
||||
isError: false
|
||||
content: [{ type: 'text', text: result }],
|
||||
isError: false,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -179,7 +179,11 @@ const bookMeetingTool: Tool = {
|
|||
|
||||
### Handling Tool Calls
|
||||
|
||||
Tool results use content blocks and can include both text and images:
|
||||
|
||||
```typescript
|
||||
import { readFileSync } from 'fs';
|
||||
|
||||
const context: Context = {
|
||||
messages: [{ role: 'user', content: 'What is the weather in London?' }],
|
||||
tools: [weatherTool]
|
||||
|
|
@ -194,16 +198,31 @@ for (const block of response.content) {
|
|||
// If validation fails, an error event is emitted
|
||||
const result = await executeWeatherApi(block.arguments);
|
||||
|
||||
// Add tool result to continue the conversation
|
||||
// Add tool result with text content
|
||||
context.messages.push({
|
||||
role: 'toolResult',
|
||||
toolCallId: block.id,
|
||||
toolName: block.name,
|
||||
output: JSON.stringify(result),
|
||||
isError: false
|
||||
content: [{ type: 'text', text: JSON.stringify(result) }],
|
||||
isError: false,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Tool results can also include images (for vision-capable models)
|
||||
const imageBuffer = readFileSync('chart.png');
|
||||
context.messages.push({
|
||||
role: 'toolResult',
|
||||
toolCallId: 'tool_xyz',
|
||||
toolName: 'generate_chart',
|
||||
content: [
|
||||
{ type: 'text', text: 'Generated chart showing temperature trends' },
|
||||
{ type: 'image', data: imageBuffer.toString('base64'), mimeType: 'image/png' }
|
||||
],
|
||||
isError: false,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
```
|
||||
|
||||
### Streaming Tool Calls with Partial JSON
|
||||
|
|
@ -625,7 +644,7 @@ const geminiResponse = await complete(gemini, context);
|
|||
|
||||
All providers can handle messages from other providers, including:
|
||||
- Text content
|
||||
- Tool calls and tool results
|
||||
- Tool calls and tool results (including images in tool results)
|
||||
- Thinking/reasoning blocks (transformed to tagged text for cross-provider compatibility)
|
||||
- Aborted messages with partial content
|
||||
|
||||
|
|
@ -818,6 +837,23 @@ const weatherTool: AgentTool<typeof weatherSchema, { temp: number }> = {
|
|||
};
|
||||
}
|
||||
};
|
||||
|
||||
// Tools can also return images alongside text.
// The schema is declared as a value first: `typeof` in a type position only
// accepts an identifier (or property path), never a call expression, so
// `AgentTool<typeof Type.Object(...)>` would not compile.
const chartSchema = Type.Object({ data: Type.Array(Type.Number()) });

const chartTool: AgentTool<typeof chartSchema> = {
  label: 'Generate Chart',
  name: 'generate_chart',
  description: 'Generate a chart from data',
  parameters: chartSchema,
  execute: async (toolCallId, args) => {
    // generateChartImage is assumed to resolve to a Buffer holding PNG bytes
    const chartImage = await generateChartImage(args.data);
    return {
      // Content blocks handed to the LLM: a text summary plus the rendered image
      content: [
        { type: 'text', text: `Generated chart with ${args.data.length} data points` },
        { type: 'image', data: chartImage.toString('base64'), mimeType: 'image/png' }
      ],
      // `details` is a required field of the tool result; this tool has none to report
      details: undefined
    };
  }
};
|
||||
```
|
||||
|
||||
### Validation and Error Handling
|
||||
|
|
|
|||
|
|
@ -216,11 +216,15 @@ async function executeToolCalls<T>(
|
|||
isError,
|
||||
});
|
||||
|
||||
// Convert result to content blocks
|
||||
const content: ToolResultMessage<T>["content"] =
|
||||
typeof resultOrError === "string" ? [{ type: "text", text: resultOrError }] : resultOrError.content;
|
||||
|
||||
const toolResultMessage: ToolResultMessage<T> = {
|
||||
role: "toolResult",
|
||||
toolCallId: toolCall.id,
|
||||
toolName: toolCall.name,
|
||||
output: typeof resultOrError === "string" ? resultOrError : resultOrError.output,
|
||||
content,
|
||||
details: typeof resultOrError === "string" ? ({} as T) : resultOrError.details,
|
||||
isError,
|
||||
timestamp: Date.now(),
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
import { type Static, Type } from "@sinclair/typebox";
|
||||
import type { AgentTool } from "../../agent/types.js";
|
||||
import type { AgentTool, AgentToolResult } from "../../agent/types.js";
|
||||
|
||||
export interface CalculateResult {
|
||||
output: string;
|
||||
export interface CalculateResult extends AgentToolResult<undefined> {
|
||||
content: Array<{ type: "text"; text: string }>;
|
||||
details: undefined;
|
||||
}
|
||||
|
||||
export function calculate(expression: string): CalculateResult {
|
||||
try {
|
||||
const result = new Function("return " + expression)();
|
||||
return { output: `${expression} = ${result}`, details: undefined };
|
||||
return { content: [{ type: "text", text: `${expression} = ${result}` }], details: undefined };
|
||||
} catch (e: any) {
|
||||
throw new Error(e.message || String(e));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,20 +8,22 @@ export async function getCurrentTime(timezone?: string): Promise<GetCurrentTimeR
|
|||
const date = new Date();
|
||||
if (timezone) {
|
||||
try {
|
||||
const timeStr = date.toLocaleString("en-US", {
|
||||
timeZone: timezone,
|
||||
dateStyle: "full",
|
||||
timeStyle: "long",
|
||||
});
|
||||
return {
|
||||
output: date.toLocaleString("en-US", {
|
||||
timeZone: timezone,
|
||||
dateStyle: "full",
|
||||
timeStyle: "long",
|
||||
}),
|
||||
content: [{ type: "text", text: timeStr }],
|
||||
details: { utcTimestamp: date.getTime() },
|
||||
};
|
||||
} catch (e) {
|
||||
throw new Error(`Invalid timezone: ${timezone}. Current UTC time: ${date.toISOString()}`);
|
||||
}
|
||||
}
|
||||
const timeStr = date.toLocaleString("en-US", { dateStyle: "full", timeStyle: "long" });
|
||||
return {
|
||||
output: date.toLocaleString("en-US", { dateStyle: "full", timeStyle: "long" }),
|
||||
content: [{ type: "text", text: timeStr }],
|
||||
details: { utcTimestamp: date.getTime() },
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,17 +2,19 @@ import type { Static, TSchema } from "@sinclair/typebox";
|
|||
import type {
|
||||
AssistantMessage,
|
||||
AssistantMessageEvent,
|
||||
ImageContent,
|
||||
Message,
|
||||
Model,
|
||||
SimpleStreamOptions,
|
||||
TextContent,
|
||||
Tool,
|
||||
ToolResultMessage,
|
||||
} from "../types.js";
|
||||
|
||||
export interface AgentToolResult<T> {
|
||||
// Output of the tool to be given to the LLM in ToolResultMessage.content
|
||||
output: string;
|
||||
// Details to be displayed in a UI or loggedty
|
||||
// Content blocks supporting text and images
|
||||
content: (TextContent | ImageContent)[];
|
||||
// Details to be displayed in a UI or logged
|
||||
details: T;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import type {
|
|||
Api,
|
||||
AssistantMessage,
|
||||
Context,
|
||||
ImageContent,
|
||||
Message,
|
||||
Model,
|
||||
StopReason,
|
||||
|
|
@ -26,6 +27,58 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
|
|||
import { validateToolArguments } from "../utils/validation.js";
|
||||
import { transformMessages } from "./transorm-messages.js";
|
||||
|
||||
/**
 * Map tool-result content blocks onto the value used for an Anthropic
 * `tool_result.content` field.
 *
 * - No image blocks: every block's text is joined with newlines, sanitized,
 *   and returned as a single string.
 * - At least one image block: an array is returned in which text blocks keep
 *   their (sanitized) text and image blocks become base64 `image` sources.
 *   When that array would contain no text block at all, a placeholder text
 *   block is placed in front of the images.
 */
function convertContentBlocks(content: (TextContent | ImageContent)[]):
	| string
	| Array<
			| { type: "text"; text: string }
			| {
					type: "image";
					source: {
						type: "base64";
						media_type: "image/jpeg" | "image/png" | "image/gif" | "image/webp";
						data: string;
					};
			  }
	  > {
	// Text-only results collapse into one plain string
	if (content.every((c) => c.type !== "image")) {
		const joined = content.map((c) => (c as TextContent).text).join("\n");
		return sanitizeSurrogates(joined);
	}

	// Mixed or image-only results are converted block by block
	const converted = content.map((item) =>
		item.type === "text"
			? {
					type: "text" as const,
					text: sanitizeSurrogates(item.text),
				}
			: {
					type: "image" as const,
					source: {
						type: "base64" as const,
						media_type: item.mimeType as "image/jpeg" | "image/png" | "image/gif" | "image/webp",
						data: item.data,
					},
				},
	);

	// Image-only results get a leading placeholder text block
	const containsText = converted.some((entry) => entry.type === "text");
	if (containsText) {
		return converted;
	}
	return [
		{
			type: "text" as const,
			text: "(see attached image)",
		},
		...converted,
	];
}
|
||||
|
||||
export interface AnthropicOptions extends StreamOptions {
|
||||
thinkingEnabled?: boolean;
|
||||
thinkingBudgetTokens?: number;
|
||||
|
|
@ -171,7 +224,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
|
|||
partial: output,
|
||||
});
|
||||
} else if (block.type === "toolCall") {
|
||||
block.arguments = JSON.parse(block.partialJson);
|
||||
block.arguments = parseStreamingJson(block.partialJson);
|
||||
|
||||
// Validate tool arguments if tool definition is available
|
||||
if (context.tools) {
|
||||
|
|
@ -432,7 +485,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
|
|||
toolResults.push({
|
||||
type: "tool_result",
|
||||
tool_use_id: sanitizeToolCallId(msg.toolCallId),
|
||||
content: sanitizeSurrogates(msg.output),
|
||||
content: convertContentBlocks(msg.content),
|
||||
is_error: msg.isError,
|
||||
});
|
||||
|
||||
|
|
@ -443,7 +496,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
|
|||
toolResults.push({
|
||||
type: "tool_result",
|
||||
tool_use_id: sanitizeToolCallId(nextMsg.toolCallId),
|
||||
content: sanitizeSurrogates(nextMsg.output),
|
||||
content: convertContentBlocks(nextMsg.content),
|
||||
is_error: nextMsg.isError,
|
||||
});
|
||||
j++;
|
||||
|
|
|
|||
|
|
@ -377,20 +377,44 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
|
|||
parts,
|
||||
});
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Build parts array with functionResponse and/or images
|
||||
const parts: Part[] = [];
|
||||
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const imageBlocks = model.input.includes("image") ? msg.content.filter((c) => c.type === "image") : [];
|
||||
|
||||
// Always add functionResponse with text result (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
const hasImages = imageBlocks.length > 0;
|
||||
|
||||
parts.push({
|
||||
functionResponse: {
|
||||
id: msg.toolCallId,
|
||||
name: msg.toolName,
|
||||
response: {
|
||||
result: hasText ? sanitizeSurrogates(textResult) : hasImages ? "(see attached image)" : "",
|
||||
isError: msg.isError,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Add any images as inlineData parts
|
||||
for (const imageBlock of imageBlocks) {
|
||||
parts.push({
|
||||
inlineData: {
|
||||
mimeType: (imageBlock as any).mimeType,
|
||||
data: (imageBlock as any).data,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
contents.push({
|
||||
role: "user",
|
||||
parts: [
|
||||
{
|
||||
functionResponse: {
|
||||
id: msg.toolCallId,
|
||||
name: msg.toolName,
|
||||
response: {
|
||||
result: sanitizeSurrogates(msg.output),
|
||||
isError: msg.isError,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
parts,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -386,11 +386,50 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
|
|||
}
|
||||
params.push(assistantMsg);
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const hasImages = msg.content.some((c) => c.type === "image");
|
||||
|
||||
// Always send tool result with text (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
params.push({
|
||||
role: "tool",
|
||||
content: sanitizeSurrogates(msg.output),
|
||||
content: sanitizeSurrogates(hasText ? textResult : "(see attached image)"),
|
||||
tool_call_id: msg.toolCallId,
|
||||
});
|
||||
|
||||
// If there are images and model supports them, send a follow-up user message with images
|
||||
if (hasImages && model.input.includes("image")) {
|
||||
const contentBlocks: Array<
|
||||
{ type: "text"; text: string } | { type: "image_url"; image_url: { url: string } }
|
||||
> = [];
|
||||
|
||||
// Add text prefix
|
||||
contentBlocks.push({
|
||||
type: "text",
|
||||
text: "Attached image(s) from tool result:",
|
||||
});
|
||||
|
||||
// Add images
|
||||
for (const block of msg.content) {
|
||||
if (block.type === "image") {
|
||||
contentBlocks.push({
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:${(block as any).mimeType};base64,${(block as any).data}`,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
params.push({
|
||||
role: "user",
|
||||
content: contentBlocks,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -435,11 +435,47 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
|
|||
if (output.length === 0) continue;
|
||||
messages.push(...output);
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const hasImages = msg.content.some((c) => c.type === "image");
|
||||
|
||||
// Always send function_call_output with text (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
messages.push({
|
||||
type: "function_call_output",
|
||||
call_id: msg.toolCallId.split("|")[0],
|
||||
output: sanitizeSurrogates(msg.output),
|
||||
output: sanitizeSurrogates(hasText ? textResult : "(see attached image)"),
|
||||
});
|
||||
|
||||
// If there are images and model supports them, send a follow-up user message with images
|
||||
if (hasImages && model.input.includes("image")) {
|
||||
const contentParts: ResponseInputContent[] = [];
|
||||
|
||||
// Add text prefix
|
||||
contentParts.push({
|
||||
type: "input_text",
|
||||
text: "Attached image(s) from tool result:",
|
||||
} satisfies ResponseInputText);
|
||||
|
||||
// Add images
|
||||
for (const block of msg.content) {
|
||||
if (block.type === "image") {
|
||||
contentParts.push({
|
||||
type: "input_image",
|
||||
detail: "auto",
|
||||
image_url: `data:${(block as any).mimeType};base64,${(block as any).data}`,
|
||||
} satisfies ResponseInputImage);
|
||||
}
|
||||
}
|
||||
|
||||
messages.push({
|
||||
role: "user",
|
||||
content: contentParts,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ export interface ToolResultMessage<TDetails = any> {
|
|||
role: "toolResult";
|
||||
toolCallId: string;
|
||||
toolName: string;
|
||||
output: string;
|
||||
content: (TextContent | ImageContent)[]; // Supports text and images
|
||||
details?: TDetails;
|
||||
isError: boolean;
|
||||
timestamp: number; // Unix timestamp in milliseconds
|
||||
|
|
|
|||
|
|
@ -60,14 +60,18 @@ async function calculateTest<TApi extends Api>(model: Model<TApi>, options: Opti
|
|||
break;
|
||||
|
||||
case "tool_execution_end":
|
||||
if (!event.isError && typeof event.result === "object" && event.result.output) {
|
||||
if (!event.isError && typeof event.result === "object" && event.result.content) {
|
||||
const textOutput = event.result.content
|
||||
.filter((c: any) => c.type === "text")
|
||||
.map((c: any) => c.text)
|
||||
.join("\n");
|
||||
toolCallCount++;
|
||||
// Extract number from output like "expression = result"
|
||||
const match = event.result.output.match(/=\s*([\d.]+)/);
|
||||
const match = textOutput.match(/=\s*([\d.]+)/);
|
||||
if (match) {
|
||||
const value = parseFloat(match[1]);
|
||||
toolResults.push(value);
|
||||
console.log(`Tool ${toolCallCount}: ${event.result.output}`);
|
||||
console.log(`Tool ${toolCallCount}: ${textOutput}`);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ const providerContexts = {
|
|||
role: "toolResult" as const,
|
||||
toolCallId: "toolu_01abc123",
|
||||
toolName: "get_weather",
|
||||
output: "Weather in Tokyo: 18°C, partly cloudy",
|
||||
content: [{ type: "text", text: "Weather in Tokyo: 18°C, partly cloudy" }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
} satisfies ToolResultMessage,
|
||||
|
|
@ -106,7 +106,7 @@ const providerContexts = {
|
|||
role: "toolResult" as const,
|
||||
toolCallId: "call_gemini_123",
|
||||
toolName: "get_weather",
|
||||
output: "Weather in Berlin: 22°C, sunny",
|
||||
content: [{ type: "text", text: "Weather in Berlin: 22°C, sunny" }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
} satisfies ToolResultMessage,
|
||||
|
|
@ -156,7 +156,7 @@ const providerContexts = {
|
|||
role: "toolResult" as const,
|
||||
toolCallId: "call_abc123",
|
||||
toolName: "get_weather",
|
||||
output: "Weather in London: 15°C, rainy",
|
||||
content: [{ type: "text", text: "Weather in London: 15°C, rainy" }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
} satisfies ToolResultMessage,
|
||||
|
|
@ -208,7 +208,7 @@ const providerContexts = {
|
|||
role: "toolResult" as const,
|
||||
toolCallId: "call_789_item_012", // Match the updated ID format
|
||||
toolName: "get_weather",
|
||||
output: "Weather in Sydney: 25°C, clear",
|
||||
content: [{ type: "text", text: "Weather in Sydney: 25°C, clear" }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
} satisfies ToolResultMessage,
|
||||
|
|
|
|||
263
packages/ai/test/image-tool-result.test.ts
Normal file
263
packages/ai/test/image-tool-result.test.ts
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
import { readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { Api, Context, Model, Tool, ToolResultMessage } from "../src/index.js";
|
||||
import { complete, getModel } from "../src/index.js";
|
||||
import type { OptionsForApi } from "../src/types.js";
|
||||
|
||||
/**
 * Round-trip check for a tool result that carries nothing but an image.
 *
 * Steps:
 * 1. Ask the model to call `get_circle` and expect a tool-use stop.
 * 2. Answer the call with a tool result holding a single PNG image block.
 * 3. Expect the follow-up reply to stop normally and to mention both "red"
 *    and "circle" in its first text block.
 *
 * Logs a message and returns without asserting anything when the model does
 * not list "image" among its inputs.
 */
async function handleToolWithImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
	const supportsImages = model.input.includes("image");
	if (!supportsImages) {
		console.log(`Skipping tool image result test - model ${model.id} doesn't support images`);
		return;
	}

	// Load the fixture image and base64-encode it for the image content block
	const fixturePath = join(__dirname, "data", "red-circle.png");
	const base64Image = readFileSync(fixturePath).toString("base64");

	// Argument-less tool; the result supplied below contains no text at all
	const emptySchema = Type.Object({});
	const circleTool: Tool<typeof emptySchema> = {
		name: "get_circle",
		description: "Returns a circle image for visualization",
		parameters: emptySchema,
	};

	const context: Context = {
		systemPrompt: "You are a helpful assistant that uses tools when asked.",
		messages: [
			{
				role: "user",
				content: "Use the get_circle tool to get an image, and describe what you see, shapes, colors, etc.",
				timestamp: Date.now(),
			},
		],
		tools: [circleTool],
	};

	// Turn 1: the model is expected to request the tool
	const toolTurn = await complete(model, context, options);
	expect(toolTurn.stopReason).toBe("toolUse");

	const requestedCall = toolTurn.content.find((part) => part.type === "toolCall");
	expect(requestedCall).toBeTruthy();
	if (!requestedCall || requestedCall.type !== "toolCall") {
		throw new Error("Expected tool call");
	}
	expect(requestedCall.name).toBe("get_circle");

	// Record the assistant turn, then reply with an image-only tool result
	context.messages.push(toolTurn);

	const imageOnlyResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: requestedCall.id,
		toolName: requestedCall.name,
		content: [
			{
				type: "image",
				data: base64Image,
				mimeType: "image/png",
			},
		],
		isError: false,
		timestamp: Date.now(),
	};
	context.messages.push(imageOnlyResult);

	// Turn 2: the model should describe what the tool returned
	const answerTurn = await complete(model, context, options);
	expect(answerTurn.stopReason).toBe("stop");
	expect(answerTurn.errorMessage).toBeFalsy();

	// The fixture shows a red circle, so the description should say so
	const firstText = answerTurn.content.find((part) => part.type === "text");
	expect(firstText).toBeTruthy();
	if (firstText && firstText.type === "text") {
		const lowered = firstText.text.toLowerCase();
		expect(lowered).toContain("red");
		expect(lowered).toContain("circle");
	}
}
|
||||
|
||||
/**
 * Round-trip check for a tool result that mixes a text block with an image.
 *
 * Steps:
 * 1. Ask the model to call `get_circle_with_description` and expect a
 *    tool-use stop.
 * 2. Answer the call with a tool result holding one text block followed by
 *    one PNG image block.
 * 3. Expect the follow-up reply to stop normally and its first text block to
 *    reflect both sources: a detail from the text (diameter / 100 / pixel)
 *    and the visual content ("red" and "circle").
 *
 * Logs a message and returns without asserting anything when the model does
 * not list "image" among its inputs.
 */
async function handleToolWithTextAndImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
	const supportsImages = model.input.includes("image");
	if (!supportsImages) {
		console.log(`Skipping tool text+image result test - model ${model.id} doesn't support images`);
		return;
	}

	// Load the fixture image and base64-encode it for the image content block
	const fixturePath = join(__dirname, "data", "red-circle.png");
	const base64Image = readFileSync(fixturePath).toString("base64");

	// Argument-less tool; the result supplied below combines text and an image
	const emptySchema = Type.Object({});
	const describedCircleTool: Tool<typeof emptySchema> = {
		name: "get_circle_with_description",
		description: "Returns a circle image with a text description",
		parameters: emptySchema,
	};

	const context: Context = {
		systemPrompt: "You are a helpful assistant that uses tools when asked.",
		messages: [
			{
				role: "user",
				content: "Use the get_circle_with_description tool and tell me what you learned.",
				timestamp: Date.now(),
			},
		],
		tools: [describedCircleTool],
	};

	// Turn 1: the model is expected to request the tool
	const toolTurn = await complete(model, context, options);
	expect(toolTurn.stopReason).toBe("toolUse");

	const requestedCall = toolTurn.content.find((part) => part.type === "toolCall");
	expect(requestedCall).toBeTruthy();
	if (!requestedCall || requestedCall.type !== "toolCall") {
		throw new Error("Expected tool call");
	}
	expect(requestedCall.name).toBe("get_circle_with_description");

	// Record the assistant turn, then reply with a text + image tool result
	context.messages.push(toolTurn);

	const mixedResult: ToolResultMessage = {
		role: "toolResult",
		toolCallId: requestedCall.id,
		toolName: requestedCall.name,
		content: [
			{
				type: "text",
				text: "This is a geometric shape with specific properties: it has a diameter of 100 pixels.",
			},
			{
				type: "image",
				data: base64Image,
				mimeType: "image/png",
			},
		],
		isError: false,
		timestamp: Date.now(),
	};
	context.messages.push(mixedResult);

	// Turn 2: the model should report on both the text and the image
	const answerTurn = await complete(model, context, options);
	expect(answerTurn.stopReason).toBe("stop");
	expect(answerTurn.errorMessage).toBeFalsy();

	const firstText = answerTurn.content.find((part) => part.type === "text");
	expect(firstText).toBeTruthy();
	if (firstText && firstText.type === "text") {
		const lowered = firstText.text.toLowerCase();
		// Detail that is only available from the text block
		expect(lowered.match(/diameter|100|pixel/)).toBeTruthy();
		// Details that are only available from the image
		expect(lowered).toContain("red");
		expect(lowered).toContain("circle");
	}
}
|
||||
|
||||
// Provider matrix for image-bearing tool results. Each suite is skipped when
// the environment variable named in its skipIf condition is not set.
describe("Tool Results with Images", () => {
	/** Registers the image-only and text+image cases for a single model. */
	const registerCases = <TApi extends Api>(target: Model<TApi>) => {
		it("should handle tool result with only image", async () => {
			await handleToolWithImageResult(target);
		});

		it("should handle tool result with text and image", async () => {
			await handleToolWithTextAndImageResult(target);
		});
	};

	describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider (gemini-2.5-flash)", () => {
		registerCases(getModel("google", "gemini-2.5-flash"));
	});

	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider (gpt-4o-mini)", () => {
		// Same model entry, with its api field overridden to "openai-completions"
		const completionsModel: Model<"openai-completions"> = {
			...getModel("openai", "gpt-4o-mini"),
			api: "openai-completions",
		};
		registerCases(completionsModel);
	});

	describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider (gpt-5-mini)", () => {
		registerCases(getModel("openai", "gpt-5-mini"));
	});

	describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider (claude-haiku-4-5)", () => {
		registerCases(getModel("anthropic", "claude-haiku-4-5"));
	});

	describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider (claude-sonnet-4-5)", () => {
		registerCases(getModel("anthropic", "claude-sonnet-4-5"));
	});

	describe.skipIf(!process.env.OPENROUTER_API_KEY)("OpenRouter Provider (glm-4.5v)", () => {
		registerCases(getModel("openrouter", "z-ai/glm-4.5v"));
	});
});
|
||||
|
|
@ -305,7 +305,7 @@ async function multiTurn<TApi extends Api>(model: Model<TApi>, options?: Options
|
|||
role: "toolResult",
|
||||
toolCallId: block.id,
|
||||
toolName: block.name,
|
||||
output: `${result}`,
|
||||
content: [{ type: "text", text: `${result}` }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ describe("Tool Validation with TypeBox and AJV", () => {
|
|||
parameters: testSchema,
|
||||
execute: async (_toolCallId, args) => {
|
||||
return {
|
||||
output: `Processed: ${args.name}, ${args.age}, ${args.email}`,
|
||||
content: [{ type: "text", text: `Processed: ${args.name}, ${args.age}, ${args.email}` }],
|
||||
details: undefined,
|
||||
};
|
||||
},
|
||||
|
|
@ -130,7 +130,11 @@ describe("Tool Validation with TypeBox and AJV", () => {
|
|||
|
||||
const result = await testTool.execute("test-id", validInput as TestParams);
|
||||
|
||||
expect(result.output).toBe("Processed: John Doe, 30, john@example.com");
|
||||
const textOutput = result.content
|
||||
.filter((c: any) => c.type === "text")
|
||||
.map((c: any) => c.text)
|
||||
.join("\n");
|
||||
expect(textOutput).toBe("Processed: John Doe, 30, john@example.com");
|
||||
expect(result.details).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -62,7 +62,10 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
|
|||
role: "toolResult",
|
||||
toolCallId: "test_1",
|
||||
toolName: "test_tool",
|
||||
output: `Test with emoji 🙈 and other characters:
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: `Test with emoji 🙈 and other characters:
|
||||
- Monkey emoji: 🙈
|
||||
- Thumbs up: 👍
|
||||
- Heart: ❤️
|
||||
|
|
@ -73,6 +76,8 @@ async function testEmojiInToolResults<TApi extends Api>(llm: Model<TApi>, option
|
|||
- Chinese: 你好
|
||||
- Mathematical symbols: ∑∫∂√
|
||||
- Special quotes: "curly" 'quotes'`,
|
||||
},
|
||||
],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
|
|
@ -141,7 +146,10 @@ async function testRealWorldLinkedInData<TApi extends Api>(llm: Model<TApi>, opt
|
|||
role: "toolResult",
|
||||
toolCallId: "linkedin_1",
|
||||
toolName: "linkedin_skill",
|
||||
output: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: `Post: Hab einen "Generative KI für Nicht-Techniker" Workshop gebaut.
|
||||
Unanswered Comments: 2
|
||||
|
||||
=> {
|
||||
|
|
@ -156,6 +164,8 @@ Unanswered Comments: 2
|
|||
}
|
||||
]
|
||||
}`,
|
||||
},
|
||||
],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
|
|
@ -226,7 +236,7 @@ async function testUnpairedHighSurrogate<TApi extends Api>(llm: Model<TApi>, opt
|
|||
role: "toolResult",
|
||||
toolCallId: "test_2",
|
||||
toolName: "test_tool",
|
||||
output: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized`,
|
||||
content: [{ type: "text", text: `Text with unpaired surrogate: ${unpairedSurrogate} <- should be sanitized` }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue