mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-16 22:03:45 +00:00
Add image support in tool results across all providers
Tool results now use content blocks and can include both text and images. All providers (Anthropic, Google, OpenAI Completions, OpenAI Responses) correctly pass images from tool results to LLMs.

- Update ToolResultMessage type to use content blocks
- Add placeholder text for image-only tool results in Google/Anthropic
- OpenAI providers send tool result + follow-up user message with images
- Fix Anthropic JSON parsing for empty tool arguments
- Add comprehensive tests for image-only and text+image tool results
- Update README with tool result content blocks API
This commit is contained in:
parent
9dac37d836
commit
84dcab219b
37 changed files with 720 additions and 544 deletions
|
|
@ -216,11 +216,15 @@ async function executeToolCalls<T>(
|
|||
isError,
|
||||
});
|
||||
|
||||
// Convert result to content blocks
|
||||
const content: ToolResultMessage<T>["content"] =
|
||||
typeof resultOrError === "string" ? [{ type: "text", text: resultOrError }] : resultOrError.content;
|
||||
|
||||
const toolResultMessage: ToolResultMessage<T> = {
|
||||
role: "toolResult",
|
||||
toolCallId: toolCall.id,
|
||||
toolName: toolCall.name,
|
||||
output: typeof resultOrError === "string" ? resultOrError : resultOrError.output,
|
||||
content,
|
||||
details: typeof resultOrError === "string" ? ({} as T) : resultOrError.details,
|
||||
isError,
|
||||
timestamp: Date.now(),
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
import { type Static, Type } from "@sinclair/typebox";
|
||||
import type { AgentTool } from "../../agent/types.js";
|
||||
import type { AgentTool, AgentToolResult } from "../../agent/types.js";
|
||||
|
||||
export interface CalculateResult {
|
||||
output: string;
|
||||
export interface CalculateResult extends AgentToolResult<undefined> {
|
||||
content: Array<{ type: "text"; text: string }>;
|
||||
details: undefined;
|
||||
}
|
||||
|
||||
export function calculate(expression: string): CalculateResult {
|
||||
try {
|
||||
const result = new Function("return " + expression)();
|
||||
return { output: `${expression} = ${result}`, details: undefined };
|
||||
return { content: [{ type: "text", text: `${expression} = ${result}` }], details: undefined };
|
||||
} catch (e: any) {
|
||||
throw new Error(e.message || String(e));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,20 +8,22 @@ export async function getCurrentTime(timezone?: string): Promise<GetCurrentTimeR
|
|||
const date = new Date();
|
||||
if (timezone) {
|
||||
try {
|
||||
const timeStr = date.toLocaleString("en-US", {
|
||||
timeZone: timezone,
|
||||
dateStyle: "full",
|
||||
timeStyle: "long",
|
||||
});
|
||||
return {
|
||||
output: date.toLocaleString("en-US", {
|
||||
timeZone: timezone,
|
||||
dateStyle: "full",
|
||||
timeStyle: "long",
|
||||
}),
|
||||
content: [{ type: "text", text: timeStr }],
|
||||
details: { utcTimestamp: date.getTime() },
|
||||
};
|
||||
} catch (e) {
|
||||
throw new Error(`Invalid timezone: ${timezone}. Current UTC time: ${date.toISOString()}`);
|
||||
}
|
||||
}
|
||||
const timeStr = date.toLocaleString("en-US", { dateStyle: "full", timeStyle: "long" });
|
||||
return {
|
||||
output: date.toLocaleString("en-US", { dateStyle: "full", timeStyle: "long" }),
|
||||
content: [{ type: "text", text: timeStr }],
|
||||
details: { utcTimestamp: date.getTime() },
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,17 +2,19 @@ import type { Static, TSchema } from "@sinclair/typebox";
|
|||
import type {
|
||||
AssistantMessage,
|
||||
AssistantMessageEvent,
|
||||
ImageContent,
|
||||
Message,
|
||||
Model,
|
||||
SimpleStreamOptions,
|
||||
TextContent,
|
||||
Tool,
|
||||
ToolResultMessage,
|
||||
} from "../types.js";
|
||||
|
||||
export interface AgentToolResult<T> {
|
||||
// Output of the tool to be given to the LLM in ToolResultMessage.content
|
||||
output: string;
|
||||
// Details to be displayed in a UI or loggedty
|
||||
// Content blocks supporting text and images
|
||||
content: (TextContent | ImageContent)[];
|
||||
// Details to be displayed in a UI or logged
|
||||
details: T;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import type {
|
|||
Api,
|
||||
AssistantMessage,
|
||||
Context,
|
||||
ImageContent,
|
||||
Message,
|
||||
Model,
|
||||
StopReason,
|
||||
|
|
@ -26,6 +27,58 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
|
|||
import { validateToolArguments } from "../utils/validation.js";
|
||||
import { transformMessages } from "./transorm-messages.js";
|
||||
|
||||
/**
 * Translate tool-result content blocks into the value used for the `content`
 * field of an Anthropic `tool_result`.
 *
 * - When no block is an image, the result is a single string: the `text` of
 *   every block joined with "\n" and passed through `sanitizeSurrogates`.
 * - When at least one block is an image, the result is an array that keeps the
 *   input order: text blocks carry sanitized text, image blocks become base64
 *   `image` sources. If that array contains no text block at all, a
 *   "(see attached image)" text block is inserted at the front.
 *
 * @param content Text and/or image blocks produced by a tool.
 * @returns A plain string (text only) or an array of text/image blocks.
 */
function convertContentBlocks(content: (TextContent | ImageContent)[]):
	| string
	| Array<
			| { type: "text"; text: string }
			| {
					type: "image";
					source: {
						type: "base64";
						media_type: "image/jpeg" | "image/png" | "image/gif" | "image/webp";
						data: string;
					};
			  }
	  > {
	type MediaType = "image/jpeg" | "image/png" | "image/gif" | "image/webp";
	type OutputBlock =
		| { type: "text"; text: string }
		| { type: "image"; source: { type: "base64"; media_type: MediaType; data: string } };

	// Text-only results collapse to one string for simplicity.
	const containsImage = content.some((item) => item.type === "image");
	if (!containsImage) {
		const texts = content.map((item) => (item as TextContent).text);
		return sanitizeSurrogates(texts.join("\n"));
	}

	// Mixed or image-only results: build the block array in input order,
	// remembering whether any text block was seen along the way.
	const converted: OutputBlock[] = [];
	let sawText = false;
	for (const item of content) {
		if (item.type === "text") {
			sawText = true;
			converted.push({ type: "text", text: sanitizeSurrogates(item.text) });
		} else {
			converted.push({
				type: "image",
				source: {
					type: "base64",
					media_type: item.mimeType as MediaType,
					data: item.data,
				},
			});
		}
	}

	// Image-only results get a placeholder text block in first position.
	if (!sawText) {
		converted.unshift({ type: "text", text: "(see attached image)" });
	}

	return converted;
}
|
||||
|
||||
export interface AnthropicOptions extends StreamOptions {
|
||||
thinkingEnabled?: boolean;
|
||||
thinkingBudgetTokens?: number;
|
||||
|
|
@ -171,7 +224,7 @@ export const streamAnthropic: StreamFunction<"anthropic-messages"> = (
|
|||
partial: output,
|
||||
});
|
||||
} else if (block.type === "toolCall") {
|
||||
block.arguments = JSON.parse(block.partialJson);
|
||||
block.arguments = parseStreamingJson(block.partialJson);
|
||||
|
||||
// Validate tool arguments if tool definition is available
|
||||
if (context.tools) {
|
||||
|
|
@ -432,7 +485,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
|
|||
toolResults.push({
|
||||
type: "tool_result",
|
||||
tool_use_id: sanitizeToolCallId(msg.toolCallId),
|
||||
content: sanitizeSurrogates(msg.output),
|
||||
content: convertContentBlocks(msg.content),
|
||||
is_error: msg.isError,
|
||||
});
|
||||
|
||||
|
|
@ -443,7 +496,7 @@ function convertMessages(messages: Message[], model: Model<"anthropic-messages">
|
|||
toolResults.push({
|
||||
type: "tool_result",
|
||||
tool_use_id: sanitizeToolCallId(nextMsg.toolCallId),
|
||||
content: sanitizeSurrogates(nextMsg.output),
|
||||
content: convertContentBlocks(nextMsg.content),
|
||||
is_error: nextMsg.isError,
|
||||
});
|
||||
j++;
|
||||
|
|
|
|||
|
|
@ -377,20 +377,44 @@ function convertMessages(model: Model<"google-generative-ai">, context: Context)
|
|||
parts,
|
||||
});
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Build parts array with functionResponse and/or images
|
||||
const parts: Part[] = [];
|
||||
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const imageBlocks = model.input.includes("image") ? msg.content.filter((c) => c.type === "image") : [];
|
||||
|
||||
// Always add functionResponse with text result (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
const hasImages = imageBlocks.length > 0;
|
||||
|
||||
parts.push({
|
||||
functionResponse: {
|
||||
id: msg.toolCallId,
|
||||
name: msg.toolName,
|
||||
response: {
|
||||
result: hasText ? sanitizeSurrogates(textResult) : hasImages ? "(see attached image)" : "",
|
||||
isError: msg.isError,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Add any images as inlineData parts
|
||||
for (const imageBlock of imageBlocks) {
|
||||
parts.push({
|
||||
inlineData: {
|
||||
mimeType: (imageBlock as any).mimeType,
|
||||
data: (imageBlock as any).data,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
contents.push({
|
||||
role: "user",
|
||||
parts: [
|
||||
{
|
||||
functionResponse: {
|
||||
id: msg.toolCallId,
|
||||
name: msg.toolName,
|
||||
response: {
|
||||
result: sanitizeSurrogates(msg.output),
|
||||
isError: msg.isError,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
parts,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -386,11 +386,50 @@ function convertMessages(model: Model<"openai-completions">, context: Context):
|
|||
}
|
||||
params.push(assistantMsg);
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const hasImages = msg.content.some((c) => c.type === "image");
|
||||
|
||||
// Always send tool result with text (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
params.push({
|
||||
role: "tool",
|
||||
content: sanitizeSurrogates(msg.output),
|
||||
content: sanitizeSurrogates(hasText ? textResult : "(see attached image)"),
|
||||
tool_call_id: msg.toolCallId,
|
||||
});
|
||||
|
||||
// If there are images and model supports them, send a follow-up user message with images
|
||||
if (hasImages && model.input.includes("image")) {
|
||||
const contentBlocks: Array<
|
||||
{ type: "text"; text: string } | { type: "image_url"; image_url: { url: string } }
|
||||
> = [];
|
||||
|
||||
// Add text prefix
|
||||
contentBlocks.push({
|
||||
type: "text",
|
||||
text: "Attached image(s) from tool result:",
|
||||
});
|
||||
|
||||
// Add images
|
||||
for (const block of msg.content) {
|
||||
if (block.type === "image") {
|
||||
contentBlocks.push({
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:${(block as any).mimeType};base64,${(block as any).data}`,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
params.push({
|
||||
role: "user",
|
||||
content: contentBlocks,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -435,11 +435,47 @@ function convertMessages(model: Model<"openai-responses">, context: Context): Re
|
|||
if (output.length === 0) continue;
|
||||
messages.push(...output);
|
||||
} else if (msg.role === "toolResult") {
|
||||
// Extract text and image content
|
||||
const textResult = msg.content
|
||||
.filter((c) => c.type === "text")
|
||||
.map((c) => (c as any).text)
|
||||
.join("\n");
|
||||
const hasImages = msg.content.some((c) => c.type === "image");
|
||||
|
||||
// Always send function_call_output with text (or placeholder if only images)
|
||||
const hasText = textResult.length > 0;
|
||||
messages.push({
|
||||
type: "function_call_output",
|
||||
call_id: msg.toolCallId.split("|")[0],
|
||||
output: sanitizeSurrogates(msg.output),
|
||||
output: sanitizeSurrogates(hasText ? textResult : "(see attached image)"),
|
||||
});
|
||||
|
||||
// If there are images and model supports them, send a follow-up user message with images
|
||||
if (hasImages && model.input.includes("image")) {
|
||||
const contentParts: ResponseInputContent[] = [];
|
||||
|
||||
// Add text prefix
|
||||
contentParts.push({
|
||||
type: "input_text",
|
||||
text: "Attached image(s) from tool result:",
|
||||
} satisfies ResponseInputText);
|
||||
|
||||
// Add images
|
||||
for (const block of msg.content) {
|
||||
if (block.type === "image") {
|
||||
contentParts.push({
|
||||
type: "input_image",
|
||||
detail: "auto",
|
||||
image_url: `data:${(block as any).mimeType};base64,${(block as any).data}`,
|
||||
} satisfies ResponseInputImage);
|
||||
}
|
||||
}
|
||||
|
||||
messages.push({
|
||||
role: "user",
|
||||
content: contentParts,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ export interface ToolResultMessage<TDetails = any> {
|
|||
role: "toolResult";
|
||||
toolCallId: string;
|
||||
toolName: string;
|
||||
output: string;
|
||||
content: (TextContent | ImageContent)[]; // Supports text and images
|
||||
details?: TDetails;
|
||||
isError: boolean;
|
||||
timestamp: number; // Unix timestamp in milliseconds
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue