co-mono/packages/ai/test/image-tool-result.test.ts
Commit bb50738f7e by Mario Zechner, 2026-01-05 06:03:07 +01:00
fix(ai): append system prompt to codex bridge message instead of converting to input

Previously the system prompt was converted to an input message in convertMessages,
then stripped out by filterPiSystemPrompts. Now the system prompt is passed directly
to transformRequestBody and appended after CODEX_PI_BRIDGE in the bridge message.
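A rough sketch, for context, of the behavior described in the commit message. The names transformRequestBody, filterPiSystemPrompts, and CODEX_PI_BRIDGE come from the message above; buildBridgeText, its signature, and the placeholder declaration are hypothetical and for illustration only:

// CODEX_PI_BRIDGE is the bridge preamble referenced in the commit message; declared
// here as a placeholder so the sketch type-checks, not its real value or definition.
declare const CODEX_PI_BRIDGE: string;

// Hypothetical helper (not part of this repo): the system prompt is appended after
// the bridge preamble instead of being emitted as a separate input message that
// filterPiSystemPrompts would later strip out.
const buildBridgeText = (systemPrompt?: string): string =>
	systemPrompt ? `${CODEX_PI_BRIDGE}\n\n${systemPrompt}` : CODEX_PI_BRIDGE;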

import { readFileSync } from "node:fs";
import { join } from "node:path";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import type { Api, Context, Model, Tool, ToolResultMessage } from "../src/index.js";
import { complete, getModel } from "../src/index.js";
import type { OptionsForApi } from "../src/types.js";
import { resolveApiKey } from "./oauth.js";
// Resolve OAuth tokens at module level (async, runs before tests)
const oauthTokens = await Promise.all([
resolveApiKey("anthropic"),
resolveApiKey("github-copilot"),
resolveApiKey("google-gemini-cli"),
resolveApiKey("google-antigravity"),
resolveApiKey("openai-codex"),
]);
const [anthropicOAuthToken, githubCopilotToken, geminiCliToken, antigravityToken, openaiCodexToken] = oauthTokens;
/**
* Test that tool results containing only images work correctly across all providers.
* This verifies that:
* 1. Tool results can contain image content blocks
* 2. Providers correctly pass images from tool results to the LLM
* 3. The LLM can see and describe images returned by tools
*/
async function handleToolWithImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
// Check if the model supports images
if (!model.input.includes("image")) {
console.log(`Skipping tool image result test - model ${model.id} doesn't support images`);
return;
}
// Read the test image
const imagePath = join(__dirname, "data", "red-circle.png");
const imageBuffer = readFileSync(imagePath);
const base64Image = imageBuffer.toString("base64");
// Define a tool that returns only an image (no text)
const getImageSchema = Type.Object({});
const getImageTool: Tool<typeof getImageSchema> = {
name: "get_circle",
description: "Returns a circle image for visualization",
parameters: getImageSchema,
};
const context: Context = {
systemPrompt: "You are a helpful assistant that uses tools when asked.",
messages: [
{
role: "user",
content: "Call the get_circle tool to get an image, and describe what you see, shapes, colors, etc.",
timestamp: Date.now(),
},
],
tools: [getImageTool],
};
// First request - LLM should call the tool
const firstResponse = await complete(model, context, options);
expect(firstResponse.stopReason).toBe("toolUse");
// Find the tool call
const toolCall = firstResponse.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("Expected tool call");
}
expect(toolCall.name).toBe("get_circle");
// Add the tool call to context
context.messages.push(firstResponse);
// Create tool result with ONLY an image (no text)
const toolResult: ToolResultMessage = {
role: "toolResult",
toolCallId: toolCall.id,
toolName: toolCall.name,
content: [
{
type: "image",
data: base64Image,
mimeType: "image/png",
},
],
isError: false,
timestamp: Date.now(),
};
context.messages.push(toolResult);
// Second request - LLM should describe the image from the tool result
const secondResponse = await complete(model, context, options);
expect(secondResponse.stopReason).toBe("stop");
expect(secondResponse.errorMessage).toBeFalsy();
// Verify the LLM can see and describe the image
const textContent = secondResponse.content.find((b) => b.type === "text");
expect(textContent).toBeTruthy();
if (textContent && textContent.type === "text") {
const lowerContent = textContent.text.toLowerCase();
// Should mention red and circle since that's what the image shows
expect(lowerContent).toContain("red");
expect(lowerContent).toContain("circle");
}
}
/**
* Test that tool results containing both text and images work correctly across all providers.
* This verifies that:
* 1. Tool results can contain mixed content blocks (text + images)
* 2. Providers correctly pass both text and images from tool results to the LLM
* 3. The LLM can see both the text and images in tool results
*/
async function handleToolWithTextAndImageResult<TApi extends Api>(model: Model<TApi>, options?: OptionsForApi<TApi>) {
// Check if the model supports images
if (!model.input.includes("image")) {
console.log(`Skipping tool text+image result test - model ${model.id} doesn't support images`);
return;
}
// Read the test image
const imagePath = join(__dirname, "data", "red-circle.png");
const imageBuffer = readFileSync(imagePath);
const base64Image = imageBuffer.toString("base64");
// Define a tool that returns both text and an image
const getImageSchema = Type.Object({});
const getImageTool: Tool<typeof getImageSchema> = {
name: "get_circle_with_description",
description: "Returns a circle image with a text description",
parameters: getImageSchema,
};
const context: Context = {
systemPrompt: "You are a helpful assistant that uses tools when asked.",
messages: [
{
role: "user",
content:
"Use the get_circle_with_description tool and tell me what you learned. Also say what color the shape is.",
timestamp: Date.now(),
},
],
tools: [getImageTool],
};
// First request - LLM should call the tool
const firstResponse = await complete(model, context, options);
expect(firstResponse.stopReason).toBe("toolUse");
// Find the tool call
const toolCall = firstResponse.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("Expected tool call");
}
expect(toolCall.name).toBe("get_circle_with_description");
// Add the tool call to context
context.messages.push(firstResponse);
// Create tool result with BOTH text and image
const toolResult: ToolResultMessage = {
role: "toolResult",
toolCallId: toolCall.id,
toolName: toolCall.name,
content: [
{
type: "text",
text: "This is a geometric shape with specific properties: it has a diameter of 100 pixels.",
},
{
type: "image",
data: base64Image,
mimeType: "image/png",
},
],
isError: false,
timestamp: Date.now(),
};
context.messages.push(toolResult);
// Second request - LLM should describe both the text and image from the tool result
const secondResponse = await complete(model, context, options);
expect(secondResponse.stopReason).toBe("stop");
expect(secondResponse.errorMessage).toBeFalsy();
// Verify the LLM can see both text and image
const textContent = secondResponse.content.find((b) => b.type === "text");
expect(textContent).toBeTruthy();
if (textContent && textContent.type === "text") {
const lowerContent = textContent.text.toLowerCase();
// Should mention details from the text (diameter/pixels)
expect(lowerContent.match(/diameter|100|pixel/)).toBeTruthy();
// Should also mention the visual properties (red and circle)
expect(lowerContent).toContain("red");
expect(lowerContent).toContain("circle");
}
}
describe("Tool Results with Images", () => {
describe.skipIf(!process.env.GEMINI_API_KEY)("Google Provider (gemini-2.5-flash)", () => {
const llm = getModel("google", "gemini-2.5-flash");
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(llm);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(llm);
});
});
describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Completions Provider (gpt-4o-mini)", () => {
const llm: Model<"openai-completions"> = { ...getModel("openai", "gpt-4o-mini"), api: "openai-completions" };
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(llm);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(llm);
});
});
describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider (gpt-5-mini)", () => {
const llm = getModel("openai", "gpt-5-mini");
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(llm);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(llm);
});
});
describe.skipIf(!process.env.ANTHROPIC_API_KEY)("Anthropic Provider (claude-haiku-4-5)", () => {
const model = getModel("anthropic", "claude-haiku-4-5");
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(model);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(model);
});
});
describe.skipIf(!process.env.OPENROUTER_API_KEY)("OpenRouter Provider (glm-4.5v)", () => {
const llm = getModel("openrouter", "z-ai/glm-4.5v");
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(llm);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(llm);
});
});
describe.skipIf(!process.env.MISTRAL_API_KEY)("Mistral Provider (pixtral-12b)", () => {
const llm = getModel("mistral", "pixtral-12b");
it("should handle tool result with only image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithImageResult(llm);
});
it("should handle tool result with text and image", { retry: 3, timeout: 30000 }, async () => {
await handleToolWithTextAndImageResult(llm);
});
});
// =========================================================================
// OAuth-based providers (credentials from ~/.pi/agent/oauth.json)
// =========================================================================
describe("Anthropic OAuth Provider (claude-sonnet-4-5)", () => {
const model = getModel("anthropic", "claude-sonnet-4-5");
it.skipIf(!anthropicOAuthToken)(
"should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
await handleToolWithImageResult(model, { apiKey: anthropicOAuthToken });
},
);
it.skipIf(!anthropicOAuthToken)(
"should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
await handleToolWithTextAndImageResult(model, { apiKey: anthropicOAuthToken });
},
);
});
describe("GitHub Copilot Provider", () => {
it.skipIf(!githubCopilotToken)(
"gpt-4o - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("github-copilot", "gpt-4o");
await handleToolWithImageResult(llm, { apiKey: githubCopilotToken });
},
);
it.skipIf(!githubCopilotToken)(
"gpt-4o - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("github-copilot", "gpt-4o");
await handleToolWithTextAndImageResult(llm, { apiKey: githubCopilotToken });
},
);
it.skipIf(!githubCopilotToken)(
"claude-sonnet-4 - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("github-copilot", "claude-sonnet-4");
await handleToolWithImageResult(llm, { apiKey: githubCopilotToken });
},
);
it.skipIf(!githubCopilotToken)(
"claude-sonnet-4 - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("github-copilot", "claude-sonnet-4");
await handleToolWithTextAndImageResult(llm, { apiKey: githubCopilotToken });
},
);
});
describe("Google Gemini CLI Provider", () => {
it.skipIf(!geminiCliToken)(
"gemini-2.5-flash - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-gemini-cli", "gemini-2.5-flash");
await handleToolWithImageResult(llm, { apiKey: geminiCliToken });
},
);
it.skipIf(!geminiCliToken)(
"gemini-2.5-flash - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-gemini-cli", "gemini-2.5-flash");
await handleToolWithTextAndImageResult(llm, { apiKey: geminiCliToken });
},
);
});
describe("Google Antigravity Provider", () => {
it.skipIf(!antigravityToken)(
"gemini-3-flash - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-antigravity", "gemini-3-flash");
await handleToolWithImageResult(llm, { apiKey: antigravityToken });
},
);
it.skipIf(!antigravityToken)(
"gemini-3-flash - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-antigravity", "gemini-3-flash");
await handleToolWithTextAndImageResult(llm, { apiKey: antigravityToken });
},
);
/** These two are disabled: the model simply won't call the tool here, though it works in pi.
it.skipIf(!antigravityToken)(
"claude-sonnet-4-5 - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-antigravity", "claude-sonnet-4-5");
await handleToolWithImageResult(llm, { apiKey: antigravityToken });
},
);
it.skipIf(!antigravityToken)(
"claude-sonnet-4-5 - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("google-antigravity", "claude-sonnet-4-5");
await handleToolWithTextAndImageResult(llm, { apiKey: antigravityToken });
},
); */
// Note: gpt-oss-120b-medium does not support images, so not tested here
});
describe("OpenAI Codex Provider", () => {
it.skipIf(!openaiCodexToken)(
"gpt-5.2-xhigh - should handle tool result with only image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("openai-codex", "gpt-5.2-xhigh");
await handleToolWithImageResult(llm, { apiKey: openaiCodexToken });
},
);
it.skipIf(!openaiCodexToken)(
"gpt-5.2-xhigh - should handle tool result with text and image",
{ retry: 3, timeout: 30000 },
async () => {
const llm = getModel("openai-codex", "gpt-5.2-xhigh");
await handleToolWithTextAndImageResult(llm, { apiKey: openaiCodexToken });
},
);
});
});