feat(ai): Add image input tests for vision-capable models

- Added image tests to OpenAI Completions (gpt-4o-mini)
- Added image tests to OpenAI Responses
- Added image tests to Anthropic (claude-sonnet-4-0)
- Added image tests to Google (gemini-2.5-flash)
- Tests verify that the models can process and describe the red-circle test image
Mario Zechner 2025-08-30 18:37:17 +02:00
parent 4ac0c6ea28
commit 796e48b80e
10 changed files with 692 additions and 27 deletions
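
For context, these tests drive the new mixed text-and-image user content end to end. A minimal sketch of the calling pattern, using only types and calls that appear in this diff (llm stands for any provider instance implementing the LLM interface; base64Png is placeholder base64 image data):

const context: Context = {
    messages: [
        {
            role: "user",
            content: [
                { type: "text", text: "What do you see in this image?" },
                { type: "image", data: base64Png, mimeType: "image/png" },
            ],
        },
    ],
};
const response = await llm.complete(context);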

View file: package.json

@@ -44,6 +44,7 @@
     },
     "devDependencies": {
         "@types/node": "^24.3.0",
+        "canvas": "^3.2.0",
         "vitest": "^3.2.4"
     }
 }

View file

@@ -0,0 +1,34 @@
+#!/usr/bin/env tsx
+import { createCanvas } from "canvas";
+import { mkdirSync, writeFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// Create a 200x200 canvas
+const canvas = createCanvas(200, 200);
+const ctx = canvas.getContext("2d");
+
+// Fill the background with white
+ctx.fillStyle = "white";
+ctx.fillRect(0, 0, 200, 200);
+
+// Draw a red circle in the center
+ctx.fillStyle = "red";
+ctx.beginPath();
+ctx.arc(100, 100, 50, 0, Math.PI * 2);
+ctx.fill();
+
+// Ensure the output directory exists, then save the image as PNG
+const buffer = canvas.toBuffer("image/png");
+const outputPath = join(__dirname, "..", "test", "data", "red-circle.png");
+mkdirSync(join(__dirname, "..", "test", "data"), { recursive: true });
+writeFileSync(outputPath, buffer);
+console.log(`Generated test image at: ${outputPath}`);
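
The shebang makes the script runnable directly under tsx. Assuming it is saved as, say, scripts/generate-test-image.ts (the file path is not shown in this view), it can be run with npx tsx scripts/generate-test-image.ts; the canvas package added to devDependencies above supplies the node-canvas bindings it draws with.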

View file: src/providers/anthropic.ts

@@ -236,10 +236,37 @@ export class AnthropicLLM implements LLM<AnthropicLLMOptions> {
         for (const msg of messages) {
             if (msg.role === "user") {
-                params.push({
-                    role: "user",
-                    content: msg.content,
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    params.push({
+                        role: "user",
+                        content: msg.content,
+                    });
+                } else {
+                    // Convert array content to Anthropic format
+                    const blocks: ContentBlockParam[] = msg.content.map((item) => {
+                        if (item.type === "text") {
+                            return {
+                                type: "text",
+                                text: item.text,
+                            };
+                        } else {
+                            // Image content
+                            return {
+                                type: "image",
+                                source: {
+                                    type: "base64",
+                                    media_type: item.mimeType as "image/jpeg" | "image/png" | "image/gif" | "image/webp",
+                                    data: item.data,
+                                },
+                            };
+                        }
+                    });
+                    params.push({
+                        role: "user",
+                        content: blocks,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 const blocks: ContentBlockParam[] = [];

View file: src/providers/google.ts

@@ -1,9 +1,11 @@
 import {
+    type Content,
     type FinishReason,
     FunctionCallingConfigMode,
     type GenerateContentConfig,
     type GenerateContentParameters,
     GoogleGenAI,
+    type Part,
 } from "@google/genai";
 import { calculateCost } from "../models.js";
 import type {
@@ -247,17 +249,39 @@ export class GoogleLLM implements LLM<GoogleLLMOptions> {
         }
     }

-    private convertMessages(messages: Message[]): any[] {
-        const contents: any[] = [];
+    private convertMessages(messages: Message[]): Content[] {
+        const contents: Content[] = [];
         for (const msg of messages) {
             if (msg.role === "user") {
-                contents.push({
-                    role: "user",
-                    parts: [{ text: msg.content }],
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    contents.push({
+                        role: "user",
+                        parts: [{ text: msg.content }],
+                    });
+                } else {
+                    // Convert array content to Google format
+                    const parts: Part[] = msg.content.map((item) => {
+                        if (item.type === "text") {
+                            return { text: item.text };
+                        } else {
+                            // Image content - Google uses inlineData
+                            return {
+                                inlineData: {
+                                    mimeType: item.mimeType,
+                                    data: item.data,
+                                },
+                            };
+                        }
+                    });
+                    contents.push({
+                        role: "user",
+                        parts,
+                    });
+                }
             } else if (msg.role === "assistant") {
-                const parts: any[] = [];
+                const parts: Part[] = [];
                 // Add thinking if present
                 // Note: We include thinkingSignature in our response for multi-turn context,

View file: src/providers/openai-completions.ts

@@ -1,5 +1,11 @@
 import OpenAI from "openai";
-import type { ChatCompletionChunk, ChatCompletionMessageParam } from "openai/resources/chat/completions.js";
+import type {
+    ChatCompletionChunk,
+    ChatCompletionContentPart,
+    ChatCompletionContentPartImage,
+    ChatCompletionContentPartText,
+    ChatCompletionMessageParam,
+} from "openai/resources/chat/completions.js";
 import { calculateCost } from "../models.js";
 import type {
     AssistantMessage,
@@ -264,10 +270,35 @@ export class OpenAICompletionsLLM implements LLM<OpenAICompletionsLLMOptions> {
         // Convert messages
         for (const msg of messages) {
             if (msg.role === "user") {
-                params.push({
-                    role: "user",
-                    content: msg.content,
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    params.push({
+                        role: "user",
+                        content: msg.content,
+                    });
+                } else {
+                    // Convert array content to OpenAI format
+                    const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
+                        if (item.type === "text") {
+                            return {
+                                type: "text",
+                                text: item.text,
+                            } satisfies ChatCompletionContentPartText;
+                        } else {
+                            // Image content - OpenAI uses data URLs
+                            return {
+                                type: "image_url",
+                                image_url: {
+                                    url: `data:${item.mimeType};base64,${item.data}`,
+                                },
+                            } satisfies ChatCompletionContentPartImage;
+                        }
+                    });
+                    params.push({
+                        role: "user",
+                        content,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 const assistantMsg: ChatCompletionMessageParam = {
                     role: "assistant",

View file: src/providers/openai-responses.ts

@@ -3,6 +3,9 @@ import type {
     Tool as OpenAITool,
     ResponseCreateParamsStreaming,
     ResponseInput,
+    ResponseInputContent,
+    ResponseInputImage,
+    ResponseInputText,
     ResponseReasoningItem,
 } from "openai/resources/responses/responses.js";
 import type {
@@ -205,10 +208,34 @@ export class OpenAIResponsesLLM implements LLM<OpenAIResponsesLLMOptions> {
         // Convert messages
         for (const msg of messages) {
             if (msg.role === "user") {
-                input.push({
-                    role: "user",
-                    content: [{ type: "input_text", text: msg.content }],
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    input.push({
+                        role: "user",
+                        content: [{ type: "input_text", text: msg.content }],
+                    });
+                } else {
+                    // Convert array content to OpenAI Responses format
+                    const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
+                        if (item.type === "text") {
+                            return {
+                                type: "input_text",
+                                text: item.text,
+                            } satisfies ResponseInputText;
+                        } else {
+                            // Image content - OpenAI Responses uses data URLs
+                            return {
+                                type: "input_image",
+                                detail: "auto",
+                                image_url: `data:${item.mimeType};base64,${item.data}`,
+                            } satisfies ResponseInputImage;
+                        }
+                    });
+                    input.push({
+                        role: "user",
+                        content,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 // Assistant messages - add both content and tool calls to output
                 const output: ResponseInput = [];
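
For a side-by-side view, the conversions above map one ImageContent item (mimeType "image/png", base64 payload in data) to the following provider-specific shapes, with field names exactly as in the diffs above:

// Anthropic: base64 source block
{ type: "image", source: { type: "base64", media_type: "image/png", data } }

// Google: inlineData part
{ inlineData: { mimeType: "image/png", data } }

// OpenAI Chat Completions: image_url part wrapping a data URL
{ type: "image_url", image_url: { url: `data:image/png;base64,${data}` } }

// OpenAI Responses: input_image with a data-URL string
{ type: "input_image", detail: "auto", image_url: `data:image/png;base64,${data}` }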

View file: src/types.ts

@@ -11,15 +11,27 @@ export interface LLM<T extends LLMOptions> {
     getModel(): Model;
 }

+export interface TextContent {
+    type: "text";
+    text: string;
+}
+
+export interface ImageContent {
+    type: "image";
+    data: string; // base64 encoded image data
+    mimeType: string; // e.g., "image/jpeg", "image/png"
+}
+
 export interface UserMessage {
     role: "user";
-    content: string;
+    content: string | (TextContent | ImageContent)[];
 }

 export interface AssistantMessage {
     role: "assistant";
     thinking?: string;
-    thinkingSignature?: string; // Leaky abstraction: needed for Anthropic
+    // Leaky abstraction: provider specific, does not translate to other providers
+    thinkingSignature?: string;
     content?: string;
     toolCalls?: {
         id: string;
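
Because content is now string | (TextContent | ImageContent)[], plain-string messages remain valid and existing call sites compile unchanged; both of these type-check:

const plain: UserMessage = { role: "user", content: "Hello" };
const rich: UserMessage = { role: "user", content: [{ type: "text", text: "Hello" }] };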

test/data/red-circle.png: binary file not shown (PNG, 2.5 KiB)

View file

@@ -3,9 +3,15 @@ import { GoogleLLM } from "../src/providers/google.js";
 import { OpenAICompletionsLLM } from "../src/providers/openai-completions.js";
 import { OpenAIResponsesLLM } from "../src/providers/openai-responses.js";
 import { AnthropicLLM } from "../src/providers/anthropic.js";
-import type { LLM, LLMOptions, Context, Tool, AssistantMessage, Model } from "../src/types.js";
+import type { LLM, LLMOptions, Context, Tool, AssistantMessage, Model, ImageContent } from "../src/types.js";
 import { spawn, ChildProcess, execSync } from "child_process";
 import { createLLM, getModel } from "../src/models.js";
+import { readFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);

 // Calculator tool definition (same as examples)
 const calculatorTool: Tool = {
@@ -105,6 +111,46 @@ async function handleThinking<T extends LLMOptions>(llm: LLM<T>, options: T, req
     }
 }

+async function handleImage<T extends LLMOptions>(llm: LLM<T>) {
+    // Check if the model supports images
+    const model = llm.getModel();
+    if (!model.input.includes("image")) {
+        console.log(`Skipping image test - model ${model.id} doesn't support images`);
+        return;
+    }
+
+    // Read the test image
+    const imagePath = join(__dirname, "data", "red-circle.png");
+    const imageBuffer = readFileSync(imagePath);
+    const base64Image = imageBuffer.toString("base64");
+
+    const imageContent: ImageContent = {
+        type: "image",
+        data: base64Image,
+        mimeType: "image/png",
+    };
+
+    const context: Context = {
+        messages: [
+            {
+                role: "user",
+                content: [
+                    { type: "text", text: "What do you see in this image? Please describe the shape and color." },
+                    imageContent,
+                ],
+            },
+        ],
+    };
+
+    const response = await llm.complete(context);
+
+    // Check the response mentions red and circle
+    expect(response.content).toBeTruthy();
+    const lowerContent = response.content?.toLowerCase() || "";
+    expect(lowerContent).toContain("red");
+    expect(lowerContent).toContain("circle");
+}
+
 async function multiTurn<T extends LLMOptions>(llm: LLM<T>, thinkingOptions: T) {
     const context: Context = {
         systemPrompt: "You are a helpful assistant that can use tools to answer questions.",
@@ -259,6 +305,10 @@ describe("AI Providers E2E Tests", () => {
         it("should handle streaming", async () => {
             await handleStreaming(llm);
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider", () => {
@@ -287,6 +337,10 @@
         it("should handle multi-turn with thinking and tools", async () => {
             await multiTurn(llm, {reasoningEffort: "medium"});
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider", () => {
@@ -315,6 +369,10 @@
         it("should handle multi-turn with thinking and tools", async () => {
             await multiTurn(llm, {thinking: { enabled: true, budgetTokens: 2048 }});
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider (via OpenAI Completions)", () => {