feat(ai): Add image input tests for vision-capable models

- Added image tests to OpenAI Completions (gpt-4o-mini)
- Added image tests to OpenAI Responses
- Added image tests to Anthropic (claude-sonnet-4-0)
- Added image tests to Google (gemini-2.5-flash)
- Tests verify that the models can process and describe the red-circle test image
Mario Zechner 2025-08-30 18:37:17 +02:00
parent 4ac0c6ea28
commit 796e48b80e
10 changed files with 692 additions and 27 deletions
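
For context, these tests drive the new mixed text-and-image user content end to end. A minimal sketch of the calling pattern, using only types and calls that appear in this diff (llm stands for any provider instance implementing the LLM interface; base64Png is placeholder base64 image data):

const context: Context = {
    messages: [
        {
            role: "user",
            content: [
                { type: "text", text: "What do you see in this image?" },
                { type: "image", data: base64Png, mimeType: "image/png" },
            ],
        },
    ],
};
const response = await llm.complete(context);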

View file: package.json

@@ -44,6 +44,7 @@
     },
     "devDependencies": {
         "@types/node": "^24.3.0",
+        "canvas": "^3.2.0",
         "vitest": "^3.2.4"
     }
 }

View file

@@ -0,0 +1,34 @@
+#!/usr/bin/env tsx
+import { createCanvas } from "canvas";
+import { mkdirSync, writeFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+// Create a 200x200 canvas
+const canvas = createCanvas(200, 200);
+const ctx = canvas.getContext("2d");
+
+// Fill the background with white
+ctx.fillStyle = "white";
+ctx.fillRect(0, 0, 200, 200);
+
+// Draw a red circle in the center
+ctx.fillStyle = "red";
+ctx.beginPath();
+ctx.arc(100, 100, 50, 0, Math.PI * 2);
+ctx.fill();
+
+// Ensure the output directory exists, then save the image as PNG
+const buffer = canvas.toBuffer("image/png");
+const outputPath = join(__dirname, "..", "test", "data", "red-circle.png");
+mkdirSync(join(__dirname, "..", "test", "data"), { recursive: true });
+writeFileSync(outputPath, buffer);
+console.log(`Generated test image at: ${outputPath}`);
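
The shebang makes the script runnable directly under tsx. Assuming it is saved as, say, scripts/generate-test-image.ts (the file path is not shown in this view), it can be run with npx tsx scripts/generate-test-image.ts; the canvas package added to devDependencies above supplies the node-canvas bindings it draws with.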

View file: src/providers/anthropic.ts

@@ -236,10 +236,37 @@ export class AnthropicLLM implements LLM<AnthropicLLMOptions> {
         for (const msg of messages) {
             if (msg.role === "user") {
-                params.push({
-                    role: "user",
-                    content: msg.content,
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    params.push({
+                        role: "user",
+                        content: msg.content,
+                    });
+                } else {
+                    // Convert array content to Anthropic format
+                    const blocks: ContentBlockParam[] = msg.content.map((item) => {
+                        if (item.type === "text") {
+                            return {
+                                type: "text",
+                                text: item.text,
+                            };
+                        } else {
+                            // Image content
+                            return {
+                                type: "image",
+                                source: {
+                                    type: "base64",
+                                    media_type: item.mimeType as "image/jpeg" | "image/png" | "image/gif" | "image/webp",
+                                    data: item.data,
+                                },
+                            };
+                        }
+                    });
+                    params.push({
+                        role: "user",
+                        content: blocks,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 const blocks: ContentBlockParam[] = [];

View file: src/providers/google.ts

@@ -1,9 +1,11 @@
 import {
+    type Content,
     type FinishReason,
     FunctionCallingConfigMode,
     type GenerateContentConfig,
     type GenerateContentParameters,
     GoogleGenAI,
+    type Part,
 } from "@google/genai";
 import { calculateCost } from "../models.js";
 import type {
@@ -247,17 +249,39 @@ export class GoogleLLM implements LLM<GoogleLLMOptions> {
         }
     }

-    private convertMessages(messages: Message[]): any[] {
-        const contents: any[] = [];
+    private convertMessages(messages: Message[]): Content[] {
+        const contents: Content[] = [];
         for (const msg of messages) {
             if (msg.role === "user") {
-                contents.push({
-                    role: "user",
-                    parts: [{ text: msg.content }],
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    contents.push({
+                        role: "user",
+                        parts: [{ text: msg.content }],
+                    });
+                } else {
+                    // Convert array content to Google format
+                    const parts: Part[] = msg.content.map((item) => {
+                        if (item.type === "text") {
+                            return { text: item.text };
+                        } else {
+                            // Image content - Google uses inlineData
+                            return {
+                                inlineData: {
+                                    mimeType: item.mimeType,
+                                    data: item.data,
+                                },
+                            };
+                        }
+                    });
+                    contents.push({
+                        role: "user",
+                        parts,
+                    });
+                }
             } else if (msg.role === "assistant") {
-                const parts: any[] = [];
+                const parts: Part[] = [];
                 // Add thinking if present
                 // Note: We include thinkingSignature in our response for multi-turn context,

View file: src/providers/openai-completions.ts

@@ -1,5 +1,11 @@
 import OpenAI from "openai";
-import type { ChatCompletionChunk, ChatCompletionMessageParam } from "openai/resources/chat/completions.js";
+import type {
+    ChatCompletionChunk,
+    ChatCompletionContentPart,
+    ChatCompletionContentPartImage,
+    ChatCompletionContentPartText,
+    ChatCompletionMessageParam,
+} from "openai/resources/chat/completions.js";
 import { calculateCost } from "../models.js";
 import type {
     AssistantMessage,
@@ -264,10 +270,35 @@ export class OpenAICompletionsLLM implements LLM<OpenAICompletionsLLMOptions> {
         // Convert messages
         for (const msg of messages) {
             if (msg.role === "user") {
-                params.push({
-                    role: "user",
-                    content: msg.content,
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    params.push({
+                        role: "user",
+                        content: msg.content,
+                    });
+                } else {
+                    // Convert array content to OpenAI format
+                    const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
+                        if (item.type === "text") {
+                            return {
+                                type: "text",
+                                text: item.text,
+                            } satisfies ChatCompletionContentPartText;
+                        } else {
+                            // Image content - OpenAI uses data URLs
+                            return {
+                                type: "image_url",
+                                image_url: {
+                                    url: `data:${item.mimeType};base64,${item.data}`,
+                                },
+                            } satisfies ChatCompletionContentPartImage;
+                        }
+                    });
+                    params.push({
+                        role: "user",
+                        content,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 const assistantMsg: ChatCompletionMessageParam = {
                     role: "assistant",

View file: src/providers/openai-responses.ts

@@ -3,6 +3,9 @@ import type {
     Tool as OpenAITool,
     ResponseCreateParamsStreaming,
     ResponseInput,
+    ResponseInputContent,
+    ResponseInputImage,
+    ResponseInputText,
     ResponseReasoningItem,
 } from "openai/resources/responses/responses.js";
 import type {
@@ -205,10 +208,34 @@ export class OpenAIResponsesLLM implements LLM<OpenAIResponsesLLMOptions> {
         // Convert messages
         for (const msg of messages) {
             if (msg.role === "user") {
-                input.push({
-                    role: "user",
-                    content: [{ type: "input_text", text: msg.content }],
-                });
+                // Handle both string and array content
+                if (typeof msg.content === "string") {
+                    input.push({
+                        role: "user",
+                        content: [{ type: "input_text", text: msg.content }],
+                    });
+                } else {
+                    // Convert array content to OpenAI Responses format
+                    const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
+                        if (item.type === "text") {
+                            return {
+                                type: "input_text",
+                                text: item.text,
+                            } satisfies ResponseInputText;
+                        } else {
+                            // Image content - OpenAI Responses uses data URLs
+                            return {
+                                type: "input_image",
+                                detail: "auto",
+                                image_url: `data:${item.mimeType};base64,${item.data}`,
+                            } satisfies ResponseInputImage;
+                        }
+                    });
+                    input.push({
+                        role: "user",
+                        content,
+                    });
+                }
             } else if (msg.role === "assistant") {
                 // Assistant messages - add both content and tool calls to output
                 const output: ResponseInput = [];
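
For a side-by-side view, the conversions above map one ImageContent item (mimeType "image/png", base64 payload in data) to the following provider-specific shapes, with field names exactly as in the diffs above:

// Anthropic: base64 source block
{ type: "image", source: { type: "base64", media_type: "image/png", data } }

// Google: inlineData part
{ inlineData: { mimeType: "image/png", data } }

// OpenAI Chat Completions: image_url part wrapping a data URL
{ type: "image_url", image_url: { url: `data:image/png;base64,${data}` } }

// OpenAI Responses: input_image with a data-URL string
{ type: "input_image", detail: "auto", image_url: `data:image/png;base64,${data}` }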

View file: src/types.ts

@@ -11,15 +11,27 @@ export interface LLM<T extends LLMOptions> {
     getModel(): Model;
 }

+export interface TextContent {
+    type: "text";
+    text: string;
+}
+
+export interface ImageContent {
+    type: "image";
+    data: string; // base64 encoded image data
+    mimeType: string; // e.g., "image/jpeg", "image/png"
+}
+
 export interface UserMessage {
     role: "user";
-    content: string;
+    content: string | (TextContent | ImageContent)[];
 }

 export interface AssistantMessage {
     role: "assistant";
     thinking?: string;
-    thinkingSignature?: string; // Leaky abstraction: needed for Anthropic
+    // Leaky abstraction: provider specific, does not translate to other providers
+    thinkingSignature?: string;
     content?: string;
     toolCalls?: {
         id: string;
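
Because content is now string | (TextContent | ImageContent)[], plain-string messages remain valid and existing call sites compile unchanged; both of these type-check:

const plain: UserMessage = { role: "user", content: "Hello" };
const rich: UserMessage = { role: "user", content: [{ type: "text", text: "Hello" }] };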

test/data/red-circle.png: binary file not shown (PNG, 2.5 KiB)

View file

@@ -3,9 +3,15 @@ import { GoogleLLM } from "../src/providers/google.js";
 import { OpenAICompletionsLLM } from "../src/providers/openai-completions.js";
 import { OpenAIResponsesLLM } from "../src/providers/openai-responses.js";
 import { AnthropicLLM } from "../src/providers/anthropic.js";
-import type { LLM, LLMOptions, Context, Tool, AssistantMessage, Model } from "../src/types.js";
+import type { LLM, LLMOptions, Context, Tool, AssistantMessage, Model, ImageContent } from "../src/types.js";
 import { spawn, ChildProcess, execSync } from "child_process";
 import { createLLM, getModel } from "../src/models.js";
+import { readFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);

 // Calculator tool definition (same as examples)
 const calculatorTool: Tool = {
@@ -105,6 +111,46 @@ async function handleThinking<T extends LLMOptions>(llm: LLM<T>, options: T, req
     }
 }

+async function handleImage<T extends LLMOptions>(llm: LLM<T>) {
+    // Check if the model supports images
+    const model = llm.getModel();
+    if (!model.input.includes("image")) {
+        console.log(`Skipping image test - model ${model.id} doesn't support images`);
+        return;
+    }
+
+    // Read the test image
+    const imagePath = join(__dirname, "data", "red-circle.png");
+    const imageBuffer = readFileSync(imagePath);
+    const base64Image = imageBuffer.toString("base64");
+
+    const imageContent: ImageContent = {
+        type: "image",
+        data: base64Image,
+        mimeType: "image/png",
+    };
+
+    const context: Context = {
+        messages: [
+            {
+                role: "user",
+                content: [
+                    { type: "text", text: "What do you see in this image? Please describe the shape and color." },
+                    imageContent,
+                ],
+            },
+        ],
+    };
+
+    const response = await llm.complete(context);
+
+    // Check the response mentions red and circle
+    expect(response.content).toBeTruthy();
+    const lowerContent = response.content?.toLowerCase() || "";
+    expect(lowerContent).toContain("red");
+    expect(lowerContent).toContain("circle");
+}
+
 async function multiTurn<T extends LLMOptions>(llm: LLM<T>, thinkingOptions: T) {
     const context: Context = {
         systemPrompt: "You are a helpful assistant that can use tools to answer questions.",
@@ -259,6 +305,10 @@ describe("AI Providers E2E Tests", () => {
         it("should handle streaming", async () => {
             await handleStreaming(llm);
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.OPENAI_API_KEY)("OpenAI Responses Provider", () => {
@@ -287,6 +337,10 @@
         it("should handle multi-turn with thinking and tools", async () => {
             await multiTurn(llm, {reasoningEffort: "medium"});
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.ANTHROPIC_OAUTH_TOKEN)("Anthropic Provider", () => {
@@ -315,6 +369,10 @@
         it("should handle multi-turn with thinking and tools", async () => {
             await multiTurn(llm, {thinking: { enabled: true, budgetTokens: 2048 }});
         });
+
+        it("should handle image input", async () => {
+            await handleImage(llm);
+        });
     });

     describe.skipIf(!process.env.XAI_API_KEY)("xAI Provider (via OpenAI Completions)", () => {