feat(ai): Add image input tests for vision-capable models

- Added image tests to OpenAI Completions (gpt-4o-mini)
- Added image tests to Anthropic (claude-sonnet-4-0)
- Added image tests to Google (gemini-2.5-flash)
- Tests verify models can process and describe the red circle test image
This commit is contained in:
Mario Zechner 2025-08-30 18:37:17 +02:00
parent 4ac0c6ea28
commit 796e48b80e
10 changed files with 692 additions and 27 deletions

View file

@@ -236,10 +236,37 @@ export class AnthropicLLM implements LLM<AnthropicLLMOptions> {
for (const msg of messages) {
if (msg.role === "user") {
params.push({
role: "user",
content: msg.content,
});
// Handle both string and array content
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
});
} else {
// Convert array content to Anthropic format
const blocks: ContentBlockParam[] = msg.content.map((item) => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
};
} else {
// Image content
return {
type: "image",
source: {
type: "base64",
media_type: item.mimeType as "image/jpeg" | "image/png" | "image/gif" | "image/webp",
data: item.data,
},
};
}
});
params.push({
role: "user",
content: blocks,
});
}
} else if (msg.role === "assistant") {
const blocks: ContentBlockParam[] = [];

View file

@@ -1,9 +1,11 @@
import {
type Content,
type FinishReason,
FunctionCallingConfigMode,
type GenerateContentConfig,
type GenerateContentParameters,
GoogleGenAI,
type Part,
} from "@google/genai";
import { calculateCost } from "../models.js";
import type {
@@ -247,17 +249,39 @@ export class GoogleLLM implements LLM<GoogleLLMOptions> {
}
}
private convertMessages(messages: Message[]): any[] {
const contents: any[] = [];
private convertMessages(messages: Message[]): Content[] {
const contents: Content[] = [];
for (const msg of messages) {
if (msg.role === "user") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
});
// Handle both string and array content
if (typeof msg.content === "string") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
});
} else {
// Convert array content to Google format
const parts: Part[] = msg.content.map((item) => {
if (item.type === "text") {
return { text: item.text };
} else {
// Image content - Google uses inlineData
return {
inlineData: {
mimeType: item.mimeType,
data: item.data,
},
};
}
});
contents.push({
role: "user",
parts,
});
}
} else if (msg.role === "assistant") {
const parts: any[] = [];
const parts: Part[] = [];
// Add thinking if present
// Note: We include thinkingSignature in our response for multi-turn context,

View file

@@ -1,5 +1,11 @@
import OpenAI from "openai";
import type { ChatCompletionChunk, ChatCompletionMessageParam } from "openai/resources/chat/completions.js";
import type {
ChatCompletionChunk,
ChatCompletionContentPart,
ChatCompletionContentPartImage,
ChatCompletionContentPartText,
ChatCompletionMessageParam,
} from "openai/resources/chat/completions.js";
import { calculateCost } from "../models.js";
import type {
AssistantMessage,
@@ -264,10 +270,35 @@ export class OpenAICompletionsLLM implements LLM<OpenAICompletionsLLMOptions> {
// Convert messages
for (const msg of messages) {
if (msg.role === "user") {
params.push({
role: "user",
content: msg.content,
});
// Handle both string and array content
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
});
} else {
// Convert array content to OpenAI format
const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
} satisfies ChatCompletionContentPartText;
} else {
// Image content - OpenAI uses data URLs
return {
type: "image_url",
image_url: {
url: `data:${item.mimeType};base64,${item.data}`,
},
} satisfies ChatCompletionContentPartImage;
}
});
params.push({
role: "user",
content,
});
}
} else if (msg.role === "assistant") {
const assistantMsg: ChatCompletionMessageParam = {
role: "assistant",

View file

@@ -3,6 +3,9 @@ import type {
Tool as OpenAITool,
ResponseCreateParamsStreaming,
ResponseInput,
ResponseInputContent,
ResponseInputImage,
ResponseInputText,
ResponseReasoningItem,
} from "openai/resources/responses/responses.js";
import type {
@@ -205,10 +208,34 @@ export class OpenAIResponsesLLM implements LLM<OpenAIResponsesLLMOptions> {
// Convert messages
for (const msg of messages) {
if (msg.role === "user") {
input.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
});
// Handle both string and array content
if (typeof msg.content === "string") {
input.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
});
} else {
// Convert array content to OpenAI Responses format
const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
if (item.type === "text") {
return {
type: "input_text",
text: item.text,
} satisfies ResponseInputText;
} else {
// Image content - OpenAI Responses uses data URLs
return {
type: "input_image",
detail: "auto",
image_url: `data:${item.mimeType};base64,${item.data}`,
} satisfies ResponseInputImage;
}
});
input.push({
role: "user",
content,
});
}
} else if (msg.role === "assistant") {
// Assistant messages - add both content and tool calls to output
const output: ResponseInput = [];