feat(ai): Add image input tests for vision-capable models

- Added image tests to OpenAI Completions (gpt-4o-mini)
- Added image tests to Anthropic (claude-sonnet-4-0)
- Added image tests to Google (gemini-2.5-flash)
- Tests verify models can process and describe the red circle test image
This commit is contained in:
Mario Zechner 2025-08-30 18:37:17 +02:00
parent 4ac0c6ea28
commit 796e48b80e
10 changed files with 692 additions and 27 deletions

View file

@@ -236,10 +236,37 @@ export class AnthropicLLM implements LLM<AnthropicLLMOptions> {
for (const msg of messages) {
if (msg.role === "user") {
params.push({
role: "user",
content: msg.content,
});
// Handle both string and array content
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
});
} else {
// Convert array content to Anthropic format
const blocks: ContentBlockParam[] = msg.content.map((item) => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
};
} else {
// Image content
return {
type: "image",
source: {
type: "base64",
media_type: item.mimeType as "image/jpeg" | "image/png" | "image/gif" | "image/webp",
data: item.data,
},
};
}
});
params.push({
role: "user",
content: blocks,
});
}
} else if (msg.role === "assistant") {
const blocks: ContentBlockParam[] = [];

View file

@@ -1,9 +1,11 @@
import {
type Content,
type FinishReason,
FunctionCallingConfigMode,
type GenerateContentConfig,
type GenerateContentParameters,
GoogleGenAI,
type Part,
} from "@google/genai";
import { calculateCost } from "../models.js";
import type {
@@ -247,17 +249,39 @@ export class GoogleLLM implements LLM<GoogleLLMOptions> {
}
}
private convertMessages(messages: Message[]): any[] {
const contents: any[] = [];
private convertMessages(messages: Message[]): Content[] {
const contents: Content[] = [];
for (const msg of messages) {
if (msg.role === "user") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
});
// Handle both string and array content
if (typeof msg.content === "string") {
contents.push({
role: "user",
parts: [{ text: msg.content }],
});
} else {
// Convert array content to Google format
const parts: Part[] = msg.content.map((item) => {
if (item.type === "text") {
return { text: item.text };
} else {
// Image content - Google uses inlineData
return {
inlineData: {
mimeType: item.mimeType,
data: item.data,
},
};
}
});
contents.push({
role: "user",
parts,
});
}
} else if (msg.role === "assistant") {
const parts: any[] = [];
const parts: Part[] = [];
// Add thinking if present
// Note: We include thinkingSignature in our response for multi-turn context,

View file

@@ -1,5 +1,11 @@
import OpenAI from "openai";
import type { ChatCompletionChunk, ChatCompletionMessageParam } from "openai/resources/chat/completions.js";
import type {
ChatCompletionChunk,
ChatCompletionContentPart,
ChatCompletionContentPartImage,
ChatCompletionContentPartText,
ChatCompletionMessageParam,
} from "openai/resources/chat/completions.js";
import { calculateCost } from "../models.js";
import type {
AssistantMessage,
@@ -264,10 +270,35 @@ export class OpenAICompletionsLLM implements LLM<OpenAICompletionsLLMOptions> {
// Convert messages
for (const msg of messages) {
if (msg.role === "user") {
params.push({
role: "user",
content: msg.content,
});
// Handle both string and array content
if (typeof msg.content === "string") {
params.push({
role: "user",
content: msg.content,
});
} else {
// Convert array content to OpenAI format
const content: ChatCompletionContentPart[] = msg.content.map((item): ChatCompletionContentPart => {
if (item.type === "text") {
return {
type: "text",
text: item.text,
} satisfies ChatCompletionContentPartText;
} else {
// Image content - OpenAI uses data URLs
return {
type: "image_url",
image_url: {
url: `data:${item.mimeType};base64,${item.data}`,
},
} satisfies ChatCompletionContentPartImage;
}
});
params.push({
role: "user",
content,
});
}
} else if (msg.role === "assistant") {
const assistantMsg: ChatCompletionMessageParam = {
role: "assistant",

View file

@@ -3,6 +3,9 @@ import type {
Tool as OpenAITool,
ResponseCreateParamsStreaming,
ResponseInput,
ResponseInputContent,
ResponseInputImage,
ResponseInputText,
ResponseReasoningItem,
} from "openai/resources/responses/responses.js";
import type {
@@ -205,10 +208,34 @@ export class OpenAIResponsesLLM implements LLM<OpenAIResponsesLLMOptions> {
// Convert messages
for (const msg of messages) {
if (msg.role === "user") {
input.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
});
// Handle both string and array content
if (typeof msg.content === "string") {
input.push({
role: "user",
content: [{ type: "input_text", text: msg.content }],
});
} else {
// Convert array content to OpenAI Responses format
const content: ResponseInputContent[] = msg.content.map((item): ResponseInputContent => {
if (item.type === "text") {
return {
type: "input_text",
text: item.text,
} satisfies ResponseInputText;
} else {
// Image content - OpenAI Responses uses data URLs
return {
type: "input_image",
detail: "auto",
image_url: `data:${item.mimeType};base64,${item.data}`,
} satisfies ResponseInputImage;
}
});
input.push({
role: "user",
content,
});
}
} else if (msg.role === "assistant") {
// Assistant messages - add both content and tool calls to output
const output: ResponseInput = [];