From 4ac0c6ea28b2e74c3d034e23b5ea464026ed6a47 Mon Sep 17 00:00:00 2001 From: Mario Zechner Date: Sat, 30 Aug 2025 18:12:36 +0200 Subject: [PATCH] docs(ai): Simplify ImageContent interface to base64-only approach - Change ImageContent to simple { type, data, mimeType } structure - Remove URL and file path support from core interface - Simplify provider converters to work with base64 data only - Update validation and implementation considerations - Clarify that preprocessing is user's responsibility --- packages/ai/docs/images.md | 144 +++++++++++-------------------------- 1 file changed, 41 insertions(+), 103 deletions(-) diff --git a/packages/ai/docs/images.md b/packages/ai/docs/images.md index 815fe2e4..82350015 100644 --- a/packages/ai/docs/images.md +++ b/packages/ai/docs/images.md @@ -154,14 +154,9 @@ This document describes how to submit images to different LLM provider APIs and ```typescript interface ImageContent { type: "image"; - source: ImageSource; - alt?: string; // Optional alt text for accessibility + data: string; // base64 encoded image data + mimeType: string; // e.g., "image/jpeg", "image/png" } - -type ImageSource = - | { type: "base64"; data: string; mimeType: string } - | { type: "url"; url: string } - | { type: "file"; path: string }; // Local file path ``` ### Unified Message Structure @@ -198,36 +193,14 @@ if (model.input.includes("image")) { function toAnthropicContent(content: (TextContent | ImageContent)[]) { return content.map(item => { if (item.type === "image") { - if (item.source.type === "base64") { - return { - type: "image", - source: { - type: "base64", - media_type: item.source.mimeType, - data: item.source.data - } - }; - } else if (item.source.type === "url") { - return { - type: "image", - source: { - type: "url", - url: item.source.url - } - }; - } else if (item.source.type === "file") { - // Read file and convert to base64 - const data = fs.readFileSync(item.source.path).toString('base64'); - const mimeType = 
getMimeType(item.source.path); - return { - type: "image", - source: { - type: "base64", - media_type: mimeType, - data - } - }; - } + return { + type: "image", + source: { + type: "base64", + media_type: item.mimeType, + data: item.data + } + }; } return { type: "text", text: item.text }; }); @@ -237,29 +210,12 @@ function toAnthropicContent(content: (TextContent | ImageContent)[]) { function toOpenAIContent(content: (TextContent | ImageContent)[]) { return content.map(item => { if (item.type === "image") { - if (item.source.type === "base64") { - return { - type: "image_url", - image_url: { - url: `data:${item.source.mimeType};base64,${item.source.data}` - } - }; - } else if (item.source.type === "url") { - return { - type: "image_url", - image_url: { url: item.source.url } - }; - } else if (item.source.type === "file") { - // Read and convert to data URL - const data = fs.readFileSync(item.source.path).toString('base64'); - const mimeType = getMimeType(item.source.path); - return { - type: "image_url", - image_url: { - url: `data:${mimeType};base64,${data}` - } - }; - } + return { + type: "image_url", + image_url: { + url: `data:${item.mimeType};base64,${item.data}` + } + }; } return { type: "text", text: item.text }; }); @@ -269,27 +225,12 @@ function toOpenAIContent(content: (TextContent | ImageContent)[]) { function toGoogleContent(content: (TextContent | ImageContent)[]) { return content.map(item => { if (item.type === "image") { - if (item.source.type === "base64") { - return { - inline_data: { - mime_type: item.source.mimeType, - data: item.source.data - } - }; - } else if (item.source.type === "url") { - // Google doesn't support external URLs directly - // Would need to fetch and convert to base64 - throw new Error("Google GenAI requires base64 or File API for images"); - } else if (item.source.type === "file") { - const data = fs.readFileSync(item.source.path).toString('base64'); - const mimeType = getMimeType(item.source.path); - return { - 
inline_data: { - mime_type: mimeType, - data - } - }; - } + return { + inline_data: { + mime_type: item.mimeType, + data: item.data + } + }; } return { text: item.text }; }); @@ -332,23 +273,18 @@ const PROVIDER_CONSTRAINTS: Record = { }; async function validateImage( - source: ImageSource, + image: ImageContent, provider: string ): Promise { const constraints = PROVIDER_CONSTRAINTS[provider]; - // Get image data - let imageBuffer: Buffer; - if (source.type === "file") { - imageBuffer = await fs.readFile(source.path); - } else if (source.type === "base64") { - imageBuffer = Buffer.from(source.data, 'base64'); - } else { - // For URLs, might need to fetch and validate - return; + // Check MIME type + if (!constraints.supportedFormats.includes(image.mimeType)) { + throw new Error(`Unsupported image format: ${image.mimeType}`); } // Check size + const imageBuffer = Buffer.from(image.data, 'base64'); const sizeMB = imageBuffer.length / (1024 * 1024); if (sizeMB > constraints.maxSizeMB) { throw new Error(`Image exceeds ${constraints.maxSizeMB}MB limit`); @@ -360,25 +296,27 @@ async function validateImage( ## Implementation Considerations -1. **Automatic Format Conversion**: - - Convert URLs to base64 for providers that don't support URLs - - Handle file paths by reading and encoding files - - Optimize image size/quality when needed +1. **Preprocessing**: + - User is responsible for converting images to base64 before passing to API + - Utility functions could be provided for common conversions (file to base64, URL to base64) + - Image optimization (resize/compress) should happen before encoding 2. **Error Handling**: - - Validate image formats before sending - - Check model capabilities + - Validate MIME types and sizes before sending + - Check model capabilities (via `model.input.includes("image")`) - Provide clear error messages for unsupported features 3. 
**Performance**: - - Cache base64 encodings for reused images - - Stream large images when possible - - Consider using provider-specific file upload APIs for large images + - Base64 encoding increases payload size by ~33% + - Consider image compression before encoding + - For Google GenAI, be aware of the 20MB total request limit 4. **Token Counting**: - - Images consume tokens (varies by provider) + - Images consume tokens (varies by provider and image size) - Include image token estimates in usage calculations + - Anthropic: token cost scales with image pixel dimensions (approximately width × height / 750 tokens), not with base64 byte count + - OpenAI: High-detail images consume more tokens than low-detail images 5. **Fallback Strategies**: - - If model doesn't support images, extract text description - - Offer image-to-text preprocessing options \ No newline at end of file + - If the model doesn't support images, throw an error or ignore the images + - Consider offering text-only fallback for non-vision models \ No newline at end of file