docs(ai): Simplify ImageContent interface to base64-only approach

- Change ImageContent to simple { type, data, mimeType } structure
- Remove URL and file path support from core interface
- Simplify provider converters to work with base64 data only
- Update validation and implementation considerations
- Clarify that preprocessing is the user's responsibility
This commit is contained in:
Mario Zechner 2025-08-30 18:12:36 +02:00
parent 0b50c3f36d
commit 4ac0c6ea28

View file

@ -154,14 +154,9 @@ This document describes how to submit images to different LLM provider APIs and
```typescript ```typescript
interface ImageContent { interface ImageContent {
type: "image"; type: "image";
source: ImageSource; data: string; // base64 encoded image data
alt?: string; // Optional alt text for accessibility mimeType: string; // e.g., "image/jpeg", "image/png"
} }
type ImageSource =
| { type: "base64"; data: string; mimeType: string }
| { type: "url"; url: string }
| { type: "file"; path: string }; // Local file path
``` ```
### Unified Message Structure ### Unified Message Structure
@ -198,36 +193,14 @@ if (model.input.includes("image")) {
function toAnthropicContent(content: (TextContent | ImageContent)[]) { function toAnthropicContent(content: (TextContent | ImageContent)[]) {
return content.map(item => { return content.map(item => {
if (item.type === "image") { if (item.type === "image") {
if (item.source.type === "base64") { return {
return { type: "image",
type: "image", source: {
source: { type: "base64",
type: "base64", media_type: item.mimeType,
media_type: item.source.mimeType, data: item.data
data: item.source.data }
} };
};
} else if (item.source.type === "url") {
return {
type: "image",
source: {
type: "url",
url: item.source.url
}
};
} else if (item.source.type === "file") {
// Read file and convert to base64
const data = fs.readFileSync(item.source.path).toString('base64');
const mimeType = getMimeType(item.source.path);
return {
type: "image",
source: {
type: "base64",
media_type: mimeType,
data
}
};
}
} }
return { type: "text", text: item.text }; return { type: "text", text: item.text };
}); });
@ -237,29 +210,12 @@ function toAnthropicContent(content: (TextContent | ImageContent)[]) {
function toOpenAIContent(content: (TextContent | ImageContent)[]) { function toOpenAIContent(content: (TextContent | ImageContent)[]) {
return content.map(item => { return content.map(item => {
if (item.type === "image") { if (item.type === "image") {
if (item.source.type === "base64") { return {
return { type: "image_url",
type: "image_url", image_url: {
image_url: { url: `data:${item.mimeType};base64,${item.data}`
url: `data:${item.source.mimeType};base64,${item.source.data}` }
} };
};
} else if (item.source.type === "url") {
return {
type: "image_url",
image_url: { url: item.source.url }
};
} else if (item.source.type === "file") {
// Read and convert to data URL
const data = fs.readFileSync(item.source.path).toString('base64');
const mimeType = getMimeType(item.source.path);
return {
type: "image_url",
image_url: {
url: `data:${mimeType};base64,${data}`
}
};
}
} }
return { type: "text", text: item.text }; return { type: "text", text: item.text };
}); });
@ -269,27 +225,12 @@ function toOpenAIContent(content: (TextContent | ImageContent)[]) {
function toGoogleContent(content: (TextContent | ImageContent)[]) { function toGoogleContent(content: (TextContent | ImageContent)[]) {
return content.map(item => { return content.map(item => {
if (item.type === "image") { if (item.type === "image") {
if (item.source.type === "base64") { return {
return { inline_data: {
inline_data: { mime_type: item.mimeType,
mime_type: item.source.mimeType, data: item.data
data: item.source.data }
} };
};
} else if (item.source.type === "url") {
// Google doesn't support external URLs directly
// Would need to fetch and convert to base64
throw new Error("Google GenAI requires base64 or File API for images");
} else if (item.source.type === "file") {
const data = fs.readFileSync(item.source.path).toString('base64');
const mimeType = getMimeType(item.source.path);
return {
inline_data: {
mime_type: mimeType,
data
}
};
}
} }
return { text: item.text }; return { text: item.text };
}); });
@ -332,23 +273,18 @@ const PROVIDER_CONSTRAINTS: Record<string, ImageConstraints> = {
}; };
async function validateImage( async function validateImage(
source: ImageSource, image: ImageContent,
provider: string provider: string
): Promise<void> { ): Promise<void> {
const constraints = PROVIDER_CONSTRAINTS[provider]; const constraints = PROVIDER_CONSTRAINTS[provider];
// Get image data // Check MIME type
let imageBuffer: Buffer; if (!constraints.supportedFormats.includes(image.mimeType)) {
if (source.type === "file") { throw new Error(`Unsupported image format: ${image.mimeType}`);
imageBuffer = await fs.readFile(source.path);
} else if (source.type === "base64") {
imageBuffer = Buffer.from(source.data, 'base64');
} else {
// For URLs, might need to fetch and validate
return;
} }
// Check size // Check size
const imageBuffer = Buffer.from(image.data, 'base64');
const sizeMB = imageBuffer.length / (1024 * 1024); const sizeMB = imageBuffer.length / (1024 * 1024);
if (sizeMB > constraints.maxSizeMB) { if (sizeMB > constraints.maxSizeMB) {
throw new Error(`Image exceeds ${constraints.maxSizeMB}MB limit`); throw new Error(`Image exceeds ${constraints.maxSizeMB}MB limit`);
@ -360,25 +296,27 @@ async function validateImage(
## Implementation Considerations ## Implementation Considerations
1. **Automatic Format Conversion**: 1. **Preprocessing**:
- Convert URLs to base64 for providers that don't support URLs - The user is responsible for converting images to base64 before passing them to the API
- Handle file paths by reading and encoding files - Utility functions could be provided for common conversions (file to base64, URL to base64)
- Optimize image size/quality when needed - Image optimization (resize/compress) should happen before encoding
2. **Error Handling**: 2. **Error Handling**:
- Validate image formats before sending - Validate MIME types and sizes before sending
- Check model capabilities - Check model capabilities (via `model.input.includes("image")`)
- Provide clear error messages for unsupported features - Provide clear error messages for unsupported features
3. **Performance**: 3. **Performance**:
- Cache base64 encodings for reused images - Base64 encoding increases payload size by ~33%
- Stream large images when possible - Consider image compression before encoding
- Consider using provider-specific file upload APIs for large images - For Google GenAI, be aware of the 20MB total request limit
4. **Token Counting**: 4. **Token Counting**:
- Images consume tokens (varies by provider) - Images consume tokens (varies by provider and image size)
- Include image token estimates in usage calculations - Include image token estimates in usage calculations
- Anthropic: token cost scales with image dimensions (roughly width × height / 750 tokens), not with the size of the base64 data
- OpenAI: Images sent with `detail: "high"` consume more tokens than images sent with `detail: "low"`
5. **Fallback Strategies**: 5. **Fallback Strategies**:
- If model doesn't support images, extract text description - If the model doesn't support images, throw an error or ignore the images
- Offer image-to-text preprocessing options - Consider offering text-only fallback for non-vision models