diff --git a/packages/ai/test/image-limits.test.ts b/packages/ai/test/image-limits.test.ts index 7be5cdd5..e12e99db 100644 --- a/packages/ai/test/image-limits.test.ts +++ b/packages/ai/test/image-limits.test.ts @@ -2,61 +2,67 @@ * Image limits test suite * * Tests provider-specific image limitations: - * - Maximum number of images in a context + * - Maximum number of images in a context (with small 100x100 images) * - Maximum image size (bytes) * - Maximum image dimensions - * - Maximum 8k x 8k images (stress test) + * - Maximum payload (realistic large images stress test) * * ============================================================================ * DISCOVERED LIMITS (Dec 2025): * ============================================================================ * - * | Provider | Model | Max Images | Max Size | Max Dim | Max 8k Imgs | - * |-------------|--------------------|------------|----------|----------|-------------| - * | Anthropic | claude-3-5-haiku | 100 | 5MB | 8000px | 100 | - * | OpenAI | gpt-4o-mini | 500 | ≥25MB | ≥20000px | 100-200* | - * | Gemini | gemini-2.5-flash | ~2000** | ≥40MB | 8000px | (untested) | - * | Mistral | pixtral-12b | 8 | ~15MB | 8000px | 8 | - * | xAI | grok-2-vision | ≥100 | 25MB | 8000px | 100-150* | - * | Groq | llama-4-scout-17b | 5 | ~5MB | ~5760px | 0*** | - * | zAI | glm-4.5v | ≥100 | ≥20MB | 8000px | 400**** | - * | OpenRouter | z-ai/glm-4.5v | ~40**** | ~10MB | ≥20000px | 40**** | + * BASIC LIMITS (small images): + * | Provider | Model | Max Images | Max Size | Max Dim | + * |-------------|--------------------|------------|----------|----------| + * | Anthropic | claude-3-5-haiku | 100 | 5MB | 8000px | + * | OpenAI | gpt-4o-mini | 500 | ≥25MB | ≥20000px | + * | Gemini | gemini-2.5-flash | ~2000* | ≥40MB | 8000px | + * | Mistral | pixtral-12b | 8 | ~15MB | 8000px | + * | xAI | grok-2-vision | ≥100 | 25MB | 8000px | + * | Groq | llama-4-scout-17b | 5 | ~5MB | ~5760px**| + * | zAI | glm-4.5v | *** | ≥20MB | 8000px | + * | OpenRouter | z-ai/glm-4.5v | *** | ~10MB | ≥20000px | + * + * REALISTIC PAYLOAD LIMITS (large images): + * | Provider | Image Size | Max Count | Total Payload | Limit Hit | + * |-------------|------------|-----------|---------------|---------------------| + * | Anthropic | ~3MB | 6 | ~18MB | Request too large | + * | OpenAI | ~15MB | 2 | ~30MB | Generic error | + * | Gemini | ~20MB | 10 | ~200MB | String length | + * | Mistral | ~10MB | 4 | ~40MB | 413 Payload too large| + * | xAI | ~20MB | 1 | ~20MB | 413 Entity too large| + * | Groq | 5760px | 5 | N/A | 5 image limit | + * | zAI | ~15MB | 2 | ~30MB | 50MB request limit | + * | OpenRouter | ~5MB | 2 | ~10MB | Provider error | * * Notes: - * - Anthropic: Docs mention a "many images" rule (>20 images = 2000px max), - * but testing shows 100 x 8k images work fine. Anthropic may auto-resize - * internally. Total request size capped at 32MB. Explicit error at 101+. - * - OpenAI: * 100 x 8k succeeded, 200 x 8k failed with timeout. Actual limit - * likely between 100-200. Documented size limit is 20MB but ≥25MB works. - * - Gemini: ** Very permissive on count, hits rate limits before image limits. - * - Mistral: Very restrictive (8 images max). Explicit error at 9+. - * - xAI: * 100 x 8k succeeded, 150 x 8k timed out. 25MB size limit exact. - * - Groq: *** Most restrictive. 5 images max, 33177600 pixels max (≈5760x5760). - * 8k images (64M pixels) exceed limit, so 0 supported. - * - zAI: **** Context-window limited (65536 tokens). 400 x 8k succeeded, - * 500 x 8k exceeded token limit. - * - OpenRouter: **** Context-window limited (65536 tokens), not explicit - * image limit. 40 x 8k succeeded, 50 x 8k exceeded token limit. + * - Anthropic: 100 image hard limit, 5MB per image, but ~18MB total request + * limit in practice (32MB documented but hit limit at ~24MB). + * - OpenAI: 500 image limit but total payload limited to ~30-45MB. + * - Gemini: * Very permissive. 10 x 20MB = 200MB worked! + * - Mistral: 8 images max, ~40MB total payload. + * - xAI: 25MB per image but strict request size limit (~20MB total). + * - Groq: ** Most restrictive. 5 images max, 33177600 pixels max (≈5760x5760). + * - zAI: 50MB request limit (explicit in error message). + * - OpenRouter: *** Context-window limited (65536 tokens). * * ============================================================================ * PRACTICAL RECOMMENDATIONS FOR CODING AGENTS: * ============================================================================ * * Conservative cross-provider safe limits: - * - Max 5 images per request (for Groq compatibility) - * - Max 5MB per image (for Anthropic/Groq) + * - Max 2 images per request at ~5MB each (~10MB total) * - Max 5760px dimension (for Groq pixel limit) * * If excluding Groq: - * - Max 8 images per request (for Mistral) - * - Max 5MB per image (for Anthropic) - * - Max 8000px dimension (common limit) + * - Max 4 images per request at ~5MB each (~20MB total) + * - Max 8000px dimension * * For Anthropic-only (most common case): - * - Max 100 images per request + * - Max 6 images at ~3MB each OR 100 images at <200KB each * - Max 5MB per image * - Max 8000px dimension - * - Max 32MB total request size + * - Stay under ~18MB total request size * * ============================================================================ */ @@ -835,43 +841,48 @@ describe("Image Limits E2E Tests", () => { }); // ========================================================================= - // MAX 8K IMAGES TEST + // MAX SIZE IMAGES TEST // ========================================================================= - // Tests how many 8000x8000 images each provider can handle. - // This is important for: - // 1. Reproducing Anthropic's "many images" rule (>20 images = 2000px max) - // 2. Finding practical limits for prompt caching optimization + // Tests how many images at (or near) max allowed size each provider can handle. + // This tests realistic payload limits, not just image count with tiny files. + // + // Note: A real 8kx8k noise PNG is ~183MB (exceeds all provider limits). + // So we test with images sized near each provider's actual size limit. // ========================================================================= - describe("Max 8K Images (large image stress test)", () => { - // Generate a single 8k image to reuse - // Note: solid color compresses well but still has 8000x8000 pixel dimensions - let image8k: string; + describe("Max Size Images (realistic payload stress test)", () => { + // Generate images at specific sizes for each provider's limit + const imageCache: Map = new Map(); - beforeAll(() => { - console.log("Generating 8000x8000 test image..."); - image8k = generateImage(8000, 8000, "stress-8k.png"); - const sizeBytes = Buffer.from(image8k, "base64").length; - console.log( - ` 8k image size: ${(sizeBytes / 1024 / 1024).toFixed(2)}MB (compressed, but still 8000x8000 dimensions)`, - ); - }); + function getImageAtSize(targetMB: number): string { + if (imageCache.has(targetMB)) { + return imageCache.get(targetMB)!; + } + console.log(` Generating ~${targetMB}MB noise image...`); + const imageBase64 = generateImageWithSize(targetMB * 1024 * 1024, `stress-${targetMB}mb.png`); + const actualSize = Buffer.from(imageBase64, "base64").length; + console.log(` Actual size: ${(actualSize / 1024 / 1024).toFixed(2)}MB`); + imageCache.set(targetMB, imageBase64); + return imageBase64; + } - // Anthropic - known 100 image limit, testing if 8k dimensions change this + // Anthropic - 5MB per image limit, 32MB total request, 100 image count + // Using 3MB to stay under 5MB limit (generateImageWithSize has overhead) it.skipIf(!process.env.ANTHROPIC_API_KEY)( - "Anthropic: max 8k images before rejection", + "Anthropic: max ~3MB images before rejection", { timeout: 900000 }, async () => { const model = getModel("anthropic", "claude-3-5-haiku-20241022"); - // Known limit is 100 images - test around that boundary - const counts = [10, 20, 50, 80, 100, 110, 120]; + const image3mb = getImageAtSize(3); + // 32MB total limit / ~4MB actual = ~8 images + const counts = [1, 2, 4, 6, 8, 10, 12]; let lastSuccess = 0; let lastError: string | undefined; for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); + console.log(` Testing ${count} x ~3MB images...`); + const result = await testImageCount(model, count, image3mb); if (result.success) { lastSuccess = count; console.log(` SUCCESS`); @@ -882,142 +893,28 @@ describe("Image Limits E2E Tests", () => { } } - console.log(`\n Anthropic max 8k images: ${lastSuccess} (last error: ${lastError})`); - expect(lastSuccess).toBeGreaterThanOrEqual(5); - }, - ); - - // OpenAI - known 500 image limit - it.skipIf(!process.env.OPENAI_API_KEY)( - "OpenAI: max 8k images before rejection", - { timeout: 1800000 }, - async () => { - const model = getModel("openai", "gpt-4o-mini"); - // Known limit is 500 images - test around that boundary - const counts = [50, 100, 200, 300, 400, 500, 550]; - - let lastSuccess = 0; - let lastError: string | undefined; - - for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); - if (result.success) { - lastSuccess = count; - console.log(` SUCCESS`); - } else { - lastError = result.error; - console.log(` FAILED: ${result.error?.substring(0, 150)}`); - break; - } - } - - console.log(`\n OpenAI max 8k images: ${lastSuccess} (last error: ${lastError})`); - expect(lastSuccess).toBeGreaterThanOrEqual(5); - }, - ); - - // Gemini - known to be very permissive (~2000+ small images), but 8k may differ - it.skipIf(!process.env.GOOGLE_API_KEY)( - "Gemini: max 8k images before rejection", - { timeout: 1800000 }, - async () => { - const model = getModel("google", "gemini-2.5-flash"); - // Test progressively - 8k images are large so limit may be lower - const counts = [10, 50, 100, 200, 500, 1000]; - - let lastSuccess = 0; - let lastError: string | undefined; - - for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); - if (result.success) { - lastSuccess = count; - console.log(` SUCCESS`); - } else { - lastError = result.error; - console.log(` FAILED: ${result.error?.substring(0, 150)}`); - break; - } - } - - console.log(`\n Gemini max 8k images: ${lastSuccess} (last error: ${lastError})`); - expect(lastSuccess).toBeGreaterThanOrEqual(5); - }, - ); - - // Mistral - known 8 image limit - it.skipIf(!process.env.MISTRAL_API_KEY)( - "Mistral: max 8k images before rejection", - { timeout: 600000 }, - async () => { - const model = getModel("mistral", "pixtral-12b"); - // Known limit is 8 images - test around that boundary - const counts = [1, 2, 4, 6, 8, 9, 10]; - - let lastSuccess = 0; - let lastError: string | undefined; - - for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); - if (result.success) { - lastSuccess = count; - console.log(` SUCCESS`); - } else { - lastError = result.error; - console.log(` FAILED: ${result.error?.substring(0, 150)}`); - break; - } - } - - console.log(`\n Mistral max 8k images: ${lastSuccess} (last error: ${lastError})`); + console.log(`\n Anthropic max ~3MB images: ${lastSuccess} (last error: ${lastError})`); expect(lastSuccess).toBeGreaterThanOrEqual(1); }, ); - // xAI - tested up to 100 small images successfully - it.skipIf(!process.env.XAI_API_KEY)("xAI: max 8k images before rejection", { timeout: 1200000 }, async () => { - const model = getModel("xai", "grok-2-vision"); - // Test around the expected boundary - const counts = [10, 50, 100, 150, 200]; - - let lastSuccess = 0; - let lastError: string | undefined; - - for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); - if (result.success) { - lastSuccess = count; - console.log(` SUCCESS`); - } else { - lastError = result.error; - console.log(` FAILED: ${result.error?.substring(0, 150)}`); - break; - } - } - - console.log(`\n xAI max 8k images: ${lastSuccess} (last error: ${lastError})`); - expect(lastSuccess).toBeGreaterThanOrEqual(5); - }); - - // Groq - very limited (5 images, ~5760px max) - it.skipIf(!process.env.GROQ_API_KEY)( - "Groq: max 8k images before rejection (expect 0 - exceeds pixel limit)", - { timeout: 600000 }, + // OpenAI - 20MB per image documented, we found ≥25MB works + // Test with 15MB images to stay safely under limit + it.skipIf(!process.env.OPENAI_API_KEY)( + "OpenAI: max ~15MB images before rejection", + { timeout: 1800000 }, async () => { - const model = getModel("groq", "meta-llama/llama-4-scout-17b-16e-instruct"); - // 8k images exceed Groq's 33177600 pixel limit, so even 1 should fail - const counts = [1, 2, 3]; + const model = getModel("openai", "gpt-4o-mini"); + const image15mb = getImageAtSize(15); + // Test progressively + const counts = [1, 2, 5, 10, 20]; let lastSuccess = 0; let lastError: string | undefined; for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); + console.log(` Testing ${count} x ~15MB images...`); + const result = await testImageCount(model, count, image15mb); if (result.success) { lastSuccess = count; console.log(` SUCCESS`); @@ -1028,53 +925,28 @@ describe("Image Limits E2E Tests", () => { } } - console.log(`\n Groq max 8k images: ${lastSuccess} (last error: ${lastError})`); - // Groq should fail even with 1 image at 8k (64M pixels > 33M limit) - expect(lastSuccess).toBeGreaterThanOrEqual(0); + console.log(`\n OpenAI max ~15MB images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); }, ); - // zAI - tested up to 100 small images successfully, very permissive - it.skipIf(!process.env.ZAI_API_KEY)("zAI: max 8k images before rejection", { timeout: 1800000 }, async () => { - const model = getModel("zai", "glm-4.5v"); - // Very permissive - extend to find actual limit - const counts = [50, 100, 200, 300, 400, 500]; - - let lastSuccess = 0; - let lastError: string | undefined; - - for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); - if (result.success) { - lastSuccess = count; - console.log(` SUCCESS`); - } else { - lastError = result.error; - console.log(` FAILED: ${result.error?.substring(0, 150)}`); - break; - } - } - - console.log(`\n zAI max 8k images: ${lastSuccess} (last error: ${lastError})`); - expect(lastSuccess).toBeGreaterThanOrEqual(5); - }); - - // OpenRouter - context-window limited (~40 small images), 8k will be fewer - it.skipIf(!process.env.OPENROUTER_API_KEY)( - "OpenRouter: max 8k images before rejection", - { timeout: 900000 }, + // Gemini - very permissive, ≥40MB per image works + // Test with 20MB images + it.skipIf(!process.env.GEMINI_API_KEY)( + "Gemini: max ~20MB images before rejection", + { timeout: 1800000 }, async () => { - const model = getModel("openrouter", "z-ai/glm-4.5v"); - // 8k images consume more tokens, so limit will be lower than 40 - const counts = [1, 2, 5, 10, 20, 30, 40, 50]; + const model = getModel("google", "gemini-2.5-flash"); + const image20mb = getImageAtSize(20); + // Test progressively + const counts = [1, 2, 5, 10, 20, 50]; let lastSuccess = 0; let lastError: string | undefined; for (const count of counts) { - console.log(` Testing ${count} x 8k images...`); - const result = await testImageCount(model, count, image8k); + console.log(` Testing ${count} x ~20MB images...`); + const result = await testImageCount(model, count, image20mb); if (result.success) { lastSuccess = count; console.log(` SUCCESS`); @@ -1085,7 +957,162 @@ describe("Image Limits E2E Tests", () => { } } - console.log(`\n OpenRouter max 8k images: ${lastSuccess} (last error: ${lastError})`); + console.log(`\n Gemini max ~20MB images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); + }, + ); + + // Mistral - 8 image limit, ~15MB per image + // Test with 10MB images (safely under limit) + it.skipIf(!process.env.MISTRAL_API_KEY)( + "Mistral: max ~10MB images before rejection", + { timeout: 600000 }, + async () => { + const model = getModel("mistral", "pixtral-12b"); + const image10mb = getImageAtSize(10); + // Known limit is 8 images + const counts = [1, 2, 4, 6, 8, 9]; + + let lastSuccess = 0; + let lastError: string | undefined; + + for (const count of counts) { + console.log(` Testing ${count} x ~10MB images...`); + const result = await testImageCount(model, count, image10mb); + if (result.success) { + lastSuccess = count; + console.log(` SUCCESS`); + } else { + lastError = result.error; + console.log(` FAILED: ${result.error?.substring(0, 150)}`); + break; + } + } + + console.log(`\n Mistral max ~10MB images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); + }, + ); + + // xAI - 25MB per image limit (26214400 bytes exact) + // Test with 20MB images (safely under limit) + it.skipIf(!process.env.XAI_API_KEY)("xAI: max ~20MB images before rejection", { timeout: 1200000 }, async () => { + const model = getModel("xai", "grok-2-vision"); + const image20mb = getImageAtSize(20); + // Test progressively + const counts = [1, 2, 5, 10, 20]; + + let lastSuccess = 0; + let lastError: string | undefined; + + for (const count of counts) { + console.log(` Testing ${count} x ~20MB images...`); + const result = await testImageCount(model, count, image20mb); + if (result.success) { + lastSuccess = count; + console.log(` SUCCESS`); + } else { + lastError = result.error; + console.log(` FAILED: ${result.error?.substring(0, 150)}`); + break; + } + } + + console.log(`\n xAI max ~20MB images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); + }); + + // Groq - very limited (5 images, ~5760px max due to 33M pixel limit) + // 8k images (64M pixels) exceed limit, so test with 5760px images instead + it.skipIf(!process.env.GROQ_API_KEY)( + "Groq: max 5760px images before rejection", + { timeout: 600000 }, + async () => { + const model = getModel("groq", "meta-llama/llama-4-scout-17b-16e-instruct"); + // Generate 5760x5760 image (33177600 pixels = Groq's limit) + console.log(" Generating 5760x5760 test image for Groq..."); + const image5760 = generateImage(5760, 5760, "stress-5760.png"); + + // Known limit is 5 images + const counts = [1, 2, 3, 4, 5, 6]; + + let lastSuccess = 0; + let lastError: string | undefined; + + for (const count of counts) { + console.log(` Testing ${count} x 5760px images...`); + const result = await testImageCount(model, count, image5760); + if (result.success) { + lastSuccess = count; + console.log(` SUCCESS`); + } else { + lastError = result.error; + console.log(` FAILED: ${result.error?.substring(0, 150)}`); + break; + } + } + + console.log(`\n Groq max 5760px images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); + }, + ); + + // zAI - ≥20MB per image, context-window limited (65k tokens) + // Test with 15MB images + it.skipIf(!process.env.ZAI_API_KEY)("zAI: max ~15MB images before rejection", { timeout: 1200000 }, async () => { + const model = getModel("zai", "glm-4.5v"); + const image15mb = getImageAtSize(15); + // Context-limited, test progressively + const counts = [1, 2, 5, 10, 20]; + + let lastSuccess = 0; + let lastError: string | undefined; + + for (const count of counts) { + console.log(` Testing ${count} x ~15MB images...`); + const result = await testImageCount(model, count, image15mb); + if (result.success) { + lastSuccess = count; + console.log(` SUCCESS`); + } else { + lastError = result.error; + console.log(` FAILED: ${result.error?.substring(0, 150)}`); + break; + } + } + + console.log(`\n zAI max ~15MB images: ${lastSuccess} (last error: ${lastError})`); + expect(lastSuccess).toBeGreaterThanOrEqual(1); + }); + + // OpenRouter - ~10MB per image, context-window limited (65k tokens) + // Test with 5MB images (safer size) + it.skipIf(!process.env.OPENROUTER_API_KEY)( + "OpenRouter: max ~5MB images before rejection", + { timeout: 900000 }, + async () => { + const model = getModel("openrouter", "z-ai/glm-4.5v"); + const image5mb = getImageAtSize(5); + // Context-limited, test progressively + const counts = [1, 2, 5, 10, 20]; + + let lastSuccess = 0; + let lastError: string | undefined; + + for (const count of counts) { + console.log(` Testing ${count} x ~5MB images...`); + const result = await testImageCount(model, count, image5mb); + if (result.success) { + lastSuccess = count; + console.log(` SUCCESS`); + } else { + lastError = result.error; + console.log(` FAILED: ${result.error?.substring(0, 150)}`); + break; + } + } + + console.log(`\n OpenRouter max ~5MB images: ${lastSuccess} (last error: ${lastError})`); expect(lastSuccess).toBeGreaterThanOrEqual(1); }, ); diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md index bdacf67b..e041c237 100644 --- a/packages/coding-agent/CHANGELOG.md +++ b/packages/coding-agent/CHANGELOG.md @@ -2,6 +2,12 @@ ## [Unreleased] +### Fixed + +- Fixed tool execution showing green (success) background while still running. Now correctly shows gray (pending) background until the tool completes. + +## [0.22.3] - 2025-12-16 + ### Added - **Streaming bash output**: Bash tool now streams output in real-time during execution. The TUI displays live progress with the last 5 lines visible (expandable with ctrl+o). ([#44](https://github.com/badlogic/pi-mono/issues/44)) diff --git a/packages/coding-agent/src/modes/interactive/components/tool-execution.ts b/packages/coding-agent/src/modes/interactive/components/tool-execution.ts index fcbf63ce..36f4aaf6 100644 --- a/packages/coding-agent/src/modes/interactive/components/tool-execution.ts +++ b/packages/coding-agent/src/modes/interactive/components/tool-execution.ts @@ -44,7 +44,7 @@ export class ToolExecutionComponent extends Container { private args: any; private expanded = false; private showImages: boolean; - private isPartial = false; + private isPartial = true; private result?: { content: Array<{ type: string; text?: string; data?: string; mimeType?: string }>; isError: boolean;