diff --git a/packages/ai/test/image-limits.test.ts b/packages/ai/test/image-limits.test.ts
index 7be5cdd5..e12e99db 100644
--- a/packages/ai/test/image-limits.test.ts
+++ b/packages/ai/test/image-limits.test.ts
@@ -2,61 +2,67 @@
  * Image limits test suite
  *
  * Tests provider-specific image limitations:
- * - Maximum number of images in a context
+ * - Maximum number of images in a context (with small 100x100 images)
  * - Maximum image size (bytes)
  * - Maximum image dimensions
- * - Maximum 8k x 8k images (stress test)
+ * - Maximum payload (realistic large images stress test)
  *
  * ============================================================================
  * DISCOVERED LIMITS (Dec 2025):
  * ============================================================================
  *
- * | Provider    | Model              | Max Images | Max Size | Max Dim  | Max 8k Imgs |
- * |-------------|--------------------|------------|----------|----------|-------------|
- * | Anthropic   | claude-3-5-haiku   | 100        | 5MB      | 8000px   | 100         |
- * | OpenAI      | gpt-4o-mini        | 500        | ≥25MB    | ≥20000px | 100-200*    |
- * | Gemini      | gemini-2.5-flash   | ~2000**    | ≥40MB    | 8000px   | (untested)  |
- * | Mistral     | pixtral-12b        | 8          | ~15MB    | 8000px   | 8           |
- * | xAI         | grok-2-vision      | ≥100       | 25MB     | 8000px   | 100-150*    |
- * | Groq        | llama-4-scout-17b  | 5          | ~5MB     | ~5760px  | 0***        |
- * | zAI         | glm-4.5v           | ≥100       | ≥20MB    | 8000px   | 400****     |
- * | OpenRouter  | z-ai/glm-4.5v      | ~40****    | ~10MB    | ≥20000px | 40****      |
+ * BASIC LIMITS (small images):
+ * | Provider    | Model              | Max Images | Max Size | Max Dim  |
+ * |-------------|--------------------|------------|----------|----------|
+ * | Anthropic   | claude-3-5-haiku   | 100        | 5MB      | 8000px   |
+ * | OpenAI      | gpt-4o-mini        | 500        | ≥25MB    | ≥20000px |
+ * | Gemini      | gemini-2.5-flash   | ~2000*     | ≥40MB    | 8000px   |
+ * | Mistral     | pixtral-12b        | 8          | ~15MB    | 8000px   |
+ * | xAI         | grok-2-vision      | ≥100       | 25MB     | 8000px   |
+ * | Groq        | llama-4-scout-17b  | 5          | ~5MB     | ~5760px**|
+ * | zAI         | glm-4.5v           | ***        | ≥20MB    | 8000px   |
+ * | OpenRouter  | z-ai/glm-4.5v      | ***        | ~10MB    | ≥20000px |
+ *
+ * REALISTIC PAYLOAD LIMITS (large images):
+ * | Provider    | Image Size | Max Count | Total Payload | Limit Hit          |
+ * |-------------|------------|-----------|---------------|---------------------|
+ * | Anthropic   | ~3MB       | 6         | ~18MB         | Request too large   |
+ * | OpenAI      | ~15MB      | 2         | ~30MB         | Generic error       |
+ * | Gemini      | ~20MB      | 10        | ~200MB        | String length       |
+ * | Mistral     | ~10MB      | 4         | ~40MB         | 413 Payload too large|
+ * | xAI         | ~20MB      | 1         | ~20MB         | 413 Entity too large|
+ * | Groq        | 5760px     | 5         | N/A           | 5 image limit       |
+ * | zAI         | ~15MB      | 2         | ~30MB         | 50MB request limit  |
+ * | OpenRouter  | ~5MB       | 2         | ~10MB         | Provider error      |
  *
  * Notes:
- * - Anthropic: Docs mention a "many images" rule (>20 images = 2000px max),
- *   but testing shows 100 x 8k images work fine. Anthropic may auto-resize
- *   internally. Total request size capped at 32MB. Explicit error at 101+.
- * - OpenAI: * 100 x 8k succeeded, 200 x 8k failed with timeout. Actual limit
- *   likely between 100-200. Documented size limit is 20MB but ≥25MB works.
- * - Gemini: ** Very permissive on count, hits rate limits before image limits.
- * - Mistral: Very restrictive (8 images max). Explicit error at 9+.
- * - xAI: * 100 x 8k succeeded, 150 x 8k timed out. 25MB size limit exact.
- * - Groq: *** Most restrictive. 5 images max, 33177600 pixels max (≈5760x5760).
- *   8k images (64M pixels) exceed limit, so 0 supported.
- * - zAI: **** Context-window limited (65536 tokens). 400 x 8k succeeded,
- *   500 x 8k exceeded token limit.
- * - OpenRouter: **** Context-window limited (65536 tokens), not explicit
- *   image limit. 40 x 8k succeeded, 50 x 8k exceeded token limit.
+ * - Anthropic: 100 image hard limit, 5MB per image, but ~18MB total request
+ *   limit in practice (32MB documented but hit limit at ~24MB).
+ * - OpenAI: 500 image limit but total payload limited to ~30-45MB.
+ * - Gemini: * Very permissive. 10 x 20MB = 200MB worked!
+ * - Mistral: 8 images max, ~40MB total payload.
+ * - xAI: 25MB per image but strict request size limit (~20MB total).
+ * - Groq: ** Most restrictive. 5 images max, 33177600 pixels max (≈5760x5760).
+ * - zAI: 50MB request limit (explicit in error message).
+ * - OpenRouter: *** Context-window limited (65536 tokens).
  *
  * ============================================================================
  * PRACTICAL RECOMMENDATIONS FOR CODING AGENTS:
  * ============================================================================
  *
  * Conservative cross-provider safe limits:
- * - Max 5 images per request (for Groq compatibility)
- * - Max 5MB per image (for Anthropic/Groq)
+ * - Max 2 images per request at ~5MB each (~10MB total)
  * - Max 5760px dimension (for Groq pixel limit)
  *
  * If excluding Groq:
- * - Max 8 images per request (for Mistral)
- * - Max 5MB per image (for Anthropic)
- * - Max 8000px dimension (common limit)
+ * - Max 4 images per request at ~5MB each (~20MB total)
+ * - Max 8000px dimension
  *
  * For Anthropic-only (most common case):
- * - Max 100 images per request
+ * - Max 6 images at ~3MB each OR 100 images at <200KB each
  * - Max 5MB per image
  * - Max 8000px dimension
- * - Max 32MB total request size
+ * - Stay under ~18MB total request size
  *
  * ============================================================================
  */
@@ -835,43 +841,48 @@ describe("Image Limits E2E Tests", () => {
 	});
 
 	// =========================================================================
-	// MAX 8K IMAGES TEST
+	// MAX SIZE IMAGES TEST
 	// =========================================================================
-	// Tests how many 8000x8000 images each provider can handle.
-	// This is important for:
-	// 1. Reproducing Anthropic's "many images" rule (>20 images = 2000px max)
-	// 2. Finding practical limits for prompt caching optimization
+	// Tests how many images at (or near) max allowed size each provider can handle.
+	// This tests realistic payload limits, not just image count with tiny files.
+	//
+	// Note: A real 8kx8k noise PNG is ~183MB (exceeds all provider limits).
+	// So we test with images sized near each provider's actual size limit.
 	// =========================================================================
 
-	describe("Max 8K Images (large image stress test)", () => {
-		// Generate a single 8k image to reuse
-		// Note: solid color compresses well but still has 8000x8000 pixel dimensions
-		let image8k: string;
+	describe("Max Size Images (realistic payload stress test)", () => {
+		// Generate images at specific sizes for each provider's limit
+		const imageCache: Map<number, string> = new Map();
 
-		beforeAll(() => {
-			console.log("Generating 8000x8000 test image...");
-			image8k = generateImage(8000, 8000, "stress-8k.png");
-			const sizeBytes = Buffer.from(image8k, "base64").length;
-			console.log(
-				`  8k image size: ${(sizeBytes / 1024 / 1024).toFixed(2)}MB (compressed, but still 8000x8000 dimensions)`,
-			);
-		});
+		function getImageAtSize(targetMB: number): string {
+			if (imageCache.has(targetMB)) {
+				return imageCache.get(targetMB)!;
+			}
+			console.log(`  Generating ~${targetMB}MB noise image...`);
+			const imageBase64 = generateImageWithSize(targetMB * 1024 * 1024, `stress-${targetMB}mb.png`);
+			const actualSize = Buffer.from(imageBase64, "base64").length;
+			console.log(`    Actual size: ${(actualSize / 1024 / 1024).toFixed(2)}MB`);
+			imageCache.set(targetMB, imageBase64);
+			return imageBase64;
+		}
 
-		// Anthropic - known 100 image limit, testing if 8k dimensions change this
+		// Anthropic - 5MB per image limit, 32MB total request, 100 image count
+		// Using 3MB to stay under 5MB limit (generateImageWithSize has overhead)
 		it.skipIf(!process.env.ANTHROPIC_API_KEY)(
-			"Anthropic: max 8k images before rejection",
+			"Anthropic: max ~3MB images before rejection",
 			{ timeout: 900000 },
 			async () => {
 				const model = getModel("anthropic", "claude-3-5-haiku-20241022");
-				// Known limit is 100 images - test around that boundary
-				const counts = [10, 20, 50, 80, 100, 110, 120];
+				const image3mb = getImageAtSize(3);
+				// 32MB total limit / ~4MB actual = ~8 images
+				const counts = [1, 2, 4, 6, 8, 10, 12];
 
 				let lastSuccess = 0;
 				let lastError: string | undefined;
 
 				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
+					console.log(`  Testing ${count} x ~3MB images...`);
+					const result = await testImageCount(model, count, image3mb);
 					if (result.success) {
 						lastSuccess = count;
 						console.log(`    SUCCESS`);
@@ -882,142 +893,28 @@ describe("Image Limits E2E Tests", () => {
 					}
 				}
 
-				console.log(`\n  Anthropic max 8k images: ${lastSuccess} (last error: ${lastError})`);
-				expect(lastSuccess).toBeGreaterThanOrEqual(5);
-			},
-		);
-
-		// OpenAI - known 500 image limit
-		it.skipIf(!process.env.OPENAI_API_KEY)(
-			"OpenAI: max 8k images before rejection",
-			{ timeout: 1800000 },
-			async () => {
-				const model = getModel("openai", "gpt-4o-mini");
-				// Known limit is 500 images - test around that boundary
-				const counts = [50, 100, 200, 300, 400, 500, 550];
-
-				let lastSuccess = 0;
-				let lastError: string | undefined;
-
-				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
-					if (result.success) {
-						lastSuccess = count;
-						console.log(`    SUCCESS`);
-					} else {
-						lastError = result.error;
-						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
-						break;
-					}
-				}
-
-				console.log(`\n  OpenAI max 8k images: ${lastSuccess} (last error: ${lastError})`);
-				expect(lastSuccess).toBeGreaterThanOrEqual(5);
-			},
-		);
-
-		// Gemini - known to be very permissive (~2000+ small images), but 8k may differ
-		it.skipIf(!process.env.GOOGLE_API_KEY)(
-			"Gemini: max 8k images before rejection",
-			{ timeout: 1800000 },
-			async () => {
-				const model = getModel("google", "gemini-2.5-flash");
-				// Test progressively - 8k images are large so limit may be lower
-				const counts = [10, 50, 100, 200, 500, 1000];
-
-				let lastSuccess = 0;
-				let lastError: string | undefined;
-
-				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
-					if (result.success) {
-						lastSuccess = count;
-						console.log(`    SUCCESS`);
-					} else {
-						lastError = result.error;
-						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
-						break;
-					}
-				}
-
-				console.log(`\n  Gemini max 8k images: ${lastSuccess} (last error: ${lastError})`);
-				expect(lastSuccess).toBeGreaterThanOrEqual(5);
-			},
-		);
-
-		// Mistral - known 8 image limit
-		it.skipIf(!process.env.MISTRAL_API_KEY)(
-			"Mistral: max 8k images before rejection",
-			{ timeout: 600000 },
-			async () => {
-				const model = getModel("mistral", "pixtral-12b");
-				// Known limit is 8 images - test around that boundary
-				const counts = [1, 2, 4, 6, 8, 9, 10];
-
-				let lastSuccess = 0;
-				let lastError: string | undefined;
-
-				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
-					if (result.success) {
-						lastSuccess = count;
-						console.log(`    SUCCESS`);
-					} else {
-						lastError = result.error;
-						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
-						break;
-					}
-				}
-
-				console.log(`\n  Mistral max 8k images: ${lastSuccess} (last error: ${lastError})`);
+				console.log(`\n  Anthropic max ~3MB images: ${lastSuccess} (last error: ${lastError})`);
 				expect(lastSuccess).toBeGreaterThanOrEqual(1);
 			},
 		);
 
-		// xAI - tested up to 100 small images successfully
-		it.skipIf(!process.env.XAI_API_KEY)("xAI: max 8k images before rejection", { timeout: 1200000 }, async () => {
-			const model = getModel("xai", "grok-2-vision");
-			// Test around the expected boundary
-			const counts = [10, 50, 100, 150, 200];
-
-			let lastSuccess = 0;
-			let lastError: string | undefined;
-
-			for (const count of counts) {
-				console.log(`  Testing ${count} x 8k images...`);
-				const result = await testImageCount(model, count, image8k);
-				if (result.success) {
-					lastSuccess = count;
-					console.log(`    SUCCESS`);
-				} else {
-					lastError = result.error;
-					console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
-					break;
-				}
-			}
-
-			console.log(`\n  xAI max 8k images: ${lastSuccess} (last error: ${lastError})`);
-			expect(lastSuccess).toBeGreaterThanOrEqual(5);
-		});
-
-		// Groq - very limited (5 images, ~5760px max)
-		it.skipIf(!process.env.GROQ_API_KEY)(
-			"Groq: max 8k images before rejection (expect 0 - exceeds pixel limit)",
-			{ timeout: 600000 },
+		// OpenAI - 20MB per image documented, we found ≥25MB works
+		// Test with 15MB images to stay safely under limit
+		it.skipIf(!process.env.OPENAI_API_KEY)(
+			"OpenAI: max ~15MB images before rejection",
+			{ timeout: 1800000 },
 			async () => {
-				const model = getModel("groq", "meta-llama/llama-4-scout-17b-16e-instruct");
-				// 8k images exceed Groq's 33177600 pixel limit, so even 1 should fail
-				const counts = [1, 2, 3];
+				const model = getModel("openai", "gpt-4o-mini");
+				const image15mb = getImageAtSize(15);
+				// Test progressively
+				const counts = [1, 2, 5, 10, 20];
 
 				let lastSuccess = 0;
 				let lastError: string | undefined;
 
 				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
+					console.log(`  Testing ${count} x ~15MB images...`);
+					const result = await testImageCount(model, count, image15mb);
 					if (result.success) {
 						lastSuccess = count;
 						console.log(`    SUCCESS`);
@@ -1028,53 +925,28 @@ describe("Image Limits E2E Tests", () => {
 					}
 				}
 
-				console.log(`\n  Groq max 8k images: ${lastSuccess} (last error: ${lastError})`);
-				// Groq should fail even with 1 image at 8k (64M pixels > 33M limit)
-				expect(lastSuccess).toBeGreaterThanOrEqual(0);
+				console.log(`\n  OpenAI max ~15MB images: ${lastSuccess} (last error: ${lastError})`);
+				expect(lastSuccess).toBeGreaterThanOrEqual(1);
 			},
 		);
 
-		// zAI - tested up to 100 small images successfully, very permissive
-		it.skipIf(!process.env.ZAI_API_KEY)("zAI: max 8k images before rejection", { timeout: 1800000 }, async () => {
-			const model = getModel("zai", "glm-4.5v");
-			// Very permissive - extend to find actual limit
-			const counts = [50, 100, 200, 300, 400, 500];
-
-			let lastSuccess = 0;
-			let lastError: string | undefined;
-
-			for (const count of counts) {
-				console.log(`  Testing ${count} x 8k images...`);
-				const result = await testImageCount(model, count, image8k);
-				if (result.success) {
-					lastSuccess = count;
-					console.log(`    SUCCESS`);
-				} else {
-					lastError = result.error;
-					console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
-					break;
-				}
-			}
-
-			console.log(`\n  zAI max 8k images: ${lastSuccess} (last error: ${lastError})`);
-			expect(lastSuccess).toBeGreaterThanOrEqual(5);
-		});
-
-		// OpenRouter - context-window limited (~40 small images), 8k will be fewer
-		it.skipIf(!process.env.OPENROUTER_API_KEY)(
-			"OpenRouter: max 8k images before rejection",
-			{ timeout: 900000 },
+		// Gemini - very permissive, ≥40MB per image works
+		// Test with 20MB images
+		it.skipIf(!process.env.GEMINI_API_KEY)(
+			"Gemini: max ~20MB images before rejection",
+			{ timeout: 1800000 },
 			async () => {
-				const model = getModel("openrouter", "z-ai/glm-4.5v");
-				// 8k images consume more tokens, so limit will be lower than 40
-				const counts = [1, 2, 5, 10, 20, 30, 40, 50];
+				const model = getModel("google", "gemini-2.5-flash");
+				const image20mb = getImageAtSize(20);
+				// Test progressively
+				const counts = [1, 2, 5, 10, 20, 50];
 
 				let lastSuccess = 0;
 				let lastError: string | undefined;
 
 				for (const count of counts) {
-					console.log(`  Testing ${count} x 8k images...`);
-					const result = await testImageCount(model, count, image8k);
+					console.log(`  Testing ${count} x ~20MB images...`);
+					const result = await testImageCount(model, count, image20mb);
 					if (result.success) {
 						lastSuccess = count;
 						console.log(`    SUCCESS`);
@@ -1085,7 +957,162 @@ describe("Image Limits E2E Tests", () => {
 					}
 				}
 
-				console.log(`\n  OpenRouter max 8k images: ${lastSuccess} (last error: ${lastError})`);
+				console.log(`\n  Gemini max ~20MB images: ${lastSuccess} (last error: ${lastError})`);
+				expect(lastSuccess).toBeGreaterThanOrEqual(1);
+			},
+		);
+
+		// Mistral - 8 image limit, ~15MB per image
+		// Test with 10MB images (safely under limit)
+		it.skipIf(!process.env.MISTRAL_API_KEY)(
+			"Mistral: max ~10MB images before rejection",
+			{ timeout: 600000 },
+			async () => {
+				const model = getModel("mistral", "pixtral-12b");
+				const image10mb = getImageAtSize(10);
+				// Known limit is 8 images
+				const counts = [1, 2, 4, 6, 8, 9];
+
+				let lastSuccess = 0;
+				let lastError: string | undefined;
+
+				for (const count of counts) {
+					console.log(`  Testing ${count} x ~10MB images...`);
+					const result = await testImageCount(model, count, image10mb);
+					if (result.success) {
+						lastSuccess = count;
+						console.log(`    SUCCESS`);
+					} else {
+						lastError = result.error;
+						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
+						break;
+					}
+				}
+
+				console.log(`\n  Mistral max ~10MB images: ${lastSuccess} (last error: ${lastError})`);
+				expect(lastSuccess).toBeGreaterThanOrEqual(1);
+			},
+		);
+
+		// xAI - 25MB per image limit (26214400 bytes exact)
+		// Test with 20MB images (safely under limit)
+		it.skipIf(!process.env.XAI_API_KEY)("xAI: max ~20MB images before rejection", { timeout: 1200000 }, async () => {
+			const model = getModel("xai", "grok-2-vision");
+			const image20mb = getImageAtSize(20);
+			// Test progressively
+			const counts = [1, 2, 5, 10, 20];
+
+			let lastSuccess = 0;
+			let lastError: string | undefined;
+
+			for (const count of counts) {
+				console.log(`  Testing ${count} x ~20MB images...`);
+				const result = await testImageCount(model, count, image20mb);
+				if (result.success) {
+					lastSuccess = count;
+					console.log(`    SUCCESS`);
+				} else {
+					lastError = result.error;
+					console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
+					break;
+				}
+			}
+
+			console.log(`\n  xAI max ~20MB images: ${lastSuccess} (last error: ${lastError})`);
+			expect(lastSuccess).toBeGreaterThanOrEqual(1);
+		});
+
+		// Groq - very limited (5 images, ~5760px max due to 33M pixel limit)
+		// 8k images (64M pixels) exceed limit, so test with 5760px images instead
+		it.skipIf(!process.env.GROQ_API_KEY)(
+			"Groq: max 5760px images before rejection",
+			{ timeout: 600000 },
+			async () => {
+				const model = getModel("groq", "meta-llama/llama-4-scout-17b-16e-instruct");
+				// Generate 5760x5760 image (33177600 pixels = Groq's limit)
+				console.log("  Generating 5760x5760 test image for Groq...");
+				const image5760 = generateImage(5760, 5760, "stress-5760.png");
+
+				// Known limit is 5 images
+				const counts = [1, 2, 3, 4, 5, 6];
+
+				let lastSuccess = 0;
+				let lastError: string | undefined;
+
+				for (const count of counts) {
+					console.log(`  Testing ${count} x 5760px images...`);
+					const result = await testImageCount(model, count, image5760);
+					if (result.success) {
+						lastSuccess = count;
+						console.log(`    SUCCESS`);
+					} else {
+						lastError = result.error;
+						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
+						break;
+					}
+				}
+
+				console.log(`\n  Groq max 5760px images: ${lastSuccess} (last error: ${lastError})`);
+				expect(lastSuccess).toBeGreaterThanOrEqual(1);
+			},
+		);
+
+		// zAI - ≥20MB per image, context-window limited (65k tokens)
+		// Test with 15MB images
+		it.skipIf(!process.env.ZAI_API_KEY)("zAI: max ~15MB images before rejection", { timeout: 1200000 }, async () => {
+			const model = getModel("zai", "glm-4.5v");
+			const image15mb = getImageAtSize(15);
+			// Context-limited, test progressively
+			const counts = [1, 2, 5, 10, 20];
+
+			let lastSuccess = 0;
+			let lastError: string | undefined;
+
+			for (const count of counts) {
+				console.log(`  Testing ${count} x ~15MB images...`);
+				const result = await testImageCount(model, count, image15mb);
+				if (result.success) {
+					lastSuccess = count;
+					console.log(`    SUCCESS`);
+				} else {
+					lastError = result.error;
+					console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
+					break;
+				}
+			}
+
+			console.log(`\n  zAI max ~15MB images: ${lastSuccess} (last error: ${lastError})`);
+			expect(lastSuccess).toBeGreaterThanOrEqual(1);
+		});
+
+		// OpenRouter - ~10MB per image, context-window limited (65k tokens)
+		// Test with 5MB images (safer size)
+		it.skipIf(!process.env.OPENROUTER_API_KEY)(
+			"OpenRouter: max ~5MB images before rejection",
+			{ timeout: 900000 },
+			async () => {
+				const model = getModel("openrouter", "z-ai/glm-4.5v");
+				const image5mb = getImageAtSize(5);
+				// Context-limited, test progressively
+				const counts = [1, 2, 5, 10, 20];
+
+				let lastSuccess = 0;
+				let lastError: string | undefined;
+
+				for (const count of counts) {
+					console.log(`  Testing ${count} x ~5MB images...`);
+					const result = await testImageCount(model, count, image5mb);
+					if (result.success) {
+						lastSuccess = count;
+						console.log(`    SUCCESS`);
+					} else {
+						lastError = result.error;
+						console.log(`    FAILED: ${result.error?.substring(0, 150)}`);
+						break;
+					}
+				}
+
+				console.log(`\n  OpenRouter max ~5MB images: ${lastSuccess} (last error: ${lastError})`);
 				expect(lastSuccess).toBeGreaterThanOrEqual(1);
 			},
 		);
diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md
index bdacf67b..e041c237 100644
--- a/packages/coding-agent/CHANGELOG.md
+++ b/packages/coding-agent/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 ## [Unreleased]
 
+### Fixed
+
+- Fixed tool execution showing green (success) background while still running. Now correctly shows gray (pending) background until the tool completes.
+
+## [0.22.3] - 2025-12-16
+
 ### Added
 
 - **Streaming bash output**: Bash tool now streams output in real-time during execution. The TUI displays live progress with the last 5 lines visible (expandable with ctrl+o). ([#44](https://github.com/badlogic/pi-mono/issues/44))
diff --git a/packages/coding-agent/src/modes/interactive/components/tool-execution.ts b/packages/coding-agent/src/modes/interactive/components/tool-execution.ts
index fcbf63ce..36f4aaf6 100644
--- a/packages/coding-agent/src/modes/interactive/components/tool-execution.ts
+++ b/packages/coding-agent/src/modes/interactive/components/tool-execution.ts
@@ -44,7 +44,7 @@ export class ToolExecutionComponent extends Container {
 	private args: any;
 	private expanded = false;
 	private showImages: boolean;
-	private isPartial = false;
+	private isPartial = true;
 	private result?: {
 		content: Array<{ type: string; text?: string; data?: string; mimeType?: string }>;
 		isError: boolean;