From 1b6a147579cbe9e7b593259051d507319cc6b127 Mon Sep 17 00:00:00 2001
From: Mario Zechner <badlogicgames@gmail.com>
Date: Thu, 29 Jan 2026 02:22:06 +0100
Subject: [PATCH] feat(ai): add PI_CACHE_RETENTION env var for extended prompt
 caching

Adds support for extended cache retention via PI_CACHE_RETENTION=long:
- Anthropic: 5m -> 1h TTL
- OpenAI: in-memory -> 24h retention

Only applies to direct API calls (api.anthropic.com, api.openai.com).
Proxies and other providers are unaffected.

fixes #967
---
 packages/ai/CHANGELOG.md                      |   4 +
 packages/ai/README.md                         |  13 ++
 packages/ai/src/providers/anthropic.ts        |  23 +-
 packages/ai/src/providers/openai-responses.ts |  17 ++
 packages/ai/test/cache-retention.test.ts      | 199 ++++++++++++++++++
 packages/coding-agent/README.md               |   1 +
 6 files changed, 256 insertions(+), 1 deletion(-)
 create mode 100644 packages/ai/test/cache-retention.test.ts

diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md
index db90dbc1..4159ccc7 100644
--- a/packages/ai/CHANGELOG.md
+++ b/packages/ai/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Added
+
+- Added `PI_CACHE_RETENTION` environment variable to control cache TTL for Anthropic (5m vs 1h) and OpenAI (in-memory vs 24h). Set to `long` for extended retention. Only applies to direct API calls (api.anthropic.com, api.openai.com). ([#967](https://github.com/badlogic/pi-mono/issues/967))
+
 ### Fixed
 
 - Fixed cross-provider handoff failing when switching from OpenAI Responses API providers (github-copilot, openai-codex) to other providers due to pipe-separated tool call IDs not being normalized, and trailing underscores in truncated IDs being rejected by OpenAI Codex ([#1022](https://github.com/badlogic/pi-mono/issues/1022))
diff --git a/packages/ai/README.md b/packages/ai/README.md
index cf58728b..6207e4b7 100644
--- a/packages/ai/README.md
+++ b/packages/ai/README.md
@@ -909,6 +909,19 @@ const response = await complete(model, context, {
 });
 ```
 
+#### Cache Retention
+
+Set `PI_CACHE_RETENTION=long` to extend prompt cache retention:
+
+| Provider | Default | With `PI_CACHE_RETENTION=long` |
+|----------|---------|-------------------------------|
+| Anthropic | 5 minutes | 1 hour |
+| OpenAI | in-memory | 24 hours |
+
+This only affects direct API calls to `api.anthropic.com` and `api.openai.com`. Proxies and other providers are unaffected.
+
+> **Note**: Extended cache retention may increase costs for Anthropic (cache writes are charged at a higher rate). OpenAI's 24h retention has no additional cost.
+
 ### Checking Environment Variables
 
 ```typescript
diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts
index 6e99e852..6755c4c4 100644
--- a/packages/ai/src/providers/anthropic.ts
+++ b/packages/ai/src/providers/anthropic.ts
@@ -30,6 +30,22 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
 import { adjustMaxTokensForThinking, buildBaseOptions } from "./simple-options.js";
 import { transformMessages } from "./transform-messages.js";
 
+/**
+ * Resolve the Anthropic prompt-cache TTL from the PI_CACHE_RETENTION env var.
+ * Returns "1h" only when the variable is "long" and the host of baseUrl is exactly
+ * api.anthropic.com; otherwise undefined, which leaves the default (5m) in effect.
+ */
+function getCacheTtl(baseUrl: string): "1h" | undefined {
+	if (typeof process === "undefined" || process.env.PI_CACHE_RETENTION !== "long") return undefined;
+	try {
+		// Match the parsed hostname, not a substring: a proxy whose URL merely contains
+		// "api.anthropic.com" (in its path or as part of a longer host) must stay unaffected.
+		return new URL(baseUrl).hostname === "api.anthropic.com" ? "1h" : undefined;
+	} catch {
+		return undefined; // baseUrl is not an absolute URL, so it cannot be the direct API
+	}
+}
+
 // Stealth mode: Mimic Claude Code's tool naming exactly
 const claudeCodeVersion = "2.1.2";
 
@@ -452,6 +468,7 @@ function buildParams(
 	};
 
 	// For OAuth tokens, we MUST include Claude Code identity
+	const cacheTtl = getCacheTtl(model.baseUrl);
 	if (isOAuthToken) {
 		params.system = [
 			{
@@ -459,6 +476,7 @@ function buildParams(
 				text: "You are Claude Code, Anthropic's official CLI for Claude.",
 				cache_control: {
 					type: "ephemeral",
+					...(cacheTtl && { ttl: cacheTtl }),
 				},
 			},
 		];
@@ -468,6 +486,7 @@ function buildParams(
 				text: sanitizeSurrogates(context.systemPrompt),
 				cache_control: {
 					type: "ephemeral",
+					...(cacheTtl && { ttl: cacheTtl }),
 				},
 			});
 		}
@@ -479,6 +498,7 @@ function buildParams(
 				text: sanitizeSurrogates(context.systemPrompt),
 				cache_control: {
 					type: "ephemeral",
+					...(cacheTtl && { ttl: cacheTtl }),
 				},
 			},
 		];
@@ -655,7 +675,8 @@ function convertMessages(
 					lastBlock &&
 					(lastBlock.type === "text" || lastBlock.type === "image" || lastBlock.type === "tool_result")
 				) {
-					(lastBlock as any).cache_control = { type: "ephemeral" };
+					const cacheTtl = getCacheTtl(model.baseUrl);
+					(lastBlock as any).cache_control = { type: "ephemeral", ...(cacheTtl && { ttl: cacheTtl }) };
 				}
 			}
 		}
diff --git a/packages/ai/src/providers/openai-responses.ts b/packages/ai/src/providers/openai-responses.ts
index dbe9a4b1..21db9d60 100644
--- a/packages/ai/src/providers/openai-responses.ts
+++ b/packages/ai/src/providers/openai-responses.ts
@@ -18,6 +18,22 @@ import { buildBaseOptions, clampReasoning } from "./simple-options.js";
 
 const OPENAI_TOOL_CALL_PROVIDERS = new Set(["openai", "openai-codex", "opencode"]);
 
+/**
+ * Resolve OpenAI prompt cache retention from the PI_CACHE_RETENTION env var.
+ * Returns "24h" only when the variable is "long" and the host of baseUrl is exactly
+ * api.openai.com; otherwise undefined, which leaves the default (in-memory) in effect.
+ */
+function getPromptCacheRetention(baseUrl: string): "24h" | undefined {
+	if (typeof process === "undefined" || process.env.PI_CACHE_RETENTION !== "long") return undefined;
+	try {
+		// Match the parsed hostname, not a substring: a proxy whose URL merely contains
+		// "api.openai.com" (in its path or as part of a longer host) must stay unaffected.
+		return new URL(baseUrl).hostname === "api.openai.com" ? "24h" : undefined;
+	} catch {
+		return undefined; // baseUrl is not an absolute URL, so it cannot be the direct API
+	}
+}
+
 // OpenAI Responses-specific options
 export interface OpenAIResponsesOptions extends StreamOptions {
 	reasoningEffort?: "minimal" | "low" | "medium" | "high" | "xhigh";
@@ -175,6 +191,7 @@ function buildParams(model: Model<"openai-responses">, context: Context, options
 		input: messages,
 		stream: true,
 		prompt_cache_key: options?.sessionId,
+		prompt_cache_retention: getPromptCacheRetention(model.baseUrl),
 	};
 
 	if (options?.maxTokens) {
diff --git a/packages/ai/test/cache-retention.test.ts b/packages/ai/test/cache-retention.test.ts
new file mode 100644
index 00000000..2e585a47
--- /dev/null
+++ b/packages/ai/test/cache-retention.test.ts
@@ -0,0 +1,199 @@
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { getModel } from "../src/models.js";
+import { stream } from "../src/stream.js";
+import type { Context } from "../src/types.js";
+
+describe("Cache Retention (PI_CACHE_RETENTION)", () => {
+	const originalEnv = process.env.PI_CACHE_RETENTION;
+
+	beforeEach(() => {
+		delete process.env.PI_CACHE_RETENTION;
+	});
+
+	afterEach(() => {
+		if (originalEnv !== undefined) {
+			process.env.PI_CACHE_RETENTION = originalEnv;
+		} else {
+			delete process.env.PI_CACHE_RETENTION;
+		}
+	});
+
+	const context: Context = {
+		systemPrompt: "You are a helpful assistant.",
+		messages: [{ role: "user", content: "Hello", timestamp: Date.now() }],
+	};
+
+	describe("Anthropic Provider", () => {
+		it.skipIf(!process.env.ANTHROPIC_API_KEY)(
+			"should use default cache TTL (no ttl field) when PI_CACHE_RETENTION is not set",
+			async () => {
+				const model = getModel("anthropic", "claude-3-5-haiku-20241022");
+				let capturedPayload: any = null;
+
+				const s = stream(model, context, {
+					onPayload: (payload) => {
+						capturedPayload = payload;
+					},
+				});
+
+				// Consume the stream to trigger the request
+				for await (const _ of s) {
+					// Just consume
+				}
+
+				expect(capturedPayload).not.toBeNull();
+				// System prompt should have cache_control without ttl
+				expect(capturedPayload.system).toBeDefined();
+				expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" });
+			},
+		);
+
+		it.skipIf(!process.env.ANTHROPIC_API_KEY)("should use 1h cache TTL when PI_CACHE_RETENTION=long", async () => {
+			process.env.PI_CACHE_RETENTION = "long";
+			const model = getModel("anthropic", "claude-3-5-haiku-20241022");
+			let capturedPayload: any = null;
+
+			const s = stream(model, context, {
+				onPayload: (payload) => {
+					capturedPayload = payload;
+				},
+			});
+
+			// Consume the stream to trigger the request
+			for await (const _ of s) {
+				// Just consume
+			}
+
+			expect(capturedPayload).not.toBeNull();
+			// System prompt should have cache_control with ttl: "1h"
+			expect(capturedPayload.system).toBeDefined();
+			expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral", ttl: "1h" });
+		});
+
+		it("should not add ttl when baseUrl is not api.anthropic.com", async () => {
+			process.env.PI_CACHE_RETENTION = "long";
+
+			// Create a model with a different baseUrl (simulating a proxy)
+			const baseModel = getModel("anthropic", "claude-3-5-haiku-20241022");
+			const proxyModel = {
+				...baseModel,
+				baseUrl: "https://my-proxy.example.com/v1",
+			};
+
+			let capturedPayload: any = null;
+
+			// The proxy host is fake, so the request itself is expected to fail. That is
+			// fine: onPayload is expected to fire with the request body before the request
+			// is sent, which is all this test needs to inspect.
+
+			// Call the provider's stream function directly, with a fake API key, so the
+			// test does not depend on credentials being configured in the environment.
+			const { streamAnthropic } = await import("../src/providers/anthropic.js");
+
+			try {
+				const s = streamAnthropic(proxyModel, context, {
+					apiKey: "fake-key",
+					onPayload: (payload) => {
+						capturedPayload = payload;
+					},
+				});
+
+				// This will fail since we're using a fake key and fake proxy, but the payload should be captured
+				for await (const event of s) {
+					if (event.type === "error") break;
+				}
+			} catch {
+				// Expected to fail
+			}
+
+			// The payload must have been captured before the request failed; asserting this
+			// keeps the test from passing vacuously when onPayload is never invoked.
+			expect(capturedPayload).not.toBeNull();
+			// System prompt should have cache_control WITHOUT ttl (proxy URL)
+			expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" });
+		});
+	});
+
+	describe("OpenAI Responses Provider", () => {
+		it.skipIf(!process.env.OPENAI_API_KEY)(
+			"should not set prompt_cache_retention when PI_CACHE_RETENTION is not set",
+			async () => {
+				const model = getModel("openai", "gpt-4o-mini");
+				let capturedPayload: any = null;
+
+				const s = stream(model, context, {
+					onPayload: (payload) => {
+						capturedPayload = payload;
+					},
+				});
+
+				// Consume the stream to trigger the request
+				for await (const _ of s) {
+					// Just consume
+				}
+
+				expect(capturedPayload).not.toBeNull();
+				expect(capturedPayload.prompt_cache_retention).toBeUndefined();
+			},
+		);
+
+		it.skipIf(!process.env.OPENAI_API_KEY)(
+			"should set prompt_cache_retention to 24h when PI_CACHE_RETENTION=long",
+			async () => {
+				process.env.PI_CACHE_RETENTION = "long";
+				const model = getModel("openai", "gpt-4o-mini");
+				let capturedPayload: any = null;
+
+				const s = stream(model, context, {
+					onPayload: (payload) => {
+						capturedPayload = payload;
+					},
+				});
+
+				// Consume the stream to trigger the request
+				for await (const _ of s) {
+					// Just consume
+				}
+
+				expect(capturedPayload).not.toBeNull();
+				expect(capturedPayload.prompt_cache_retention).toBe("24h");
+			},
+		);
+
+		it("should not set prompt_cache_retention when baseUrl is not api.openai.com", async () => {
+			process.env.PI_CACHE_RETENTION = "long";
+
+			// Create a model with a different baseUrl (simulating a proxy)
+			const baseModel = getModel("openai", "gpt-4o-mini");
+			const proxyModel = {
+				...baseModel,
+				baseUrl: "https://my-proxy.example.com/v1",
+			};
+
+			let capturedPayload: any = null;
+
+			const { streamOpenAIResponses } = await import("../src/providers/openai-responses.js");
+
+			try {
+				const s = streamOpenAIResponses(proxyModel, context, {
+					apiKey: "fake-key",
+					onPayload: (payload) => {
+						capturedPayload = payload;
+					},
+				});
+
+				// This will fail since we're using a fake key and fake proxy, but the payload should be captured
+				for await (const event of s) {
+					if (event.type === "error") break;
+				}
+			} catch {
+				// Expected to fail
+			}
+
+			// The payload must have been captured before the request failed; asserting this
+			// keeps the test from passing vacuously when onPayload is never invoked.
+			expect(capturedPayload).not.toBeNull();
+			expect(capturedPayload.prompt_cache_retention).toBeUndefined();
+		});
+	});
+});
diff --git a/packages/coding-agent/README.md b/packages/coding-agent/README.md
index db9c4b9a..06fd497c 100644
--- a/packages/coding-agent/README.md
+++ b/packages/coding-agent/README.md
@@ -525,6 +525,7 @@ pi --thinking high "Solve this complex problem"
 |----------|-------------|
 | `PI_CODING_AGENT_DIR` | Override config directory (default: `~/.pi/agent`) |
 | `PI_SKIP_VERSION_CHECK` | Skip version check at startup |
+| `PI_CACHE_RETENTION` | Set to `long` for extended prompt cache (Anthropic: 1h, OpenAI: 24h) |
 | `VISUAL`, `EDITOR` | External editor for Ctrl+G |
 
 ---