From 1b6a147579cbe9e7b593259051d507319cc6b127 Mon Sep 17 00:00:00 2001 From: Mario Zechner Date: Thu, 29 Jan 2026 02:22:06 +0100 Subject: [PATCH] feat(ai): add PI_CACHE_RETENTION env var for extended prompt caching Adds support for extended cache retention via PI_CACHE_RETENTION=long: - Anthropic: 5m -> 1h TTL - OpenAI: in-memory -> 24h retention Only applies to direct API calls (api.anthropic.com, api.openai.com). Proxies and other providers are unaffected. fixes #967 --- packages/ai/CHANGELOG.md | 4 + packages/ai/README.md | 13 ++ packages/ai/src/providers/anthropic.ts | 23 +- packages/ai/src/providers/openai-responses.ts | 17 ++ packages/ai/test/cache-retention.test.ts | 199 ++++++++++++++++++ packages/coding-agent/README.md | 1 + 6 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 packages/ai/test/cache-retention.test.ts diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md index db90dbc1..4159ccc7 100644 --- a/packages/ai/CHANGELOG.md +++ b/packages/ai/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Added + +- Added `PI_CACHE_RETENTION` environment variable to control cache TTL for Anthropic (5m vs 1h) and OpenAI (in-memory vs 24h). Set to `long` for extended retention. Only applies to direct API calls (api.anthropic.com, api.openai.com). 
([#967](https://github.com/badlogic/pi-mono/issues/967)) + ### Fixed - Fixed cross-provider handoff failing when switching from OpenAI Responses API providers (github-copilot, openai-codex) to other providers due to pipe-separated tool call IDs not being normalized, and trailing underscores in truncated IDs being rejected by OpenAI Codex ([#1022](https://github.com/badlogic/pi-mono/issues/1022)) diff --git a/packages/ai/README.md b/packages/ai/README.md index cf58728b..6207e4b7 100644 --- a/packages/ai/README.md +++ b/packages/ai/README.md @@ -909,6 +909,19 @@ const response = await complete(model, context, { }); ``` +#### Cache Retention + +Set `PI_CACHE_RETENTION=long` to extend prompt cache retention: + +| Provider | Default | With `PI_CACHE_RETENTION=long` | +|----------|---------|-------------------------------| +| Anthropic | 5 minutes | 1 hour | +| OpenAI | in-memory | 24 hours | + +This only affects direct API calls to `api.anthropic.com` and `api.openai.com`. Proxies and other providers are unaffected. + +> **Note**: Extended cache retention may increase costs for Anthropic (cache writes are charged at a higher rate). OpenAI's 24h retention has no additional cost. + ### Checking Environment Variables ```typescript diff --git a/packages/ai/src/providers/anthropic.ts b/packages/ai/src/providers/anthropic.ts index 6e99e852..6755c4c4 100644 --- a/packages/ai/src/providers/anthropic.ts +++ b/packages/ai/src/providers/anthropic.ts @@ -30,6 +30,22 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode.js"; import { adjustMaxTokensForThinking, buildBaseOptions } from "./simple-options.js"; import { transformMessages } from "./transform-messages.js"; +/** + * Get cache TTL based on PI_CACHE_RETENTION env var. + * Only applies to direct Anthropic API calls (api.anthropic.com). + * Returns '1h' for long retention, undefined for default (5m). 
/**
 * Resolve the Anthropic prompt-cache TTL from the `PI_CACHE_RETENTION` env var.
 *
 * Extended retention is only requested for direct Anthropic API calls, i.e.
 * when the host of `baseUrl` is exactly `api.anthropic.com`. The host is
 * parsed and compared exactly instead of substring-matched, so proxies whose
 * path or host merely contains that text (for example
 * `https://proxy.example.com/api.anthropic.com` or
 * `https://api.anthropic.com.example.org`) keep the default TTL.
 *
 * @param baseUrl - Base URL of the model endpoint. A bare host without a
 *   scheme (e.g. `api.anthropic.com`) is accepted and treated as https.
 * @returns `"1h"` when `PI_CACHE_RETENTION` is `long` and the endpoint is the
 *   direct Anthropic API; otherwise `undefined`, which leaves the default
 *   (5m) retention in place.
 */
function getCacheTtl(baseUrl: string): "1h" | undefined {
	// `process` does not exist in browser builds, so guard before touching env.
	const wantsLongRetention = typeof process !== "undefined" && process.env.PI_CACHE_RETENTION === "long";
	if (!wantsLongRetention) {
		return undefined;
	}

	// Give scheme-less values a scheme so the URL parser can extract the host.
	const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(baseUrl);
	const candidate = hasScheme ? baseUrl : `https://${baseUrl}`;

	let hostname: string;
	try {
		hostname = new URL(candidate).hostname.toLowerCase();
	} catch {
		// Unparseable base URL: cannot prove it is the direct API, keep the default.
		return undefined;
	}

	return hostname === "api.anthropic.com" ? "1h" : undefined;
}
/**
 * Resolve the OpenAI prompt-cache retention from the `PI_CACHE_RETENTION` env var.
 *
 * Extended retention is only requested for direct OpenAI API calls, i.e. when
 * the host of `baseUrl` is exactly `api.openai.com`. The host is parsed and
 * compared exactly instead of substring-matched, so proxies whose path or
 * host merely contains that text (for example
 * `https://proxy.example.com/api.openai.com` or
 * `https://api.openai.com.example.org`) keep the default behaviour.
 *
 * @param baseUrl - Base URL of the model endpoint. A bare host without a
 *   scheme (e.g. `api.openai.com`) is accepted and treated as https.
 * @returns `"24h"` when `PI_CACHE_RETENTION` is `long` and the endpoint is the
 *   direct OpenAI API; otherwise `undefined`, which leaves the default
 *   (in-memory) retention in place.
 */
function getPromptCacheRetention(baseUrl: string): "24h" | undefined {
	// `process` does not exist in browser builds, so guard before touching env.
	const wantsLongRetention = typeof process !== "undefined" && process.env.PI_CACHE_RETENTION === "long";
	if (!wantsLongRetention) {
		return undefined;
	}

	// Give scheme-less values a scheme so the URL parser can extract the host.
	const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(baseUrl);
	const candidate = hasScheme ? baseUrl : `https://${baseUrl}`;

	let hostname: string;
	try {
		hostname = new URL(candidate).hostname.toLowerCase();
	} catch {
		// Unparseable base URL: cannot prove it is the direct API, keep the default.
		return undefined;
	}

	return hostname === "api.openai.com" ? "24h" : undefined;
}
const s = stream(model, context, { + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // Consume the stream to trigger the request + for await (const _ of s) { + // Just consume + } + + expect(capturedPayload).not.toBeNull(); + // System prompt should have cache_control without ttl + expect(capturedPayload.system).toBeDefined(); + expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" }); + }, + ); + + it.skipIf(!process.env.ANTHROPIC_API_KEY)("should use 1h cache TTL when PI_CACHE_RETENTION=long", async () => { + process.env.PI_CACHE_RETENTION = "long"; + const model = getModel("anthropic", "claude-3-5-haiku-20241022"); + let capturedPayload: any = null; + + const s = stream(model, context, { + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // Consume the stream to trigger the request + for await (const _ of s) { + // Just consume + } + + expect(capturedPayload).not.toBeNull(); + // System prompt should have cache_control with ttl: "1h" + expect(capturedPayload.system).toBeDefined(); + expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral", ttl: "1h" }); + }); + + it("should not add ttl when baseUrl is not api.anthropic.com", async () => { + process.env.PI_CACHE_RETENTION = "long"; + + // Create a model with a different baseUrl (simulating a proxy) + const baseModel = getModel("anthropic", "claude-3-5-haiku-20241022"); + const proxyModel = { + ...baseModel, + baseUrl: "https://my-proxy.example.com/v1", + }; + + let capturedPayload: any = null; + + // We can't actually make the request (no proxy), but we can verify the payload + // by using a mock or checking the logic directly + // For this test, we'll import the helper directly + + // Since we can't easily test this without mocking, we'll skip the actual API call + // and just verify the helper logic works correctly + const { streamAnthropic } = await import("../src/providers/anthropic.js"); + + try { + const s = 
streamAnthropic(proxyModel, context, { + apiKey: "fake-key", + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // This will fail since we're using a fake key and fake proxy, but the payload should be captured + for await (const event of s) { + if (event.type === "error") break; + } + } catch { + // Expected to fail + } + + // The payload should have been captured before the error + if (capturedPayload) { + // System prompt should have cache_control WITHOUT ttl (proxy URL) + expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" }); + } + }); + }); + + describe("OpenAI Responses Provider", () => { + it.skipIf(!process.env.OPENAI_API_KEY)( + "should not set prompt_cache_retention when PI_CACHE_RETENTION is not set", + async () => { + const model = getModel("openai", "gpt-4o-mini"); + let capturedPayload: any = null; + + const s = stream(model, context, { + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // Consume the stream to trigger the request + for await (const _ of s) { + // Just consume + } + + expect(capturedPayload).not.toBeNull(); + expect(capturedPayload.prompt_cache_retention).toBeUndefined(); + }, + ); + + it.skipIf(!process.env.OPENAI_API_KEY)( + "should set prompt_cache_retention to 24h when PI_CACHE_RETENTION=long", + async () => { + process.env.PI_CACHE_RETENTION = "long"; + const model = getModel("openai", "gpt-4o-mini"); + let capturedPayload: any = null; + + const s = stream(model, context, { + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // Consume the stream to trigger the request + for await (const _ of s) { + // Just consume + } + + expect(capturedPayload).not.toBeNull(); + expect(capturedPayload.prompt_cache_retention).toBe("24h"); + }, + ); + + it("should not set prompt_cache_retention when baseUrl is not api.openai.com", async () => { + process.env.PI_CACHE_RETENTION = "long"; + + // Create a model with a different baseUrl (simulating a proxy) + const 
baseModel = getModel("openai", "gpt-4o-mini"); + const proxyModel = { + ...baseModel, + baseUrl: "https://my-proxy.example.com/v1", + }; + + let capturedPayload: any = null; + + const { streamOpenAIResponses } = await import("../src/providers/openai-responses.js"); + + try { + const s = streamOpenAIResponses(proxyModel, context, { + apiKey: "fake-key", + onPayload: (payload) => { + capturedPayload = payload; + }, + }); + + // This will fail since we're using a fake key and fake proxy, but the payload should be captured + for await (const event of s) { + if (event.type === "error") break; + } + } catch { + // Expected to fail + } + + // The payload should have been captured before the error + if (capturedPayload) { + expect(capturedPayload.prompt_cache_retention).toBeUndefined(); + } + }); + }); +}); diff --git a/packages/coding-agent/README.md b/packages/coding-agent/README.md index db9c4b9a..06fd497c 100644 --- a/packages/coding-agent/README.md +++ b/packages/coding-agent/README.md @@ -525,6 +525,7 @@ pi --thinking high "Solve this complex problem" |----------|-------------| | `PI_CODING_AGENT_DIR` | Override config directory (default: `~/.pi/agent`) | | `PI_SKIP_VERSION_CHECK` | Skip version check at startup | +| `PI_CACHE_RETENTION` | Set to `long` for extended prompt cache (Anthropic: 1h, OpenAI: 24h) | | `VISUAL`, `EDITOR` | External editor for Ctrl+G | ---