mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-17 10:02:23 +00:00
feat(ai): add PI_CACHE_RETENTION env var for extended prompt caching
Adds support for extended prompt cache retention via `PI_CACHE_RETENTION=long`: Anthropic cache TTL is raised from 5m to 1h, and OpenAI retention from in-memory to 24h. This only applies to direct API calls (api.anthropic.com, api.openai.com); proxies and other providers are unaffected. Fixes #967
This commit is contained in:
parent
605f6f494b
commit
1b6a147579
6 changed files with 256 additions and 1 deletions
|
|
@ -2,6 +2,10 @@
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- Added `PI_CACHE_RETENTION` environment variable to control cache TTL for Anthropic (5m vs 1h) and OpenAI (in-memory vs 24h). Set to `long` for extended retention. Only applies to direct API calls (api.anthropic.com, api.openai.com). ([#967](https://github.com/badlogic/pi-mono/issues/967))
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
||||||
- Fixed cross-provider handoff failing when switching from OpenAI Responses API providers (github-copilot, openai-codex) to other providers due to pipe-separated tool call IDs not being normalized, and trailing underscores in truncated IDs being rejected by OpenAI Codex ([#1022](https://github.com/badlogic/pi-mono/issues/1022))
|
- Fixed cross-provider handoff failing when switching from OpenAI Responses API providers (github-copilot, openai-codex) to other providers due to pipe-separated tool call IDs not being normalized, and trailing underscores in truncated IDs being rejected by OpenAI Codex ([#1022](https://github.com/badlogic/pi-mono/issues/1022))
|
||||||
|
|
|
||||||
|
|
@ -909,6 +909,19 @@ const response = await complete(model, context, {
|
||||||
});
|
});
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Cache Retention
|
||||||
|
|
||||||
|
Set `PI_CACHE_RETENTION=long` to extend prompt cache retention:
|
||||||
|
|
||||||
|
| Provider | Default | With `PI_CACHE_RETENTION=long` |
|
||||||
|
|----------|---------|-------------------------------|
|
||||||
|
| Anthropic | 5 minutes | 1 hour |
|
||||||
|
| OpenAI | in-memory | 24 hours |
|
||||||
|
|
||||||
|
This only affects direct API calls to `api.anthropic.com` and `api.openai.com`. Proxies and other providers are unaffected.
|
||||||
|
|
||||||
|
> **Note**: Extended cache retention may increase costs for Anthropic (cache writes are charged at a higher rate). OpenAI's 24h retention has no additional cost.
|
||||||
|
|
||||||
### Checking Environment Variables
|
### Checking Environment Variables
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,22 @@ import { sanitizeSurrogates } from "../utils/sanitize-unicode.js";
|
||||||
import { adjustMaxTokensForThinking, buildBaseOptions } from "./simple-options.js";
|
import { adjustMaxTokensForThinking, buildBaseOptions } from "./simple-options.js";
|
||||||
import { transformMessages } from "./transform-messages.js";
|
import { transformMessages } from "./transform-messages.js";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get cache TTL based on PI_CACHE_RETENTION env var.
|
||||||
|
* Only applies to direct Anthropic API calls (api.anthropic.com).
|
||||||
|
* Returns '1h' for long retention, undefined for default (5m).
|
||||||
|
*/
|
||||||
|
function getCacheTtl(baseUrl: string): "1h" | undefined {
|
||||||
|
if (
|
||||||
|
typeof process !== "undefined" &&
|
||||||
|
process.env.PI_CACHE_RETENTION === "long" &&
|
||||||
|
baseUrl.includes("api.anthropic.com")
|
||||||
|
) {
|
||||||
|
return "1h";
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
// Stealth mode: Mimic Claude Code's tool naming exactly
|
// Stealth mode: Mimic Claude Code's tool naming exactly
|
||||||
const claudeCodeVersion = "2.1.2";
|
const claudeCodeVersion = "2.1.2";
|
||||||
|
|
||||||
|
|
@ -452,6 +468,7 @@ function buildParams(
|
||||||
};
|
};
|
||||||
|
|
||||||
// For OAuth tokens, we MUST include Claude Code identity
|
// For OAuth tokens, we MUST include Claude Code identity
|
||||||
|
const cacheTtl = getCacheTtl(model.baseUrl);
|
||||||
if (isOAuthToken) {
|
if (isOAuthToken) {
|
||||||
params.system = [
|
params.system = [
|
||||||
{
|
{
|
||||||
|
|
@ -459,6 +476,7 @@ function buildParams(
|
||||||
text: "You are Claude Code, Anthropic's official CLI for Claude.",
|
text: "You are Claude Code, Anthropic's official CLI for Claude.",
|
||||||
cache_control: {
|
cache_control: {
|
||||||
type: "ephemeral",
|
type: "ephemeral",
|
||||||
|
...(cacheTtl && { ttl: cacheTtl }),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
@ -468,6 +486,7 @@ function buildParams(
|
||||||
text: sanitizeSurrogates(context.systemPrompt),
|
text: sanitizeSurrogates(context.systemPrompt),
|
||||||
cache_control: {
|
cache_control: {
|
||||||
type: "ephemeral",
|
type: "ephemeral",
|
||||||
|
...(cacheTtl && { ttl: cacheTtl }),
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -479,6 +498,7 @@ function buildParams(
|
||||||
text: sanitizeSurrogates(context.systemPrompt),
|
text: sanitizeSurrogates(context.systemPrompt),
|
||||||
cache_control: {
|
cache_control: {
|
||||||
type: "ephemeral",
|
type: "ephemeral",
|
||||||
|
...(cacheTtl && { ttl: cacheTtl }),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
@ -655,7 +675,8 @@ function convertMessages(
|
||||||
lastBlock &&
|
lastBlock &&
|
||||||
(lastBlock.type === "text" || lastBlock.type === "image" || lastBlock.type === "tool_result")
|
(lastBlock.type === "text" || lastBlock.type === "image" || lastBlock.type === "tool_result")
|
||||||
) {
|
) {
|
||||||
(lastBlock as any).cache_control = { type: "ephemeral" };
|
const cacheTtl = getCacheTtl(model.baseUrl);
|
||||||
|
(lastBlock as any).cache_control = { type: "ephemeral", ...(cacheTtl && { ttl: cacheTtl }) };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,22 @@ import { buildBaseOptions, clampReasoning } from "./simple-options.js";
|
||||||
|
|
||||||
const OPENAI_TOOL_CALL_PROVIDERS = new Set(["openai", "openai-codex", "opencode"]);
|
const OPENAI_TOOL_CALL_PROVIDERS = new Set(["openai", "openai-codex", "opencode"]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get prompt cache retention based on PI_CACHE_RETENTION env var.
|
||||||
|
* Only applies to direct OpenAI API calls (api.openai.com).
|
||||||
|
* Returns '24h' for long retention, undefined for default (in-memory).
|
||||||
|
*/
|
||||||
|
function getPromptCacheRetention(baseUrl: string): "24h" | undefined {
|
||||||
|
if (
|
||||||
|
typeof process !== "undefined" &&
|
||||||
|
process.env.PI_CACHE_RETENTION === "long" &&
|
||||||
|
baseUrl.includes("api.openai.com")
|
||||||
|
) {
|
||||||
|
return "24h";
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
// OpenAI Responses-specific options
|
// OpenAI Responses-specific options
|
||||||
export interface OpenAIResponsesOptions extends StreamOptions {
|
export interface OpenAIResponsesOptions extends StreamOptions {
|
||||||
reasoningEffort?: "minimal" | "low" | "medium" | "high" | "xhigh";
|
reasoningEffort?: "minimal" | "low" | "medium" | "high" | "xhigh";
|
||||||
|
|
@ -175,6 +191,7 @@ function buildParams(model: Model<"openai-responses">, context: Context, options
|
||||||
input: messages,
|
input: messages,
|
||||||
stream: true,
|
stream: true,
|
||||||
prompt_cache_key: options?.sessionId,
|
prompt_cache_key: options?.sessionId,
|
||||||
|
prompt_cache_retention: getPromptCacheRetention(model.baseUrl),
|
||||||
};
|
};
|
||||||
|
|
||||||
if (options?.maxTokens) {
|
if (options?.maxTokens) {
|
||||||
|
|
|
||||||
199
packages/ai/test/cache-retention.test.ts
Normal file
199
packages/ai/test/cache-retention.test.ts
Normal file
|
|
@ -0,0 +1,199 @@
|
||||||
|
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||||
|
import { getModel } from "../src/models.js";
|
||||||
|
import { stream } from "../src/stream.js";
|
||||||
|
import type { Context } from "../src/types.js";
|
||||||
|
|
||||||
|
describe("Cache Retention (PI_CACHE_RETENTION)", () => {
|
||||||
|
const originalEnv = process.env.PI_CACHE_RETENTION;
|
||||||
|
|
||||||
|
beforeEach(() => {
|
||||||
|
delete process.env.PI_CACHE_RETENTION;
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
if (originalEnv !== undefined) {
|
||||||
|
process.env.PI_CACHE_RETENTION = originalEnv;
|
||||||
|
} else {
|
||||||
|
delete process.env.PI_CACHE_RETENTION;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
const context: Context = {
|
||||||
|
systemPrompt: "You are a helpful assistant.",
|
||||||
|
messages: [{ role: "user", content: "Hello", timestamp: Date.now() }],
|
||||||
|
};
|
||||||
|
|
||||||
|
describe("Anthropic Provider", () => {
|
||||||
|
it.skipIf(!process.env.ANTHROPIC_API_KEY)(
|
||||||
|
"should use default cache TTL (no ttl field) when PI_CACHE_RETENTION is not set",
|
||||||
|
async () => {
|
||||||
|
const model = getModel("anthropic", "claude-3-5-haiku-20241022");
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
const s = stream(model, context, {
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Consume the stream to trigger the request
|
||||||
|
for await (const _ of s) {
|
||||||
|
// Just consume
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(capturedPayload).not.toBeNull();
|
||||||
|
// System prompt should have cache_control without ttl
|
||||||
|
expect(capturedPayload.system).toBeDefined();
|
||||||
|
expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" });
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
it.skipIf(!process.env.ANTHROPIC_API_KEY)("should use 1h cache TTL when PI_CACHE_RETENTION=long", async () => {
|
||||||
|
process.env.PI_CACHE_RETENTION = "long";
|
||||||
|
const model = getModel("anthropic", "claude-3-5-haiku-20241022");
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
const s = stream(model, context, {
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Consume the stream to trigger the request
|
||||||
|
for await (const _ of s) {
|
||||||
|
// Just consume
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(capturedPayload).not.toBeNull();
|
||||||
|
// System prompt should have cache_control with ttl: "1h"
|
||||||
|
expect(capturedPayload.system).toBeDefined();
|
||||||
|
expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral", ttl: "1h" });
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should not add ttl when baseUrl is not api.anthropic.com", async () => {
|
||||||
|
process.env.PI_CACHE_RETENTION = "long";
|
||||||
|
|
||||||
|
// Create a model with a different baseUrl (simulating a proxy)
|
||||||
|
const baseModel = getModel("anthropic", "claude-3-5-haiku-20241022");
|
||||||
|
const proxyModel = {
|
||||||
|
...baseModel,
|
||||||
|
baseUrl: "https://my-proxy.example.com/v1",
|
||||||
|
};
|
||||||
|
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
// We can't actually make the request (no proxy), but we can verify the payload
|
||||||
|
// by using a mock or checking the logic directly
|
||||||
|
// For this test, we'll import the helper directly
|
||||||
|
|
||||||
|
// Since we can't easily test this without mocking, we'll skip the actual API call
|
||||||
|
// and just verify the helper logic works correctly
|
||||||
|
const { streamAnthropic } = await import("../src/providers/anthropic.js");
|
||||||
|
|
||||||
|
try {
|
||||||
|
const s = streamAnthropic(proxyModel, context, {
|
||||||
|
apiKey: "fake-key",
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// This will fail since we're using a fake key and fake proxy, but the payload should be captured
|
||||||
|
for await (const event of s) {
|
||||||
|
if (event.type === "error") break;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Expected to fail
|
||||||
|
}
|
||||||
|
|
||||||
|
// The payload should have been captured before the error
|
||||||
|
if (capturedPayload) {
|
||||||
|
// System prompt should have cache_control WITHOUT ttl (proxy URL)
|
||||||
|
expect(capturedPayload.system[0].cache_control).toEqual({ type: "ephemeral" });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("OpenAI Responses Provider", () => {
|
||||||
|
it.skipIf(!process.env.OPENAI_API_KEY)(
|
||||||
|
"should not set prompt_cache_retention when PI_CACHE_RETENTION is not set",
|
||||||
|
async () => {
|
||||||
|
const model = getModel("openai", "gpt-4o-mini");
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
const s = stream(model, context, {
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Consume the stream to trigger the request
|
||||||
|
for await (const _ of s) {
|
||||||
|
// Just consume
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(capturedPayload).not.toBeNull();
|
||||||
|
expect(capturedPayload.prompt_cache_retention).toBeUndefined();
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
it.skipIf(!process.env.OPENAI_API_KEY)(
|
||||||
|
"should set prompt_cache_retention to 24h when PI_CACHE_RETENTION=long",
|
||||||
|
async () => {
|
||||||
|
process.env.PI_CACHE_RETENTION = "long";
|
||||||
|
const model = getModel("openai", "gpt-4o-mini");
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
const s = stream(model, context, {
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Consume the stream to trigger the request
|
||||||
|
for await (const _ of s) {
|
||||||
|
// Just consume
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(capturedPayload).not.toBeNull();
|
||||||
|
expect(capturedPayload.prompt_cache_retention).toBe("24h");
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
it("should not set prompt_cache_retention when baseUrl is not api.openai.com", async () => {
|
||||||
|
process.env.PI_CACHE_RETENTION = "long";
|
||||||
|
|
||||||
|
// Create a model with a different baseUrl (simulating a proxy)
|
||||||
|
const baseModel = getModel("openai", "gpt-4o-mini");
|
||||||
|
const proxyModel = {
|
||||||
|
...baseModel,
|
||||||
|
baseUrl: "https://my-proxy.example.com/v1",
|
||||||
|
};
|
||||||
|
|
||||||
|
let capturedPayload: any = null;
|
||||||
|
|
||||||
|
const { streamOpenAIResponses } = await import("../src/providers/openai-responses.js");
|
||||||
|
|
||||||
|
try {
|
||||||
|
const s = streamOpenAIResponses(proxyModel, context, {
|
||||||
|
apiKey: "fake-key",
|
||||||
|
onPayload: (payload) => {
|
||||||
|
capturedPayload = payload;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// This will fail since we're using a fake key and fake proxy, but the payload should be captured
|
||||||
|
for await (const event of s) {
|
||||||
|
if (event.type === "error") break;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Expected to fail
|
||||||
|
}
|
||||||
|
|
||||||
|
// The payload should have been captured before the error
|
||||||
|
if (capturedPayload) {
|
||||||
|
expect(capturedPayload.prompt_cache_retention).toBeUndefined();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
@ -525,6 +525,7 @@ pi --thinking high "Solve this complex problem"
|
||||||
|----------|-------------|
|
|----------|-------------|
|
||||||
| `PI_CODING_AGENT_DIR` | Override config directory (default: `~/.pi/agent`) |
|
| `PI_CODING_AGENT_DIR` | Override config directory (default: `~/.pi/agent`) |
|
||||||
| `PI_SKIP_VERSION_CHECK` | Skip version check at startup |
|
| `PI_SKIP_VERSION_CHECK` | Skip version check at startup |
|
||||||
|
| `PI_CACHE_RETENTION` | Set to `long` for extended prompt cache (Anthropic: 1h, OpenAI: 24h) |
|
||||||
| `VISUAL`, `EDITOR` | External editor for Ctrl+G |
|
| `VISUAL`, `EDITOR` | External editor for Ctrl+G |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue