diff --git a/packages/ai/CHANGELOG.md b/packages/ai/CHANGELOG.md index 7883aa9e..378d6ea7 100644 --- a/packages/ai/CHANGELOG.md +++ b/packages/ai/CHANGELOG.md @@ -4,6 +4,8 @@ ### Fixed +- **OpenAI Token Counting**: Fixed `usage.input` to exclude cached tokens for OpenAI providers. Previously, `input` included cached tokens, causing double-counting when calculating total context size via `input + cacheRead`. Now `input` represents non-cached input tokens across all providers, making `input + output + cacheRead + cacheWrite` the correct formula for total context size. + - **Fixed Claude Opus 4.5 cache pricing** (was 3x too expensive) - Corrected cache_read: $1.50 → $0.50 per MTok - Corrected cache_write: $18.75 → $6.25 per MTok diff --git a/packages/ai/src/providers/openai-completions.ts b/packages/ai/src/providers/openai-completions.ts index 5d4aaa9a..22f57503 100644 --- a/packages/ai/src/providers/openai-completions.ts +++ b/packages/ai/src/providers/openai-completions.ts @@ -105,12 +105,14 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = ( for await (const chunk of openaiStream) { if (chunk.usage) { + const cachedTokens = chunk.usage.prompt_tokens_details?.cached_tokens || 0; output.usage = { - input: chunk.usage.prompt_tokens || 0, + // OpenAI includes cached tokens in prompt_tokens, so subtract to get non-cached input + input: (chunk.usage.prompt_tokens || 0) - cachedTokens, output: (chunk.usage.completion_tokens || 0) + (chunk.usage.completion_tokens_details?.reasoning_tokens || 0), - cacheRead: chunk.usage.prompt_tokens_details?.cached_tokens || 0, + cacheRead: cachedTokens, cacheWrite: 0, cost: { input: 0, diff --git a/packages/ai/src/providers/openai-responses.ts b/packages/ai/src/providers/openai-responses.ts index 59d8cba1..45569b38 100644 --- a/packages/ai/src/providers/openai-responses.ts +++ b/packages/ai/src/providers/openai-responses.ts @@ -253,10 +253,12 @@ export const streamOpenAIResponses: 
StreamFunction<"openai-responses"> = ( else if (event.type === "response.completed") { const response = event.response; if (response?.usage) { + const cachedTokens = response.usage.input_tokens_details?.cached_tokens || 0; output.usage = { - input: response.usage.input_tokens || 0, + // OpenAI includes cached tokens in input_tokens, so subtract to get non-cached input + input: (response.usage.input_tokens || 0) - cachedTokens, output: response.usage.output_tokens || 0, - cacheRead: response.usage.input_tokens_details?.cached_tokens || 0, + cacheRead: cachedTokens, cacheWrite: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 }, }; diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md index ba83c9e0..b8242fe0 100644 --- a/packages/coding-agent/CHANGELOG.md +++ b/packages/coding-agent/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Added + +- **Branch Source Tracking**: Branched sessions now store `branchedFrom` in the session header, containing the path to the original session file. Useful for tracing session lineage. 
+ ## [0.12.5] - 2025-12-03 ### Added diff --git a/packages/coding-agent/docs/compaction.md b/packages/coding-agent/docs/compaction.md index de0431b2..0bece293 100644 --- a/packages/coding-agent/docs/compaction.md +++ b/packages/coding-agent/docs/compaction.md @@ -246,17 +246,18 @@ interface Settings { ``` **Why these defaults:** -- `reserveTokens: 16384` - Room for summary output (~8k) plus safety margin (~8k) +- `reserveTokens: 16384` - Room for summary output (~13k) plus safety margin (~3k) - `keepRecentTokens: 20000` - Preserves recent context verbatim, summary focuses on older content ### Token Calculation -Context tokens are calculated from assistant messages as: +Context tokens are calculated from the **last non-aborted assistant message** using the same formula as the footer: + ``` contextTokens = usage.input + usage.output + usage.cacheRead + usage.cacheWrite ``` -The diff between consecutive (non-aborted, non-error) assistant messages gives the tokens added by that turn. Verified against actual session files. +This gives total context size across all providers. The `input` field represents non-cached input tokens, so adding `cacheRead` and `cacheWrite` gives the true total input. **Trigger condition:** ```typescript @@ -353,12 +354,12 @@ u5, a5, u6, a6, t6, a6, u7, a7 Session loader finds COMPACTION 2 (latest), builds: ``` -[summary2_as_user_msg], u7, a7 +[summary2_as_user_msg], u6, a6, t6, a6, u7, a7 ``` Note: COMPACTION 2's summary incorporates COMPACTION 1's summary because the summarization model received the full current context (which included summary1 as first message). -When calculating `keepLastMessages` for COMPACTION 2, we only count messages between COMPACTION 1 and COMPACTION 2 (cannot cross the boundary). +**Boundary rule:** When calculating `keepLastMessages` for COMPACTION 2, we only count messages between COMPACTION 1 and COMPACTION 2. 
If `keepLastMessages` exceeds the available messages (e.g., keepLastMessages=10 but only 6 messages exist after COMPACTION 1), we take all available messages up to the boundary. We never cross a compaction boundary. ### Summarization @@ -367,6 +368,7 @@ Use **pi-ai directly** (not the full agent loop) for summarization: - Set `maxTokens` to `0.8 * reserveTokens` (leaves 20% for prompt overhead and safety margin) - Pass abort signal for cancellation - Use the currently selected model +- **Reasoning disabled** (thinking level "off") since we just need a summary, not extended reasoning With default `reserveTokens: 16384`, maxTokens = ~13107. @@ -407,8 +409,20 @@ Works in all modes: The `/branch` command lets users create a new session from a previous user message. With compaction: -- **Branch UI shows ALL user messages** in the session file, both before and after any compaction events -- **Branching copies the session file** up to the selected user message, including all compaction events and messages +- **Branch UI reads from session file directly** (not from `state.messages`) to show ALL user messages, including those before compaction events +- **Branching copies the raw session file** line-by-line up to (but excluding) the selected user message, preserving all compaction events and intermediate entries + +#### Why read from session file instead of state.messages + +After compaction, `state.messages` only contains `[summary_user_msg, ...kept_messages, ...new_messages]`. The pre-compaction messages are not in state. To allow branching to any historical point, we must read the session file directly. + +#### Reworked createBranchedSession + +Current implementation iterates `state.messages` and writes fresh entries. New implementation: +1. Read session file line by line +2. For each line, check if it's the target user message +3. Copy all lines up to (but excluding) the target user message +4. 
The target user message text goes into the editor #### Example: Branching After Compaction @@ -423,24 +437,25 @@ User branches at u3. New session file: ``` u1, a1, u2, a2 [COMPACTION: summary="...", keepLastMessages=2] -u3 ``` Session loader builds context for new session: ``` -[summary_as_user_msg], u2, a2, u3 +[summary_as_user_msg], u2, a2 ``` +User's editor contains u3's text for editing/resubmission. + #### Example: Branching Before Compaction Same session file, user branches at u2. New session file: ``` -u1, a1, u2 +u1, a1 ``` No compaction in new session. Session loader builds: ``` -u1, a1, u2 +u1, a1 ``` This effectively "undoes" the compaction, letting users recover if important context was lost. @@ -449,6 +464,12 @@ This effectively "undoes" the compaction, letting users recover if important con Auto-compaction is checked in the agent subscription callback after each `message_end` event for assistant messages. If context tokens exceed the threshold, compaction runs. +**Why abort mid-turn:** If auto-compaction triggers after an assistant message that contains tool calls, we abort immediately rather than waiting for tool results. Waiting would risk: +1. Tool results filling remaining context, leaving no room for the summary +2. Context overflow before the next checkpoint (agent_end) + +The abort causes some work loss, but the summary captures progress up to that point. + **Trigger flow (similar to `/clear` command):** ```typescript @@ -468,7 +489,7 @@ async handleAutoCompaction(): Promise<void> { this.statusContainer.clear(); // 4. Perform compaction on current state: - // - Generate summary using pi-ai directly + // - Generate summary using pi-ai directly (no tools, reasoning off) // - Write compaction event to session file // - Rebuild agent messages (summary as user msg + kept messages) // - Rebuild UI to reflect new state @@ -486,12 +507,13 @@ This mirrors the `/clear` command pattern: unsubscribe first to prevent processi 1. 
Add `compaction` field to `Settings` interface and `SettingsManager` 2. Add `CompactionEvent` type to session manager -3. Update session loader to handle compaction events -4. Add `/compact` command handler -5. Add `/autocompact` command with selector UI -6. Add auto-compaction check in subscription callback after assistant `message_end` -7. Implement `handleAutoCompaction()` following the unsubscribe/abort/wait/compact/resubscribe pattern -8. Implement summarization function using pi-ai -9. Add compaction event to RPC/JSON output types -10. Update footer to show when auto-compact is disabled -11. Ensure `/branch` UI shows all user messages (including pre-compaction) +3. Update session loader to handle compaction events (find latest, apply keepLastMessages with boundary rule) +4. Rework `createBranchedSession` to copy raw session file lines instead of re-serializing from state +5. Update `/branch` UI to read user messages from session file directly +6. Add `/compact` command handler +7. Add `/autocompact` command with selector UI +8. Add auto-compaction check in subscription callback after assistant `message_end` +9. Implement `handleAutoCompaction()` following the unsubscribe/abort/wait/compact/resubscribe pattern +10. Implement summarization function using pi-ai (no tools, reasoning off) +11. Add compaction event to RPC/JSON output types +12. Update footer to show when auto-compact is disabled diff --git a/packages/coding-agent/docs/session.md b/packages/coding-agent/docs/session.md index 7b949728..66499b77 100644 --- a/packages/coding-agent/docs/session.md +++ b/packages/coding-agent/docs/session.md @@ -26,6 +26,12 @@ First line of the file. Defines session metadata. 
{"type":"session","id":"uuid","timestamp":"2024-12-03T14:00:00.000Z","cwd":"/path/to/project","provider":"anthropic","modelId":"claude-sonnet-4-5","thinkingLevel":"off"} ``` +For branched sessions, includes the source session path: + +```json +{"type":"session","id":"uuid","timestamp":"2024-12-03T14:00:00.000Z","cwd":"/path/to/project","provider":"anthropic","modelId":"claude-sonnet-4-5","thinkingLevel":"off","branchedFrom":"/path/to/original/session.jsonl"} +``` + ### SessionMessageEntry A message in the conversation. The `message` field contains an `AppMessage` (see [rpc.md](./rpc.md#message-types)). diff --git a/packages/coding-agent/src/session-manager.ts b/packages/coding-agent/src/session-manager.ts index 6cd565b6..36b5dc8a 100644 --- a/packages/coding-agent/src/session-manager.ts +++ b/packages/coding-agent/src/session-manager.ts @@ -20,6 +20,7 @@ export interface SessionHeader { provider: string; modelId: string; thinkingLevel: string; + branchedFrom?: string; // Path to the session file this was branched from } export interface SessionMessageEntry { @@ -430,6 +431,7 @@ export class SessionManager { provider: state.model.provider, modelId: state.model.id, thinkingLevel: state.thinkingLevel, + branchedFrom: this.sessionFile, }; appendFileSync(newSessionFile, JSON.stringify(entry) + "\n");