From 3424550d215d1983084e8758976822185b6ec3c8 Mon Sep 17 00:00:00 2001 From: Mario Zechner Date: Wed, 17 Dec 2025 22:04:28 +0100 Subject: [PATCH] Improve documentation: README settings table, philosophy section, custom-tools intro, rpc hook_error event, hooks import aliases --- packages/coding-agent/CHANGELOG.md | 10 +- packages/coding-agent/README.md | 176 +++--- packages/coding-agent/docs/compaction-new.md | 387 ------------- .../docs/compaction-strategies.ts | 502 ----------------- packages/coding-agent/docs/compaction.md | 519 ------------------ packages/coding-agent/docs/custom-tools.md | 20 +- packages/coding-agent/docs/gemini.md | 255 --------- packages/coding-agent/docs/hooks.md | 14 + packages/coding-agent/docs/rpc.md | 14 + packages/coding-agent/docs/truncation.md | 235 -------- packages/coding-agent/docs/undercompaction.md | 313 ----------- .../coding-agent/src/core/hooks/loader.ts | 29 +- 12 files changed, 155 insertions(+), 2319 deletions(-) delete mode 100644 packages/coding-agent/docs/compaction-new.md delete mode 100644 packages/coding-agent/docs/compaction-strategies.ts delete mode 100644 packages/coding-agent/docs/compaction.md delete mode 100644 packages/coding-agent/docs/gemini.md delete mode 100644 packages/coding-agent/docs/truncation.md delete mode 100644 packages/coding-agent/docs/undercompaction.md diff --git a/packages/coding-agent/CHANGELOG.md b/packages/coding-agent/CHANGELOG.md index e10c392e..2944acfd 100644 --- a/packages/coding-agent/CHANGELOG.md +++ b/packages/coding-agent/CHANGELOG.md @@ -6,7 +6,15 @@ - Improved system prompt documentation section with clearer pointers to specific doc files for custom models, themes, skills, hooks, custom tools, and RPC. -- Cleaned up documentation: `theme.md` (added missing color tokens), `skills.md` (rewrote with better framing and examples), `hooks.md` (fixed timeout/error handling docs, added examples). 
+- Cleaned up documentation: + - `theme.md`: Added missing color tokens (`thinkingXhigh`, `bashMode`) + - `skills.md`: Rewrote with better framing and examples + - `hooks.md`: Fixed timeout/error handling docs, added import aliases section + - `custom-tools.md`: Added intro with use cases and comparison table + - `rpc.md`: Added missing `hook_error` event documentation + - `README.md`: Complete settings table, condensed philosophy section, standardized OAuth docs + +- Hooks loader now supports same import aliases as custom tools (`@sinclair/typebox`, `@mariozechner/pi-ai`, `@mariozechner/pi-tui`, `@mariozechner/pi-coding-agent`). ### Breaking Changes diff --git a/packages/coding-agent/README.md b/packages/coding-agent/README.md index 72fd2aa7..b4b8155e 100644 --- a/packages/coding-agent/README.md +++ b/packages/coding-agent/README.md @@ -9,7 +9,7 @@ Works on Linux, macOS, and Windows (requires bash; see [Windows Setup](#windows- - [Getting Started](#getting-started) - [Installation](#installation) - [Windows Setup](#windows-setup) - - [API Keys](#api-keys) + - [API Keys & OAuth](#api-keys--oauth) - [Quick Start](#quick-start) - [Usage](#usage) - [Slash Commands](#slash-commands) @@ -101,13 +101,13 @@ For most users, [Git for Windows](https://git-scm.com/download/win) is sufficien } ``` -### API Keys +### API Keys & OAuth Set the environment variable for your provider: | Provider | Environment Variable | |----------|---------------------| -| Anthropic | `ANTHROPIC_API_KEY` or `ANTHROPIC_OAUTH_TOKEN` | +| Anthropic | `ANTHROPIC_API_KEY` | | OpenAI | `OPENAI_API_KEY` | | Google | `GEMINI_API_KEY` | | Mistral | `MISTRAL_API_KEY` | @@ -117,31 +117,27 @@ Set the environment variable for your provider: | OpenRouter | `OPENROUTER_API_KEY` | | ZAI | `ZAI_API_KEY` | -The `/model` command only shows models for providers with configured API keys. 
- -**OAuth (Claude Pro/Max subscribers):** +**Anthropic OAuth (Claude Pro/Max):** ```bash pi /login # Select "Anthropic (Claude Pro/Max)", authorize in browser ``` -Tokens stored in `~/.pi/agent/oauth.json` (mode 0600). Use `/logout` to clear. +Tokens stored in `~/.pi/agent/oauth.json`. Use `/logout` to clear. -**GitHub Copilot:** +**GitHub Copilot OAuth:** ```bash pi /login # Select "GitHub Copilot", authorize in browser ``` -During login, you'll be prompted for an enterprise domain. Press Enter to use github.com, or enter your GitHub Enterprise Server domain (e.g., `github.mycompany.com`). All models are automatically enabled after login. +Press Enter to use github.com, or enter your GitHub Enterprise Server domain (e.g., `github.mycompany.com`). -If you get "The requested model is not supported" error, enable the model manually in VS Code: open Copilot Chat, click the model selector, select the model, and click "Enable". +If you get "The requested model is not supported" error, enable the model in VS Code: Copilot Chat → model selector → select model → "Enable". -For enterprise users, check with your organization's Copilot administrator for model availability. - -Tokens stored in `~/.pi/agent/oauth.json` (mode 0600). Use `/logout` to clear. +Tokens stored in `~/.pi/agent/oauth.json`. Use `/logout` to clear. ### Quick Start @@ -303,6 +299,8 @@ When disabled, neither case triggers automatic compaction (use `/compact` manual 3. Summary replaces old messages as "context handoff" 4. Previous compaction summaries chain into new ones +Compaction does not create a new session, but continues the existing one, with a marker in the `.jsonl` file that encodes the compaction point. + **Configuration** (`~/.pi/agent/settings.json`): ```json @@ -315,7 +313,7 @@ When disabled, neither case triggers automatic compaction (use `/compact` manual } ``` -> **Note:** Compaction is lossy. The agent loses full conversation access afterward. 
Size tasks to avoid context limits when possible. For critical context, ask the agent to write a summary to a file, then start a new session with that file. The full session history is preserved in the JSONL file; use `/branch` to revisit any previous point. +> **Note:** Compaction is lossy. The agent loses full conversation access afterward. Size tasks to avoid context limits when possible. For critical context, ask the agent to write a summary to a file, iterate on it until it covers everything, then start a new session with that file. The full session history is preserved in the JSONL file; use `/branch` to revisit any previous point. ### Branching @@ -427,6 +425,8 @@ Add custom models (Ollama, vLLM, LM Studio, etc.) via `~/.pi/agent/models.json`: 4. Saved default from settings 5. First available model with valid API key +> pi can help you create custom provider and model configurations. + ### Themes Built-in themes: `dark` (default), `light`. Auto-detected on first run. @@ -444,7 +444,7 @@ cp $(npm root -g)/@mariozechner/pi-coding-agent/dist/theme/dark.json ~/.pi/agent Select with `/theme`, then edit the file. Changes apply on save. -See [Theme Documentation](docs/theme.md) for all 44 color tokens. +> See [Theme Documentation](docs/theme.md) on how to create custom themes in detail. Pi can help you create a new one. **VS Code terminal fix:** Set `terminal.integrated.minimumContrastRatio` to `1` for accurate colors. @@ -490,6 +490,14 @@ Usage: `/component Button "onClick handler" "disabled support"` Skills are self-contained capability packages that the agent loads on-demand. A skill provides specialized workflows, setup instructions, helper scripts, and reference documentation for specific tasks. Skills are loaded when the agent decides a task matches the description, or when you explicitly ask to use one. 
+**Example use cases:** +- Web search and content extraction (Brave Search API) +- Browser automation via Chrome DevTools Protocol +- Google Calendar, Gmail, Drive integration +- PDF/DOCX processing and creation +- Speech-to-text transcription +- YouTube transcript extraction + **Skill locations:** - Pi user: `~/.pi/agent/skills/**/SKILL.md` (recursive) - Pi project: `.pi/skills/**/SKILL.md` (recursive) @@ -522,7 +530,7 @@ cd {baseDir} && npm install **Disable skills:** `pi --no-skills` or set `skills.enabled: false` in settings. -See [docs/skills.md](docs/skills.md) for details, examples, and links to skill repositories. +> See [docs/skills.md](docs/skills.md) for details, examples, and links to skill repositories. pi can help you create new skills. ### Hooks @@ -532,7 +540,7 @@ Hooks are TypeScript modules that extend pi's behavior by subscribing to lifecyc - **Checkpoint code state** (git stash at each turn, restore on `/branch`) - **Protect paths** (block writes to `.env`, `node_modules/`, etc.) - **Modify tool output** (filter or transform results before the LLM sees them) -- **Inject messages from external sources** (file watchers, webhooks, CI systems) +- **Inject messages from external sources to wake up the agent** (file watchers, webhooks, CI systems) **Hook locations:** - Global: `~/.pi/agent/hooks/*.ts` @@ -574,13 +582,13 @@ export default function (pi: HookAPI) { } ``` -See [Hooks Documentation](docs/hooks.md) for full API reference. +> See [Hooks Documentation](docs/hooks.md) for full API reference. pi can help you create new hooks. -See [examples/hooks/](examples/hooks/) for working examples including permission gates, git checkpointing, and path protection. +> See [examples/hooks/](examples/hooks/) for working examples including permission gates, git checkpointing, and path protection. ### Custom Tools -Custom tools extend pi with new capabilities beyond the built-in tools. 
They are TypeScript modules that define tools with optional custom TUI rendering. +Custom tools let you extend the built-in toolset (read, write, edit, bash, ...) and are called by the LLM directly. They are TypeScript modules that define tools with optional custom TUI integration for getting user input and for custom rendering of tool calls and results. **Tool locations:** - Global: `~/.pi/agent/tools/*.ts` @@ -619,12 +627,11 @@ export default factory; - Custom rendering via `renderCall()` and `renderResult()` methods - Streaming results via `onUpdate` callback - Abort handling via `signal` parameter -- Cleanup via `dispose()` method - Multiple tools from one factory (return an array) -See [Custom Tools Documentation](docs/custom-tools.md) for the full API reference, TUI component guide, and examples. +> See [Custom Tools Documentation](docs/custom-tools.md) for the full API reference, TUI component guide, and examples. pi can help you create custom tools. -See [examples/custom-tools/](examples/custom-tools/) for working examples including a todo list with session state management and a question tool with UI interaction. +> See [examples/custom-tools/](examples/custom-tools/) for working examples including a todo list with session state management and a question tool with UI interaction. 
### Settings File @@ -633,8 +640,13 @@ See [examples/custom-tools/](examples/custom-tools/) for working examples includ ```json { "theme": "dark", - "shellPath": "C:\\path\\to\\bash.exe", + "defaultProvider": "anthropic", + "defaultModel": "claude-sonnet-4-20250514", + "defaultThinkingLevel": "medium", "queueMode": "one-at-a-time", + "shellPath": "C:\\path\\to\\bash.exe", + "hideThinkingBlock": false, + "collapseChangelog": false, "compaction": { "enabled": true, "reserveTokens": 16384, @@ -650,17 +662,34 @@ See [examples/custom-tools/](examples/custom-tools/) for working examples includ }, "terminal": { "showImages": true - } + }, + "hooks": ["/path/to/hook.ts"], + "hookTimeout": 30000, + "customTools": ["/path/to/tool.ts"] } ``` -**Retry settings:** -- `enabled`: Auto-retry on transient errors (overloaded, rate limit, 5xx). Default: `true` -- `maxRetries`: Maximum retry attempts. Default: `3` -- `baseDelayMs`: Base delay for exponential backoff (2s, 4s, 8s). Default: `2000` - -**Terminal settings:** -- `showImages`: Render images inline in supported terminals. 
Default: `true` +| Setting | Description | Default | +|---------|-------------|---------| +| `theme` | Color theme name | auto-detected | +| `defaultProvider` | Default model provider | - | +| `defaultModel` | Default model ID | - | +| `defaultThinkingLevel` | Thinking level: `off`, `minimal`, `low`, `medium`, `high`, `xhigh` | - | +| `queueMode` | Message queue mode: `all` or `one-at-a-time` | `one-at-a-time` | +| `shellPath` | Custom bash path (Windows) | auto-detected | +| `hideThinkingBlock` | Hide thinking blocks in output (Ctrl+T to toggle) | `false` | +| `collapseChangelog` | Show condensed changelog after update | `false` | +| `compaction.enabled` | Enable auto-compaction | `true` | +| `compaction.reserveTokens` | Tokens to reserve before compaction triggers | `16384` | +| `compaction.keepRecentTokens` | Recent tokens to keep after compaction | `20000` | +| `skills.enabled` | Enable skills discovery | `true` | +| `retry.enabled` | Auto-retry on transient errors | `true` | +| `retry.maxRetries` | Maximum retry attempts | `3` | +| `retry.baseDelayMs` | Base delay for exponential backoff | `2000` | +| `terminal.showImages` | Render images inline (supported terminals) | `true` | +| `hooks` | Additional hook file paths | `[]` | +| `hookTimeout` | Timeout for hook operations (ms) | `30000` | +| `customTools` | Additional custom tool file paths | `[]` | --- @@ -768,32 +797,7 @@ Available via `--tools` flag: Example: `--tools read,grep,find,ls` for code review without modification. -### Custom Tools - -Pi relies on CLI tools invoked via bash rather than MCP. Create a tool with a README: - -`~/agent-tools/screenshot/README.md`: -```markdown -# Screenshot Tool -Takes a screenshot of your main display. - -## Usage -```bash -screenshot.sh -``` -Returns the path to the saved PNG. 
-``` - -`~/agent-tools/screenshot/screenshot.sh`: -```bash -#!/bin/bash -screencapture -x /tmp/screenshot-$(date +%s).png -ls -t /tmp/screenshot-*.png | head -1 -``` - -Usage: "Read ~/agent-tools/screenshot/README.md and take a screenshot" - -Reference tool READMEs in `AGENTS.md` to make them automatically available. +For adding new tools, see [Custom Tools](#custom-tools) in the Configuration section. --- @@ -830,59 +834,21 @@ Works with both session files and streaming event logs from `--mode json`. ## Philosophy -Pi is opinionated about what it won't do. These are intentional design decisions. +Pi is opinionated about what it won't do. These are intentional design decisions to minimize context bloat and avoid anti-patterns. -### No MCP +**No MCP.** Build CLI tools with READMEs (see [Skills](#skills)). The agent reads them on demand. [Would you like to know more?](https://mariozechner.at/posts/2025-11-02-what-if-you-dont-need-mcp/) -Pi does not support MCP (Model Context Protocol). Instead, it relies on four core tools (read, write, edit, bash) and assumes the agent can invoke CLI tools or write them as needed. +**No sub-agents.** Spawn pi instances via tmux, or build a task tool with [custom tools](#custom-tools). Full observability and steerability. -CLI tools are simpler: any executable with a README works. No protocol overhead, no server management. The agent reads the README and uses bash. +**No permission popups.** Security theater. Run in a container or build your own with [Hooks](#hooks). -See: [What if you don't need MCP?](https://mariozechner.at/posts/2025-11-02-what-if-you-dont-need-mcp/) +**No plan mode.** Gather context in one session, write plans to file, start fresh for implementation. -### No Sub-Agents +**No built-in to-dos.** They confuse models. Use a TODO.md file, or [build your own](examples/custom-tools/todo.ts) with [custom tools](#custom-tools). -If the agent needs to delegate, it can spawn `pi` via bash or write a custom tool. 
Built-in sub-agents transfer context poorly; information gets lost or misrepresented. For parallel work, run multiple `pi` sessions in different terminals. +**No background bash.** Use tmux. Full observability, direct interaction. -### No Built-in To-Dos - -To-do lists confuse models more than they help. For task tracking, use a file: - -```markdown -# TODO.md -- [x] Implement authentication -- [ ] Write API docs -``` - -### No Planning Mode - -Tell the agent to think through problems without modifying files. For persistent plans, write to a file: - -```markdown -# PLAN.md -## Goal -Refactor auth to support OAuth -## Current Step -Working on authorization endpoints -``` - -### No Permission System (YOLO Mode) - -Pi runs with full filesystem access and no permission prompts. Why: -- Permission systems add friction while being easily circumvented -- Pre-checking for "dangerous" patterns causes latency and false positives - -**Risks:** -- Can read, write, delete anything with your user privileges -- Prompt injection via files or command output can influence behavior - -**Mitigations:** -- Run in a container if uncomfortable -- Don't use on systems with sensitive data you can't afford to lose - -### No Background Bash - -Use `tmux` or similar. Bonus: you can watch the agent interact with CLIs and intervene if needed. +Read the [blog post](https://mariozechner.at/posts/2025-11-30-pi-coding-agent/) for the full rationale. --- @@ -917,7 +883,7 @@ Never use `__dirname` directly for package assets. ### Debug Command -`/debug` (hidden) writes rendered lines with ANSI codes to `~/.pi/agent/pi-debug.log` for TUI debugging. +`/debug` (hidden) writes rendered lines with ANSI codes to `~/.pi/agent/pi-debug.log` for TUI debugging, as well as the last set of messages that were sent to the LLM. For architecture and contribution guidelines, see [DEVELOPMENT.md](./DEVELOPMENT.md). 
diff --git a/packages/coding-agent/docs/compaction-new.md b/packages/coding-agent/docs/compaction-new.md deleted file mode 100644 index ddf1417d..00000000 --- a/packages/coding-agent/docs/compaction-new.md +++ /dev/null @@ -1,387 +0,0 @@ -# Compaction Research & Redesign - -## Current Pi Compaction Implementation - -### Settings (defaults) -- `reserveTokens: 16384` - Buffer to leave for new responses -- `keepRecentTokens: 20000` - How many tokens of recent messages to keep - -### Trigger Conditions -1. **Threshold**: After each turn, if `contextTokens > contextWindow - reserveTokens` -2. **Overflow**: If LLM returns context overflow error, compact and retry - -### Current Process -1. Find cut point by walking backwards until `keepRecentTokens` accumulated -2. Generate single summary of everything before cut point -3. If cutting mid-turn, also generate "turn prefix summary" -4. Save `CompactionEntry` with summary and `firstKeptEntryIndex` - -### Current Prompt -``` -You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. - -Include: -- Current progress and key decisions made -- Important context, constraints, or user preferences -- Absolute file paths of any relevant files that were read or modified -- What remains to be done (clear next steps) -- Any critical data, examples, or references needed to continue - -Be concise, structured, and focused on helping the next LLM seamlessly continue the work. 
-``` - -### maxTokens for Summarization -- History summary: `0.8 * reserveTokens` (≈13K tokens) -- Turn prefix summary: `0.5 * reserveTokens` (≈8K tokens) - ---- - -## Claude Code's Approach - -### Key Differences -- Much more structured, detailed prompt -- Uses `` tags for chain-of-thought before summary -- Uses `` tags for structured output -- 9-section format with explicit requirements -- Supports custom summarization instructions via user input - -### Full Prompt (reconstructed from cli.js) - -``` -Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions. -This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context. - -Before providing your final summary, wrap your analysis in tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process: - -1. Chronologically analyze each message and section of the conversation. For each section thoroughly identify: - - The user's explicit requests and intents - - Your approach to addressing the user's requests - - Key decisions, technical concepts and code patterns - - Specific details like: - - file names - - full code snippets - - function signatures - - file edits - - Errors that you ran into and how you fixed them - - Pay special attention to specific user feedback that you received, especially if the user told you to do something differently. -2. Double-check for technical accuracy and completeness, addressing each required element thoroughly. - -Your summary should include the following sections: - -1. Primary Request and Intent: Capture all of the user's explicit requests and intents in detail -2. Key Technical Concepts: List all important technical concepts, technologies, and frameworks discussed. -3. 
Files and Code Sections: Enumerate specific files and code sections examined, modified, or created. Pay special attention to the most recent messages and include full code snippets where applicable and include a summary of why this file read or edit is important. -4. Errors and fixes: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received, especially if the user told you to do something differently. -5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts. -6. All user messages: List ALL user messages that are not tool results. These are critical for understanding the users' feedback and changing intent. -7. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on. -8. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable. -9. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. Do not start on tangential requests or really old requests that were already completed without confirming with the user first. - If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. This should be verbatim to ensure there's no drift in task interpretation. - - - -[Your thought process, ensuring all points are covered thoroughly and accurately] - - - -1. Primary Request and Intent: - [Detailed description] - -2. 
Key Technical Concepts: - - [Concept 1] - - [Concept 2] - - [...] - -3. Files and Code Sections: - - [File Name 1] - - [Summary of why this file is important] - - [Summary of the changes made to this file, if any] - - [Important Code Snippet] - - [...] - -4. Errors and fixes: - - [Detailed description of error 1]: - - [How you fixed the error] - - [User feedback on the error if any] - - [...] - -5. Problem Solving: - [Description of solved problems and ongoing troubleshooting] - -6. All user messages: - - [Detailed non tool use user message] - - [...] - -7. Pending Tasks: - - [Task 1] - - [Task 2] - - [...] - -8. Current Work: - [Precise description of current work] - -9. Optional Next Step: - [Optional Next step to take] - - - - -There may be additional summarization instructions provided in the included context. If so, remember to follow these instructions when creating the above summary. Examples of instructions include: - -## Compact Instructions -When summarizing the conversation focus on typescript code changes and also remember the mistakes you made and how you fixed them. - -``` - -### Additional Features -- Supports custom instructions: `When you are using compact - please focus on test output and code changes. Include file reads verbatim.` -- Post-processes to extract `` and `` sections -- Has "microcompact" for tool results (abbreviated tool outputs) - ---- - -## OpenAI Codex's Approach - -### Compaction Prompt (`codex-rs/core/templates/compact/prompt.md`) -``` -You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. - -Include: -- Current progress and key decisions made -- Important context, constraints, or user preferences -- What remains to be done (clear next steps) -- Any critical data, examples, or references needed to continue - -Be concise, structured, and focused on helping the next LLM seamlessly continue the work. 
-``` - -### Summary Prefix (`codex-rs/core/templates/compact/summary_prefix.md`) -``` -Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis: -``` - -### Notes -- Very similar to our current prompt (likely we derived from same source) -- Supports custom `compact_prompt` override in config -- Has `experimental_compact_prompt_file` for loading from file - ---- - -## SST OpenCode's Approach - -### Compaction System Prompt (`session/prompt/compaction.txt`) -``` -You are a helpful AI assistant tasked with summarizing conversations. - -When asked to summarize, provide a detailed but concise summary of the conversation. -Focus on information that would be helpful for continuing the conversation, including: -- What was done -- What is currently being worked on -- Which files are being modified -- What needs to be done next -- Key user requests, constraints, or preferences that should persist -- Important technical decisions and why they were made - -Your summary should be comprehensive enough to provide context but concise enough to be quickly understood. -``` - -### User Message for Compaction -``` -Provide a detailed prompt for continuing our conversation above. Focus on information that would be helpful for continuing the conversation, including what we did, what we're doing, which files we're working on, and what we're going to do next considering new session will not have access to our conversation. -``` - -### Short Summary Prompt (`session/prompt/summarize.txt`) -``` -Summarize the following conversation into 2 sentences MAX explaining what the assistant did and why -Do not explain the user's input. 
-Do not speak in the third person about the assistant. -``` - -### Additional Features -- **Pruning**: Goes backwards through parts, after 40K tokens of tool calls, erases output of older tool calls -- **Prune thresholds**: `PRUNE_MINIMUM = 20_000`, `PRUNE_PROTECT = 40_000` -- Marks tool outputs as `compacted` with timestamp to avoid re-pruning - ---- - -## Factory Droid's Approach (from binary strings) - -### Scratchpad Feature -From extracted strings: -``` -Edit the session scratchpad using multiple operations in a single call. Operations can be str_replace, insert, or overwrite commands and are applied in order. The scratchpad is working memory that persists when conversation history is compacted or summarized. -``` - -### Summary Guidance -``` -Once you are done with the task, you can summarize the changes you made in a 1-4 sentences, don't go into too much detail. -``` - -### Compaction Model -Uses external model for summarization with configurable providers (Anthropic, OpenAI, generic chat completion API). - ---- - -## Proposed Slice-Based Compaction - -### Concept -Instead of summarizing the entire history in one call: -1. Segment session into slices (possibly overlapping) -2. Summarize each slice with budget = 1/10th of slice token count -3. 
Stitch slice summaries together into unified summary - -### Benefits -- More parallelizable (summarize slices concurrently) -- Less risk of losing detail in long sessions -- Better "compression ratio" control per slice -- Overlapping slices can preserve continuity/context -- Can prioritize recent slices with larger budgets - -### Proposed Algorithm - -```typescript -interface SliceConfig { - sliceTokens: number; // Target tokens per slice (e.g., 20K) - overlapTokens: number; // Overlap between slices (e.g., 2K) - compressionRatio: number; // Summary budget as fraction of slice (e.g., 0.1) - recentBoost: number; // Multiplier for most recent slice budget (e.g., 2.0) -} - -async function sliceBasedCompaction( - messages: Message[], - config: SliceConfig -): Promise { - // 1. Segment into slices - const slices = segmentIntoSlices(messages, config.sliceTokens, config.overlapTokens); - - // 2. Calculate budget per slice - const budgets = slices.map((slice, i) => { - const base = estimateTokens(slice) * config.compressionRatio; - // Boost recent slices - const isRecent = i >= slices.length - 2; - return Math.floor(isRecent ? base * config.recentBoost : base); - }); - - // 3. Summarize slices in parallel - const summaries = await Promise.all( - slices.map((slice, i) => summarizeSlice(slice, budgets[i], i, slices.length)) - ); - - // 4. Stitch summaries together - return stitchSummaries(summaries); -} -``` - -### Slice Summarization Prompt (per slice) - -``` -You are summarizing slice ${sliceIndex + 1} of ${totalSlices} from a coding session. - -${sliceIndex === 0 ? 'This is the BEGINNING of the session.' : ''} -${sliceIndex === totalSlices - 1 ? 'This is the MOST RECENT activity.' : ''} - -Summarize the key information in this slice: -- User requests and intent changes -- Files read, created, or modified (with paths) -- Key code changes or patterns -- Errors encountered and how they were resolved -- Decisions made and their rationale - -${sliceIndex === totalSlices - 1 ? 
` -For the most recent slice, also include: -- Current work in progress -- Exact state of any pending tasks -- Next steps that were planned -` : ''} - -Be precise and technical. Preserve file paths and important code snippets. -Budget: approximately ${budget} tokens. -``` - -### Stitching Prompt - -``` -You have ${summaries.length} chronological slice summaries from a coding session. -Combine them into a single coherent handoff summary for another LLM. - -Requirements: -- Preserve chronological flow -- Deduplicate information that appears in overlapping sections -- Emphasize the most recent work and next steps -- Keep all file paths and critical code snippets -- Total budget: ${totalBudget} tokens - -Slice summaries: -${summaries.map((s, i) => `--- Slice ${i + 1} ---\n${s}`).join('\n\n')} -``` - ---- - -## Comparison Table - -| Feature | Pi (Current) | Claude Code | OpenAI Codex | SST OpenCode | -|---------|--------------|-------------|--------------|--------------| -| Prompt detail | Basic | Very detailed | Basic | Medium | -| Structured output | No | Yes () | No | No | -| Chain-of-thought | No | Yes () | No | No | -| Custom instructions | Yes | Yes | Yes (config) | No | -| Tool output pruning | No | Yes (microcompact) | No | Yes | -| Parallel summarization | No | No | No | No | -| Scratchpad/persistent memory | No | No | No | No | - ---- - ---- - -## Test Harness - -A CLI test tool is available at [compaction-strategies.ts](./compaction-strategies.ts) to compare strategies: - -```bash -npx tsx docs/compaction-strategies.ts before-compaction -npx tsx docs/compaction-strategies.ts large-session -``` - -This outputs results to `compaction-results/[fixture]-[strategy].md` (in repo root) and a comparison file. - -### Implemented Strategies - -1. **single-shot**: Current approach, one LLM call with full transcript -2. **parallel-stitch**: Slice into chunks, summarize in parallel, LLM-merge results -3. 
**sequential-accumulated**: Slice into chunks, summarize each with all previous summaries as context -4. **sequential-rolling**: Slice into chunks, each call updates/rewrites the running summary - -### Example Results (30K token session, 4 slices) - -| Strategy | Input Tokens | Output Tokens | API Calls | Time (ms) | -|----------|-------------|---------------|-----------|-----------| -| single-shot | 35706 | 1284 | 1 | 31914 | -| parallel-stitch | 37850 | 3087 | 5 | 34010 | -| sequential-accumulated | 39136 | 2996 | 4 | 66907 | -| sequential-rolling | 38873 | 4557 | 4 | 98032 | - -Observations: -- **single-shot**: Fastest, simplest, but entire context in one call -- **parallel-stitch**: Similar wall-clock (parallel), needs extra stitch call -- **sequential-accumulated**: 2x time, but each slice knows full prior context -- **sequential-rolling**: Slowest, most output (rewrites summary each time) - ---- - -## Recommendations - -### Short Term -1. **Improve prompt**: Adopt Claude Code's structured format with sections -2. **Add pruning**: Implement tool output pruning like OpenCode (mark old outputs as compacted) -3. **Better token estimation**: Use actual tokenizer instead of chars/4 heuristic - -### Medium Term -1. **Slice-based compaction**: Implement parallel slice summarization -2. **Persistent scratchpad**: Add working memory that survives compaction -3. **Custom instructions**: Support user-provided compaction focus - -### Long Term -1. **Semantic chunking**: Use embeddings to find natural break points -2. **Importance scoring**: Weight messages by relevance to current task -3. 
**Incremental compaction**: Compact older portions while keeping recent detailed diff --git a/packages/coding-agent/docs/compaction-strategies.ts b/packages/coding-agent/docs/compaction-strategies.ts deleted file mode 100644 index d1ae7ba9..00000000 --- a/packages/coding-agent/docs/compaction-strategies.ts +++ /dev/null @@ -1,502 +0,0 @@ -/** - * CLI tool to test different compaction strategies on session fixtures. - * - * Usage: - * npx tsx test/compaction-strategies.ts [fixture-name] - * - * Examples: - * npx tsx test/compaction-strategies.ts large-session - * npx tsx test/compaction-strategies.ts before-compaction - * - * Output: - * test/compaction-results/[fixture]-[strategy].md - */ - -import * as fs from "fs"; -import * as path from "path"; -import { fileURLToPath } from "url"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -import { complete, getModel, type UserMessage } from "@mariozechner/pi-ai"; - -// ============================================================================ -// Types -// ============================================================================ - -interface SessionEntry { - type: string; - timestamp: string; - message?: { - role: string; - content: unknown; - stopReason?: string; - }; -} - -interface SimpleMessage { - role: "user" | "assistant"; - content: string; - tokens: number; // estimated -} - -interface SliceSummary { - sliceIndex: number; - summary: string; - tokens: number; -} - -interface StrategyResult { - name: string; - summary: string; - totalInputTokens: number; - totalOutputTokens: number; - numCalls: number; - timeMs: number; -} - -// ============================================================================ -// Config -// ============================================================================ - -const MODEL = getModel("anthropic", "claude-sonnet-4-5"); -const SLICE_TOKENS = 10000; // target tokens per slice (smaller for testing) -const SUMMARY_BUDGET = 
2000; // max tokens for each summary call -const FINAL_SUMMARY_BUDGET = 4000; // max tokens for final/stitched summary - -// ============================================================================ -// Utilities -// ============================================================================ - -function estimateTokens(text: string): number { - return Math.ceil(text.length / 4); -} - -function extractTextContent(content: unknown): string { - if (typeof content === "string") return content; - if (Array.isArray(content)) { - return content - .map((block) => { - if (typeof block === "string") return block; - if (block.type === "text") return block.text || ""; - if (block.type === "tool_use") - return `[Tool: ${block.name}]\n${JSON.stringify(block.arguments || block.input, null, 2)}`; - if (block.type === "tool_result") { - const text = typeof block.content === "string" ? block.content : JSON.stringify(block.content); - return `[Tool Result: ${block.tool_use_id}]\n${text.slice(0, 2000)}${text.length > 2000 ? "..." 
: ""}`; - } - if (block.type === "thinking") return `[Thinking]\n${block.thinking}`; - return ""; - }) - .filter(Boolean) - .join("\n"); - } - return JSON.stringify(content); -} - -function loadSession(fixturePath: string): SimpleMessage[] { - const content = fs.readFileSync(fixturePath, "utf-8"); - const lines = content.trim().split("\n"); - const messages: SimpleMessage[] = []; - - for (const line of lines) { - try { - const entry: SessionEntry = JSON.parse(line); - if (entry.type === "message" && entry.message) { - const role = entry.message.role; - if (role !== "user" && role !== "assistant") continue; - if (entry.message.stopReason === "aborted" || entry.message.stopReason === "error") continue; - - const text = extractTextContent(entry.message.content); - if (!text.trim()) continue; - - messages.push({ - role: role as "user" | "assistant", - content: text, - tokens: estimateTokens(text), - }); - } - } catch { - // skip malformed lines - } - } - - return messages; -} - -function segmentByTokens(messages: SimpleMessage[], sliceTokens: number): SimpleMessage[][] { - const slices: SimpleMessage[][] = []; - let current: SimpleMessage[] = []; - let currentTokens = 0; - - for (const msg of messages) { - if (currentTokens + msg.tokens > sliceTokens && current.length > 0) { - slices.push(current); - current = []; - currentTokens = 0; - } - current.push(msg); - currentTokens += msg.tokens; - } - - if (current.length > 0) { - slices.push(current); - } - - return slices; -} - -function messagesToTranscript(messages: SimpleMessage[]): string { - return messages - .map((m) => { - const prefix = m.role === "user" ? 
"USER:" : "ASSISTANT:"; - return `${prefix}\n${m.content}`; - }) - .join("\n\n---\n\n"); -} - -async function callLLM( - systemPrompt: string, - userPrompt: string, - maxTokens: number, -): Promise<{ text: string; inputTokens: number; outputTokens: number }> { - const apiKey = process.env.ANTHROPIC_API_KEY; - if (!apiKey) throw new Error("ANTHROPIC_API_KEY not set"); - - const messages: UserMessage[] = [ - { - role: "user", - content: userPrompt, - timestamp: Date.now(), - }, - ]; - - const result = await complete( - MODEL, - { - system: systemPrompt, - messages, - }, - { - maxTokens, - apiKey, - }, - ); - - const text = result.content - .filter((c): c is { type: "text"; text: string } => c.type === "text") - .map((c) => c.text) - .join("\n"); - - return { - text, - inputTokens: result.usage.input + result.usage.cacheRead, - outputTokens: result.usage.output, - }; -} - -// ============================================================================ -// Strategy 1: Single-shot (current approach) -// ============================================================================ - -const SINGLE_SHOT_SYSTEM = `You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. 
- -Include: -- Current progress and key decisions made -- Important context, constraints, or user preferences -- Absolute file paths of any relevant files that were read or modified -- What remains to be done (clear next steps) -- Any critical data, examples, or references needed to continue - -Be concise, structured, and focused on helping the next LLM seamlessly continue the work.`; - -async function strategySingleShot(messages: SimpleMessage[]): Promise { - const start = Date.now(); - const transcript = messagesToTranscript(messages); - - const { text, inputTokens, outputTokens } = await callLLM( - SINGLE_SHOT_SYSTEM, - `Here is the conversation to summarize:\n\n\n${transcript}\n\n\nProvide your summary now:`, - FINAL_SUMMARY_BUDGET, - ); - - return { - name: "single-shot", - summary: text, - totalInputTokens: inputTokens, - totalOutputTokens: outputTokens, - numCalls: 1, - timeMs: Date.now() - start, - }; -} - -// ============================================================================ -// Strategy 2: Parallel slices with LLM stitch -// ============================================================================ - -const SLICE_SYSTEM = `You are summarizing one segment of a longer coding session. -Be concise but capture key information: user requests, files modified, decisions made, errors fixed. -Preserve file paths and important code snippets.`; - -const STITCH_SYSTEM = `You are combining multiple chronological summaries of a coding session into one coherent handoff document. -Remove redundancy. Preserve all file paths and key details. 
Emphasize the most recent work (last segment).`; - -async function strategyParallelStitch(messages: SimpleMessage[]): Promise { - const start = Date.now(); - const slices = segmentByTokens(messages, SLICE_TOKENS); - let totalInput = 0; - let totalOutput = 0; - - console.log(` Parallel: ${slices.length} slices`); - - // Summarize all slices in parallel - const sliceSummaries = await Promise.all( - slices.map(async (slice, i) => { - const isLast = i === slices.length - 1; - const transcript = messagesToTranscript(slice); - const prompt = `Segment ${i + 1} of ${slices.length}${isLast ? " (MOST RECENT)" : ""}: - -${transcript} - -${isLast ? "This is the most recent activity. Be detailed about current state and next steps." : "Summarize the key points from this segment."}`; - - const { text, inputTokens, outputTokens } = await callLLM(SLICE_SYSTEM, prompt, SUMMARY_BUDGET); - totalInput += inputTokens; - totalOutput += outputTokens; - - return { sliceIndex: i, summary: text, tokens: estimateTokens(text) }; - }), - ); - - // Stitch summaries together - const stitchPrompt = sliceSummaries.map((s) => `=== Segment ${s.sliceIndex + 1} ===\n${s.summary}`).join("\n\n"); - - const { - text: finalSummary, - inputTokens, - outputTokens, - } = await callLLM( - STITCH_SYSTEM, - `Combine these ${sliceSummaries.length} chronological segment summaries into one unified handoff summary:\n\n${stitchPrompt}`, - FINAL_SUMMARY_BUDGET, - ); - totalInput += inputTokens; - totalOutput += outputTokens; - - return { - name: "parallel-stitch", - summary: finalSummary, - totalInputTokens: totalInput, - totalOutputTokens: totalOutput, - numCalls: slices.length + 1, - timeMs: Date.now() - start, - }; -} - -// ============================================================================ -// Strategy 3: Sequential slices with accumulated context -// ============================================================================ - -const SEQUENTIAL_SYSTEM = `You are summarizing one segment of a longer 
coding session. -You may be given summaries of earlier segments for context. -Create a summary of THIS segment's content. Do not repeat information from previous summaries. -Be concise but capture: user requests, files modified, decisions made, errors fixed.`; - -async function strategySequentialAccumulated(messages: SimpleMessage[]): Promise { - const start = Date.now(); - const slices = segmentByTokens(messages, SLICE_TOKENS); - let totalInput = 0; - let totalOutput = 0; - - console.log(` Sequential: ${slices.length} slices`); - - const sliceSummaries: SliceSummary[] = []; - - for (let i = 0; i < slices.length; i++) { - const slice = slices[i]; - const isLast = i === slices.length - 1; - const transcript = messagesToTranscript(slice); - - // Build context from previous summaries - const previousContext = - sliceSummaries.length > 0 - ? `Previous segments summary:\n${sliceSummaries.map((s) => `[Segment ${s.sliceIndex + 1}] ${s.summary}`).join("\n\n")}\n\n---\n\n` - : ""; - - const prompt = `${previousContext}Current segment (${i + 1} of ${slices.length})${isLast ? " - MOST RECENT" : ""}: - -${transcript} - -${isLast ? "This is the most recent activity. Be detailed about current state, pending work, and next steps." : "Summarize the key NEW information from this segment (don't repeat what's in previous summaries)."}`; - - const { text, inputTokens, outputTokens } = await callLLM( - SEQUENTIAL_SYSTEM, - prompt, - isLast ? 
FINAL_SUMMARY_BUDGET : SUMMARY_BUDGET, - ); - totalInput += inputTokens; - totalOutput += outputTokens; - - sliceSummaries.push({ - sliceIndex: i, - summary: text, - tokens: estimateTokens(text), - }); - - console.log(` Slice ${i + 1}/${slices.length} done`); - } - - // Combine all slice summaries into final output - const finalSummary = sliceSummaries.map((s) => `## Segment ${s.sliceIndex + 1}\n\n${s.summary}`).join("\n\n---\n\n"); - - return { - name: "sequential-accumulated", - summary: finalSummary, - totalInputTokens: totalInput, - totalOutputTokens: totalOutput, - numCalls: slices.length, - timeMs: Date.now() - start, - }; -} - -// ============================================================================ -// Strategy 4: Sequential with rolling summary -// ============================================================================ - -const ROLLING_SYSTEM = `You are creating a rolling summary of a coding session. -Given a previous summary and new conversation content, produce an UPDATED summary that incorporates the new information. -Keep the summary focused and under the token budget. Condense older details as needed to make room for recent work.`; - -async function strategySequentialRolling(messages: SimpleMessage[]): Promise { - const start = Date.now(); - const slices = segmentByTokens(messages, SLICE_TOKENS); - let totalInput = 0; - let totalOutput = 0; - - console.log(` Rolling: ${slices.length} slices`); - - let runningSummary = ""; - - for (let i = 0; i < slices.length; i++) { - const slice = slices[i]; - const isLast = i === slices.length - 1; - const transcript = messagesToTranscript(slice); - - const prompt = runningSummary - ? `Current summary so far:\n${runningSummary}\n\n---\n\nNew content (segment ${i + 1} of ${slices.length}):\n${transcript}\n\n${isLast ? "This is the final segment. Produce the complete handoff summary with emphasis on current state and next steps." : "Update the summary to incorporate this new content. 
Condense older details if needed."}` - : `First segment of the conversation:\n${transcript}\n\nCreate an initial summary capturing the key points.`; - - const { text, inputTokens, outputTokens } = await callLLM( - ROLLING_SYSTEM, - prompt, - isLast ? FINAL_SUMMARY_BUDGET : SUMMARY_BUDGET, - ); - totalInput += inputTokens; - totalOutput += outputTokens; - - runningSummary = text; - console.log(` Slice ${i + 1}/${slices.length} done`); - } - - return { - name: "sequential-rolling", - summary: runningSummary, - totalInputTokens: totalInput, - totalOutputTokens: totalOutput, - numCalls: slices.length, - timeMs: Date.now() - start, - }; -} - -// ============================================================================ -// Main -// ============================================================================ - -async function main() { - const fixtureName = process.argv[2] || "large-session"; - const fixturesDir = path.join(__dirname, "fixtures"); - const fixturePath = path.join(fixturesDir, `${fixtureName}.jsonl`); - - if (!fs.existsSync(fixturePath)) { - console.error(`Fixture not found: ${fixturePath}`); - console.error(`Available fixtures:`); - for (const f of fs.readdirSync(fixturesDir).filter((f) => f.endsWith(".jsonl"))) { - console.error(` - ${f.replace(".jsonl", "")}`); - } - process.exit(1); - } - - console.log(`Loading fixture: ${fixtureName}`); - const messages = loadSession(fixturePath); - const totalTokens = messages.reduce((sum, m) => sum + m.tokens, 0); - console.log(` ${messages.length} messages, ~${totalTokens} tokens\n`); - - const resultsDir = path.join(__dirname, "compaction-results"); - fs.mkdirSync(resultsDir, { recursive: true }); - - const strategies: Array<{ - name: string; - fn: (msgs: SimpleMessage[]) => Promise; - }> = [ - { name: "single-shot", fn: strategySingleShot }, - { name: "parallel-stitch", fn: strategyParallelStitch }, - { name: "sequential-accumulated", fn: strategySequentialAccumulated }, - { name: "sequential-rolling", fn: 
strategySequentialRolling }, - ]; - - const results: StrategyResult[] = []; - - for (const strategy of strategies) { - console.log(`Running strategy: ${strategy.name}`); - try { - const result = await strategy.fn(messages); - results.push(result); - - // Write individual result - const outputPath = path.join(resultsDir, `${fixtureName}-${strategy.name}.md`); - const output = `# Compaction Result: ${strategy.name} - -## Stats -- Input tokens: ${result.totalInputTokens} -- Output tokens: ${result.totalOutputTokens} -- API calls: ${result.numCalls} -- Time: ${result.timeMs}ms - -## Summary - -${result.summary} -`; - fs.writeFileSync(outputPath, output); - console.log(` ✓ Wrote ${outputPath}\n`); - } catch (err) { - console.error(` ✗ Failed: ${err}\n`); - } - } - - // Write comparison summary - const comparisonPath = path.join(resultsDir, `${fixtureName}-comparison.md`); - const comparison = `# Compaction Strategy Comparison: ${fixtureName} - -## Input -- Messages: ${messages.length} -- Estimated tokens: ${totalTokens} - -## Results - -| Strategy | Input Tokens | Output Tokens | API Calls | Time (ms) | -|----------|-------------|---------------|-----------|-----------| -${results.map((r) => `| ${r.name} | ${r.totalInputTokens} | ${r.totalOutputTokens} | ${r.numCalls} | ${r.timeMs} |`).join("\n")} - -## Summaries - -${results.map((r) => `### ${r.name}\n\n${r.summary}\n`).join("\n---\n\n")} -`; - fs.writeFileSync(comparisonPath, comparison); - console.log(`Wrote comparison: ${comparisonPath}`); -} - -main().catch((err) => { - console.error(err); - process.exit(1); -}); diff --git a/packages/coding-agent/docs/compaction.md b/packages/coding-agent/docs/compaction.md deleted file mode 100644 index 0bece293..00000000 --- a/packages/coding-agent/docs/compaction.md +++ /dev/null @@ -1,519 +0,0 @@ -# Context Compaction - -Research on how other coding assistants implement context compaction to manage long conversations. 
- -## Overview - -Context compaction (also called "handoff" or "summarization") is a technique to manage the context window in long coding sessions. When conversations grow too long, performance degrades and costs increase. Compaction summarizes the conversation history into a condensed form, allowing work to continue without hitting context limits. - -## Claude Code - -**Manual:** `/compact` command -**Auto:** Triggers at ~95% context capacity ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) - -### How it works - -1. Takes entire conversation history -2. Uses an LLM to generate a summary -3. Starts a new session with the summary as initial context -4. User can provide custom instructions with `/compact` (e.g., "summarize only the TODOs") ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) - -### Prompt (extracted from community) - -From [r/ClaudeAI](https://www.reddit.com/r/ClaudeAI/comments/1jr52qj/here_is_claude_codes_compact_prompt/): - -``` -Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions. 
This summary will be used as context when continuing the conversation, so preserve critical information including: -- What was accomplished -- Current work in progress -- Files involved -- Next steps -- Key user requests or constraints -``` - -### Key observations - -- Auto-compact triggers at ~95% capacity but users often recommend manual compaction earlier ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) -- Quality can degrade with multiple compactions (cumulative information loss) ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) -- Different from `/clear` which wipes history completely ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) -- Users report the model can "go off the rails" if auto-compact happens mid-task ([source](https://stevekinney.com/courses/ai-development/claude-code-compaction)) - -## OpenAI Codex CLI - -Source: [github.com/openai/codex](https://github.com/openai/codex) (codex-rs/core/src/compact.rs, codex-rs/core/templates/compact/) - -**Manual:** `/compact` slash command -**Auto:** Triggers when token usage exceeds `model_auto_compact_token_limit` - -### How it works - -1. Uses a dedicated summarization prompt -2. Sends entire history with the prompt appended -3. Collects the summary from the model response -4. Builds new history: initial context + recent user messages (up to 20k tokens) + summary -5. Replaces session history with the compacted version - -### Prompt - -From [codex-rs/core/templates/compact/prompt.md](https://github.com/openai/codex/blob/main/codex-rs/core/templates/compact/prompt.md): - -```markdown -You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. 
- -Include: -- Current progress and key decisions made -- Important context, constraints, or user preferences -- What remains to be done (clear next steps) -- Any critical data, examples, or references needed to continue - -Be concise, structured, and focused on helping the next LLM seamlessly continue the work. -``` - -### Summary prefix (prepended to summaries in new context) - -From [codex-rs/core/templates/compact/summary_prefix.md](https://github.com/openai/codex/blob/main/codex-rs/core/templates/compact/summary_prefix.md): - -```markdown -Another language model started to solve this problem and produced a summary of its thinking process. You also have access to the state of the tools that were used by that language model. Use this to build on the work that has already been done and avoid duplicating work. Here is the summary produced by the other language model, use the information in this summary to assist with your own analysis: -``` - -### Key observations - -- Uses token-based threshold (`model_auto_compact_token_limit`) rather than percentage ([config/mod.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/config/mod.rs)) -- Default thresholds vary by model (e.g., 180k for some models, 244k for others) ([config/mod.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/config/mod.rs)) -- Preserves recent user messages (last ~20k tokens worth) alongside summary ([compact.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/compact.rs)) -- Warns user: "Long conversations and multiple compactions can cause the model to be less accurate" ([compact.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/compact.rs)) -- Has retry logic with exponential backoff for failed compactions ([compact.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/compact.rs)) -- Uses "effective_context_window_percent" of 95% for safety margin 
([model_family.rs](https://github.com/openai/codex/blob/main/codex-rs/core/src/model_family.rs)) - -## OpenCode (sst/opencode) - -Source: [github.com/sst/opencode](https://github.com/sst/opencode) (packages/opencode/src/session/compaction.ts) - -**Manual:** `/compact` command -**Auto:** Triggers when `isOverflow()` returns true (based on token usage vs model limits) - -### How it works - -1. Checks if tokens exceed (context_limit - output_limit) ([compaction.ts](https://github.com/sst/opencode/blob/main/packages/opencode/src/session/compaction.ts)) -2. Creates a new assistant message marked as "summary" -3. Uses a compaction system prompt -4. Streams the summary generation -5. If auto-compaction, adds a "Continue if you have next steps" message - -### Prompt - -From [packages/opencode/src/session/prompt/compaction.txt](https://github.com/sst/opencode/blob/main/packages/opencode/src/session/prompt/compaction.txt): - -``` -You are a helpful AI assistant tasked with summarizing conversations. - -When asked to summarize, provide a detailed but concise summary of the conversation. -Focus on information that would be helpful for continuing the conversation, including: -- What was done -- What is currently being worked on -- Which files are being modified -- What needs to be done next -- Key user requests, constraints, or preferences that should persist -- Important technical decisions and why they were made - -Your summary should be comprehensive enough to provide context but concise enough to be quickly understood. -``` - -### Final user message - -From [compaction.ts](https://github.com/sst/opencode/blob/main/packages/opencode/src/session/compaction.ts): - -``` -Summarize our conversation above. This summary will be the only context available when the conversation continues, so preserve critical information including: what was accomplished, current work in progress, files involved, next steps, and any key user requests or constraints. 
Be concise but detailed enough that work can continue seamlessly. -``` - -### Key observations - -- Has a "prune" mechanism separate from compaction ([compaction.ts](https://github.com/sst/opencode/blob/main/packages/opencode/src/session/compaction.ts)): - - Scans backward through tool calls - - Protects last 40k tokens of tool output (PRUNE_PROTECT constant) - - Prunes tool outputs beyond that threshold if >20k tokens prunable (PRUNE_MINIMUM constant) -- Disables auto-compaction via `OPENCODE_DISABLE_AUTOCOMPACT` env var ([flag.ts](https://github.com/sst/opencode/blob/main/packages/opencode/src/flag/flag.ts)) -- Separate summarization for UI display (2 sentences max) vs. compaction (detailed) ([summary.ts](https://github.com/sst/opencode/blob/main/packages/opencode/src/session/summary.ts)) - -## Amp (Sourcegraph) - -Source: [ampcode.com/guides/context-management](https://ampcode.com/guides/context-management) - -**Manual:** "Handoff" feature -**Auto:** None (manual context management encouraged) - -### How it works - -Amp takes a different approach, providing tools for manual context management rather than automatic compaction: - -1. **Handoff**: Specify a goal for the next task, Amp analyzes the current thread and extracts relevant information into a new message for a fresh thread -2. **Fork**: Duplicate context window at a specific point -3. **Edit/Restore**: Edit or restore to previous messages -4. 
**Thread References**: Reference other threads to extract information on-demand - -### Key observations - -- Philosophy: "For best results, keep conversations short & focused" ([source](https://ampcode.com/guides/context-management)) -- Emphasizes that everything in context affects output quality: "everything in the context window has an influence on the output" ([source](https://ampcode.com/guides/context-management)) -- Uses a secondary model to extract relevant information during handoff ([source](https://ampcode.com/guides/context-management)) -- Thread references allow selective extraction without full context inclusion ([source](https://ampcode.com/guides/context-management)) -- No automatic compaction; relies on user discipline and tooling - -## Implementation Recommendations for pi-coding-agent - -### `/compact` Command - -```typescript -// User triggers: /compact [optional custom instructions] -// 1. Generate summary using current conversation -// 2. Create new session with summary as initial context -// 3. Optionally continue with queued user message -``` - -### Auto-compaction - -```typescript -// Threshold-based (e.g., 85-90% of context limit) -// Check after each turn: -if (tokenUsage / contextLimit > 0.85) { - await compact({ auto: true }); -} -``` - -### Compaction Prompt - -Based on research, a good compaction prompt should include: - -```markdown -Create a detailed summary for continuing this coding session. Include: - -1. **Completed work**: What tasks were finished -2. **Current state**: Files modified, their current status -3. **In progress**: What is being worked on now -4. **Next steps**: Clear actions to take -5. **Constraints**: User preferences, project requirements, key decisions made -6. **Critical context**: Any information essential for continuing - -Be concise but preserve enough detail that work can continue seamlessly. -``` - -### Key Design Decisions - -1. 
**Threshold**: 85-90% recommended (95% is often too late, per Claude Code user feedback) -2. **Pruning**: Consider pruning old tool outputs before full compaction (OpenCode approach) -3. **Warning**: Notify users that compaction happened and quality may degrade (Codex approach) -4. **Disable option**: Allow users to disable auto-compaction via flag/env (OpenCode approach) -5. **Custom instructions**: Support `/compact [instructions]` for targeted summaries (Claude Code approach) -6. **Session continuity**: New session should feel seamless (summary as hidden context) - -### Existing Infrastructure - -The coding-agent already has: -- `/clear` command that resets the session -- Session management with message history -- Token counting per turn - -For compaction, we need to: -1. Add `/compact` command handler (similar to `/clear` but with summary) -2. Add token threshold checking after each assistant turn -3. Create a summarization prompt -4. Wire it to create a new session with the summary - ---- - -## Our Implementation Plan - -### Commands - -- **`/compact [custom instructions]`** - Manual compaction trigger. Optional custom instructions let users guide what to focus on in the summary. -- **`/autocompact`** - Opens selector UI to toggle auto-compaction on/off. Also displays current power-user settings (reserveTokens, keepRecentTokens). - -### Configuration - -Settings stored in `~/.pi/agent/settings.json`: - -```typescript -interface Settings { - // ... 
existing fields - compaction?: { - enabled?: boolean // default: true, toggled via /autocompact - reserveTokens?: number // default: 16384, power-user setting - keepRecentTokens?: number // default: 20000, power-user setting - } -} -``` - -**Why these defaults:** -- `reserveTokens: 16384` - Room for summary output (~13k) plus safety margin (~3k) -- `keepRecentTokens: 20000` - Preserves recent context verbatim, summary focuses on older content - -### Token Calculation - -Context tokens are calculated from the **last non-aborted assistant message** using the same formula as the footer: - -``` -contextTokens = usage.input + usage.output + usage.cacheRead + usage.cacheWrite -``` - -This gives total context size across all providers. The `input` field represents non-cached input tokens, so adding `cacheRead` and `cacheWrite` gives the true total input. - -**Trigger condition:** -```typescript -if (contextTokens > model.contextWindow - settings.compaction.reserveTokens) { - await compact({ auto: true }); -} -``` - -### Turn Boundaries - -Messages follow patterns like: `user, assistant, toolResult, toolResult, user, assistant, ...` - -**Critical rule:** Never cut mid-turn. A turn = user message → assistant responses + tool results until next user message. Always cut before a user message to keep assistant + toolResult pairs intact (providers fail if toolResult is orphaned from its assistant message with the toolCall). - -### Summary Injection - -The summary is injected as a **user message** with a prefix (similar to Codex approach). This makes it visible to the user and clearly frames it for the model. - -Prefix: -``` -Another language model worked on this task and produced a summary. 
Use this to continue the work without duplicating effort: -``` - -### Session File Format - -Compaction events are **appended** to the session file (never inserted mid-file): - -```typescript -interface CompactionEvent { - type: "compaction" - timestamp: string - summary: string // The summary text - keepLastMessages: number // How many messages before this event to keep - tokensBefore: number // Context size before compaction -} -``` - -Example session file after compaction: -``` -{"type": "message", "message": {"role": "user", ...}} -{"type": "message", "message": {"role": "assistant", ...}} -{"type": "message", "message": {"role": "toolResult", ...}} -... more messages ... -{"type": "compaction", "summary": "...", "keepLastMessages": 4, ...} -{"type": "message", "message": {"role": "user", ...}} <- new messages after compaction -``` - -**Session loader behavior:** -1. Find the latest compaction event -2. Take last `keepLastMessages` messages *before* the compaction event -3. Build context: `[summary_as_user_msg, ...kept_messages, ...messages_after_compaction]` - -**Multiple compactions:** When doing a second compaction, don't cross the first compaction boundary. The new summary incorporates the previous summary (since current context already includes it). - -#### Example: Single Compaction - -Session file with messages (u=user, a=assistant, t=toolResult): -``` -u1, a1, t1, t1, a1, u2, a2, u3, a3, t3, a3, t3, a3, u4, a4, t4, a4 -``` - -Compaction triggers, keeping last 4 messages. 
The compaction event is appended: -``` -u1, a1, t1, t1, a1, u2, a2, u3, a3, t3, a3, t3, a3, u4, a4, t4, a4 -[COMPACTION: summary="...", keepLastMessages=4] -``` - -Session loader builds context: -``` -[summary_as_user_msg], u4, a4, t4, a4 -``` - -New messages after compaction are appended: -``` -u1, a1, t1, t1, a1, u2, a2, u3, a3, t3, a3, t3, a3, u4, a4, t4, a4 -[COMPACTION: summary="...", keepLastMessages=4] -u5, a5 -``` - -Session loader now builds: -``` -[summary_as_user_msg], u4, a4, t4, a4, u5, a5 -``` - -#### Example: Multiple Compactions - -After more messages, second compaction triggers: -``` -u1, a1, t1, t1, a1, u2, a2, u3, a3, t3, a3, t3, a3, u4, a4, t4, a4 -[COMPACTION 1: summary="...", keepLastMessages=4] -u5, a5, u6, a6, t6, a6, u7, a7 -[COMPACTION 2: summary="...", keepLastMessages=6] -``` - -Session loader finds COMPACTION 2 (latest), builds: -``` -[summary2_as_user_msg], u6, a6, t6, a6, u7, a7 -``` - -Note: COMPACTION 2's summary incorporates COMPACTION 1's summary because the summarization model received the full current context (which included summary1 as first message). - -**Boundary rule:** When calculating `keepLastMessages` for COMPACTION 2, we only count messages between COMPACTION 1 and COMPACTION 2. If `keepLastMessages` exceeds the available messages (e.g., keepLastMessages=10 but only 6 messages exist after COMPACTION 1), we take all available messages up to the boundary. We never cross a compaction boundary. - -### Summarization - -Use **pi-ai directly** (not the full agent loop) for summarization: -- No tools needed -- Set `maxTokens` to `0.8 * reserveTokens` (leaves 20% for prompt overhead and safety margin) -- Pass abort signal for cancellation -- Use the currently selected model -- **Reasoning disabled** (thinking level "off") since we just need a summary, not extended reasoning - -With default `reserveTokens: 16384`, maxTokens = ~13107.
- -**Prompt** (based on Codex, enhanced): -```markdown -You are performing a CONTEXT CHECKPOINT COMPACTION. Create a handoff summary for another LLM that will resume the task. - -Include: -- Current progress and key decisions made -- Important context, constraints, or user preferences -- Absolute file paths of any relevant files that were read or modified -- What remains to be done (clear next steps) -- Any critical data, examples, or references needed to continue - -Be concise, structured, and focused on helping the next LLM seamlessly continue the work. -``` - -### Error Handling - -- On compaction failure: output error, let user decide what to do -- In JSON/RPC mode: emit `{"type": "error", "error": "message"}` (existing pattern) -- Compaction is abortable via the same abort signal as regular streaming - -### Image Handling - -Two cases: -1. **Images via file path in prompt** → Model reads with tool → Can be captured in summary as "image at /path/to/file.png was analyzed". Prompt instructs model to include absolute file paths. -2. **Images via @attachment** → Attached to user message directly → Lost in compaction (can't summarize an image). Known limitation. - -### Modes - -Works in all modes: -- **TUI**: Commands available, UI shows compaction happening -- **Print/JSON**: Compaction events emitted as output -- **RPC**: Compaction events sent to client - -### Interaction with /branch - -The `/branch` command lets users create a new session from a previous user message. With compaction: - -- **Branch UI reads from session file directly** (not from `state.messages`) to show ALL user messages, including those before compaction events -- **Branching copies the raw session file** line-by-line up to (but excluding) the selected user message, preserving all compaction events and intermediate entries - -#### Why read from session file instead of state.messages - -After compaction, `state.messages` only contains `[summary_user_msg, ...kept_messages, ...new_messages]`. 
The pre-compaction messages are not in state. To allow branching to any historical point, we must read the session file directly. - -#### Reworked createBranchedSession - -Current implementation iterates `state.messages` and writes fresh entries. New implementation: -1. Read session file line by line -2. For each line, check if it's the target user message -3. Copy all lines up to (but excluding) the target user message -4. The target user message text goes into the editor - -#### Example: Branching After Compaction - -Session file: -``` -u1, a1, u2, a2 -[COMPACTION: summary="...", keepLastMessages=2] -u3, a3, u4, a4 -``` - -User branches at u3. New session file: -``` -u1, a1, u2, a2 -[COMPACTION: summary="...", keepLastMessages=2] -``` - -Session loader builds context for new session: -``` -[summary_as_user_msg], u2, a2 -``` - -User's editor contains u3's text for editing/resubmission. - -#### Example: Branching Before Compaction - -Same session file, user branches at u2. New session file: -``` -u1, a1 -``` - -No compaction in new session. Session loader builds: -``` -u1, a1 -``` - -This effectively "undoes" the compaction, letting users recover if important context was lost. - -### Auto-Compaction Trigger - -Auto-compaction is checked in the agent subscription callback after each `message_end` event for assistant messages. If context tokens exceed the threshold, compaction runs. - -**Why abort mid-turn:** If auto-compaction triggers after an assistant message that contains tool calls, we abort immediately rather than waiting for tool results. Waiting would risk: -1. Tool results filling remaining context, leaving no room for the summary -2. Context overflow before the next check point (agent_end) - -The abort causes some work loss, but the summary captures progress up to that point. - -**Trigger flow (similar to `/clear` command):** - -```typescript -async handleAutoCompaction(): Promise<void> { - // 1.
Unsubscribe to stop processing events (no more messages added to state/session) - this.unsubscribe?.(); - - // 2. Abort current agent run and wait for completion - this.agent.abort(); - await this.agent.waitForIdle(); - - // 3. Stop loading animation - if (this.loadingAnimation) { - this.loadingAnimation.stop(); - this.loadingAnimation = null; - } - this.statusContainer.clear(); - - // 4. Perform compaction on current state: - // - Generate summary using pi-ai directly (no tools, reasoning off) - // - Write compaction event to session file - // - Rebuild agent messages (summary as user msg + kept messages) - // - Rebuild UI to reflect new state - - // 5. Resubscribe to agent - this.subscribeToAgent(); - - // 6. Show compaction notification to user -} -``` - -This mirrors the `/clear` command pattern: unsubscribe first to prevent processing abort events, then abort and wait, then do the work, then resubscribe. - -### Implementation Steps - -1. Add `compaction` field to `Settings` interface and `SettingsManager` -2. Add `CompactionEvent` type to session manager -3. Update session loader to handle compaction events (find latest, apply keepLastMessages with boundary rule) -4. Rework `createBranchedSession` to copy raw session file lines instead of re-serializing from state -5. Update `/branch` UI to read user messages from session file directly -6. Add `/compact` command handler -7. Add `/autocompact` command with selector UI -8. Add auto-compaction check in subscription callback after assistant `message_end` -9. Implement `handleAutoCompaction()` following the unsubscribe/abort/wait/compact/resubscribe pattern -10. Implement summarization function using pi-ai (no tools, reasoning off) -11. Add compaction event to RPC/JSON output types -12. 
Update footer to show when auto-compact is disabled diff --git a/packages/coding-agent/docs/custom-tools.md b/packages/coding-agent/docs/custom-tools.md index 777ee25e..6ea2f98d 100644 --- a/packages/coding-agent/docs/custom-tools.md +++ b/packages/coding-agent/docs/custom-tools.md @@ -1,6 +1,24 @@ # Custom Tools -Custom tools extend pi with new capabilities beyond the built-in read/write/edit/bash tools. They are TypeScript modules that define one or more tools with optional custom rendering for the TUI. +Custom tools are additional tools that the LLM can call directly, just like the built-in `read`, `write`, `edit`, and `bash` tools. They are TypeScript modules that define callable functions with parameters, return values, and optional TUI rendering. + +**Example use cases:** +- Ask the user questions with selectable options +- Maintain state across calls (todo lists, connection pools) +- Custom TUI rendering (progress indicators, structured output) +- Integrate external services with proper error handling +- Tools that need user confirmation before proceeding + +**When to use custom tools vs. alternatives:** + +| Need | Solution | +|------|----------| +| Always-needed context (conventions, commands) | AGENTS.md | +| User triggers a specific prompt template | Slash command | +| On-demand capability package (workflows, scripts, setup) | Skill | +| Additional tool directly callable by the LLM | **Custom tool** | + +See [examples/custom-tools/](../examples/custom-tools/) for working examples. 
## Quick Start diff --git a/packages/coding-agent/docs/gemini.md b/packages/coding-agent/docs/gemini.md deleted file mode 100644 index 7d04d0f9..00000000 --- a/packages/coding-agent/docs/gemini.md +++ /dev/null @@ -1,255 +0,0 @@ -# Gemini OAuth Integration Guide - -This document provides a comprehensive analysis of how OAuth authentication could be implemented for Google Gemini in the pi coding-agent, based on the existing Anthropic OAuth implementation and the Gemini CLI's approach. - -## Table of Contents - -1. [Current Anthropic OAuth Implementation](#current-anthropic-oauth-implementation) -2. [Gemini CLI Authentication Analysis](#gemini-cli-authentication-analysis) -3. [Gemini API Capabilities](#gemini-api-capabilities) -4. [Gemini API Endpoints](#gemini-api-endpoints) -5. [Implementation Plan](#implementation-plan) - -## Current Anthropic OAuth Implementation - -The pi coding-agent implements OAuth for Anthropic with the following architecture: - -### Key Components - -1. **OAuth Flow** (`packages/coding-agent/src/core/oauth/anthropic.ts`): - - Uses PKCE (Proof Key for Code Exchange) flow for security - - Client ID: `9d1c250a-e61b-44d9-88ed-5944d1962f5e` - - Authorization URL: `https://claude.ai/oauth/authorize` - - Token URL: `https://console.anthropic.com/v1/oauth/token` - - Scopes: `org:create_api_key user:profile user:inference` - -2. **Token Storage** (`packages/coding-agent/src/core/oauth/storage.ts`): - - Stores credentials in `~/.pi/agent/oauth.json` - - File permissions set to 0600 (owner read/write only) - - Format: `{ provider: { type: "oauth", refresh: string, access: string, expires: number } }` - -3. **Token Management** (`packages/coding-agent/src/core/oauth/index.ts`): - - Auto-refresh tokens when expired (with 5-minute buffer) - - Supports multiple providers through `SupportedOAuthProvider` type - - Provider info includes id, name, and availability status - -4. 
**Model Integration** (`packages/coding-agent/src/core/model-config.ts`): - - Checks OAuth tokens first, then environment variables - - OAuth status cached to avoid repeated file reads - - Maps providers to OAuth providers via `providerToOAuthProvider` - -### Authentication Flow - -1. User initiates login with `pi auth login` -2. Authorization URL is generated with PKCE challenge -3. User opens URL in browser and authorizes -4. User copies authorization code (format: `code#state`) -5. Code is exchanged for access/refresh tokens -6. Tokens are saved encrypted with expiry time - -## Gemini CLI Authentication Analysis - -The Gemini CLI uses a more complex OAuth implementation with several key differences: - -### Authentication Methods - -Gemini supports multiple authentication types: -- `LOGIN_WITH_GOOGLE` (OAuth personal account) -- `USE_GEMINI` (API key) -- `USE_VERTEX_AI` (Vertex AI) -- `COMPUTE_ADC` (Application Default Credentials) - -### OAuth Implementation Details - -1. **OAuth Configuration**: - - Client ID and Secret: See [google-gemini/gemini-cli oauth2.ts](https://github.com/google-gemini/gemini-cli/blob/main/packages/core/src/code_assist/oauth2.ts) (public for installed apps per Google's OAuth docs) - - Scopes: - - `https://www.googleapis.com/auth/cloud-platform` - - `https://www.googleapis.com/auth/userinfo.email` - - `https://www.googleapis.com/auth/userinfo.profile` - -2. **Authentication Flows**: - - **Web Flow**: Opens browser, runs local HTTP server for callback - - **User Code Flow**: For environments without browser (NO_BROWSER=true) - - Uses Google's `google-auth-library` for OAuth handling - -3. **Token Storage**: - - Supports encrypted storage via `OAuthCredentialStorage` - - Falls back to plain JSON storage - - Stores user info (email) separately - -4. **API Integration**: - - Uses `CodeAssistServer` for API calls - - Endpoint: `https://cloudcode-pa.googleapis.com` - - Includes user tier information (FREE, STANDARD, etc.) 
- -## Gemini API Capabilities - -Based on the Gemini CLI analysis: - -### System Prompts -✅ **Yes, Gemini supports system prompts** -- Implemented via `getCoreSystemPrompt()` in the codebase -- System instructions are part of the `GenerateContentParameters` - -### Tools/Function Calling -✅ **Yes, Gemini supports tools and function calling** -- Uses the `Tool` type from `@google/genai` -- Extensive tool support including: - - File system operations (read, write, edit) - - Web search and fetch - - MCP (Model Context Protocol) tools - - Custom tool registration - -### Content Generation -- Supports streaming and non-streaming generation -- Token counting capabilities -- Embedding support -- Context compression for long conversations - -## Gemini API Endpoints - -When using OAuth tokens, the Gemini CLI talks to: - -### Primary Endpoint -- **Base URL**: `https://cloudcode-pa.googleapis.com` -- **API Version**: `v1internal` - -### Key Methods -- `generateContent` - Non-streaming content generation -- `streamGenerateContent` - Streaming content generation -- `countTokens` - Token counting -- `embedContent` - Text embeddings -- `loadCodeAssist` - User setup and tier information -- `onboardUser` - User onboarding - -### Authentication -- OAuth tokens are passed via `AuthClient` from `google-auth-library` -- Tokens are automatically refreshed by the library -- Project ID and session ID included in requests - -## Implementation Plan - -### 1. 
Add Gemini OAuth Provider Support - -**File**: `packages/coding-agent/src/core/oauth/gemini.ts` - -```typescript -import { OAuth2Client } from 'google-auth-library'; -import { type OAuthCredentials, saveOAuthCredentials } from "./storage.js"; - -// OAuth credentials from google-gemini/gemini-cli: -// https://github.com/google-gemini/gemini-cli/blob/main/packages/core/src/code_assist/oauth2.ts -const SCOPES = [ - "https://www.googleapis.com/auth/cloud-platform", - "https://www.googleapis.com/auth/userinfo.email", - "https://www.googleapis.com/auth/userinfo.profile" -]; - -export async function loginGemini( - onAuthUrl: (url: string) => void, - onPromptCode: () => Promise<string>, -): Promise<void> { - // Implementation similar to Anthropic but using google-auth-library -} - -export async function refreshGeminiToken(refreshToken: string): Promise<OAuthCredentials> { - // Use google-auth-library for refresh -} -``` - -### 2. Update OAuth Index - -**File**: `packages/coding-agent/src/core/oauth/index.ts` - -```typescript -export type SupportedOAuthProvider = "anthropic" | "github-copilot" | "gemini"; - -// Add Gemini to provider list -{ - id: "gemini", - name: "Google Gemini (Code Assist)", - available: true, -} - -// Add cases for Gemini in login/refresh functions -``` - -### 3. Create Gemini API Client - -**File**: `packages/ai/src/providers/gemini-oauth.ts` - -```typescript -export class GeminiOAuthProvider implements Provider { - // Implement Provider interface - // Use CodeAssistServer approach from Gemini CLI - // Map to standard pi-ai API format -} -``` - -### 4. Update Model Configuration - -**File**: `packages/coding-agent/src/core/model-config.ts` - -```typescript -// Add to providerToOAuthProvider mapping -gemini: "gemini", - -// Add Gemini OAuth token check -if (model.provider === "gemini") { - const oauthToken = await getOAuthToken("gemini"); - if (oauthToken) return oauthToken; - const oauthEnv = process.env.GEMINI_OAUTH_TOKEN; - if (oauthEnv) return oauthEnv; -} -``` - -### 5.
Dependencies - -Add to `package.json`: -```json -{ - "dependencies": { - "google-auth-library": "^9.0.0" - } -} -``` - -### 6. Environment Variables - -Support these environment variables: -- `GEMINI_OAUTH_TOKEN` - Manual OAuth token -- `GOOGLE_CLOUD_PROJECT` - For project-specific features -- `NO_BROWSER` - Force user code flow - -### Key Differences from Anthropic Implementation - -1. **Authentication Library**: Use `google-auth-library` instead of manual OAuth -2. **Multiple Auth Types**: Support OAuth, API key, and ADC -3. **User Info**: Fetch and cache user email/profile -4. **Project Context**: Include project ID in API calls -5. **Tier Management**: Handle user tier (FREE/STANDARD) responses - -### Challenges and Considerations - -1. **API Access**: The Code Assist API (`cloudcode-pa.googleapis.com`) might require special access or be in preview -2. **Model Naming**: Need to map Gemini model names to Code Assist equivalents -3. **Rate Limits**: Handle tier-based rate limits -4. **Error Handling**: Map Google-specific errors to pi error types -5. **Token Scopes**: Ensure scopes are sufficient for all operations - -### Testing Plan - -1. Test OAuth flow (browser and NO_BROWSER modes) -2. Test token refresh -3. Test API calls with OAuth tokens -4. Test fallback to API keys -5. Test error scenarios (invalid tokens, network errors) -6. Test model switching and tier limits - -### Migration Path - -1. Users with `GEMINI_API_KEY` continue working unchanged -2. New `pi auth login gemini` command for OAuth -3. OAuth takes precedence over API keys when available -4. 
Clear messaging about benefits (higher limits, better features) \ No newline at end of file diff --git a/packages/coding-agent/docs/hooks.md b/packages/coding-agent/docs/hooks.md index 768f5f15..6fd76869 100644 --- a/packages/coding-agent/docs/hooks.md +++ b/packages/coding-agent/docs/hooks.md @@ -44,6 +44,20 @@ You can also add explicit hook paths in `~/.pi/agent/settings.json`: - `hooks`: Additional hook file paths (supports `~` expansion) - `hookTimeout`: Timeout in milliseconds for hook operations (default: 30000). Does not apply to `tool_call` events, which have no timeout since they may prompt the user. +## Available Imports + +Hooks can import from these packages (automatically resolved by pi): + +| Package | Purpose | +|---------|---------| +| `@mariozechner/pi-coding-agent/hooks` | Hook types (`HookAPI`, etc.) | +| `@mariozechner/pi-coding-agent` | Additional types if needed | +| `@mariozechner/pi-ai` | AI utilities (`ToolResultMessage`, etc.) | +| `@mariozechner/pi-tui` | TUI components (for advanced use cases) | +| `@sinclair/typebox` | Schema definitions | + +Node.js built-in modules (`node:fs`, `node:path`, etc.) are also available. + ## Writing a Hook A hook is a TypeScript file that exports a default function. The function receives a `HookAPI` object used to subscribe to events. diff --git a/packages/coding-agent/docs/rpc.md b/packages/coding-agent/docs/rpc.md index 2c7b7af3..cbc1ca5f 100644 --- a/packages/coding-agent/docs/rpc.md +++ b/packages/coding-agent/docs/rpc.md @@ -559,6 +559,7 @@ Events are streamed to stdout as JSON lines during agent operation. Events do NO | `auto_compaction_end` | Auto-compaction completes | | `auto_retry_start` | Auto-retry begins (after transient error) | | `auto_retry_end` | Auto-retry completes (success or final failure) | +| `hook_error` | Hook threw an error | ### agent_start @@ -744,6 +745,19 @@ On final failure (max retries exceeded): } ``` +### hook_error + +Emitted when a hook throws an error. 
+ +```json +{ + "type": "hook_error", + "hookPath": "/path/to/hook.ts", + "event": "tool_call", + "error": "Error message..." +} +``` + ## Error Handling Failed commands return a response with `success: false`: diff --git a/packages/coding-agent/docs/truncation.md b/packages/coding-agent/docs/truncation.md deleted file mode 100644 index 7651ca54..00000000 --- a/packages/coding-agent/docs/truncation.md +++ /dev/null @@ -1,235 +0,0 @@ -# Tool Output Truncation - -## Limits - -- **Line limit**: 2000 lines -- **Byte limit**: 30KB -- **Grep line limit**: 500 chars per match line - -Whichever limit is hit first wins. **Never return partial lines** (except bash edge case). - ---- - -## read - -Head truncation (first N lines). Has offset/limit params for continuation. - -### Scenarios - -**First line > 30KB:** -``` -LLM sees: -[Line 1 is 50KB, exceeds 30KB limit. Use bash to read: head -c 30000 path/to/file] - -Details: -{ truncation: { truncated: true, truncatedBy: "bytes", outputLines: 0, ... } } -``` - -**Hit line limit (2000 lines, < 30KB):** -``` -LLM sees: -[lines 1-2000 content] - -[Showing lines 1-2000 of 5000. Use offset=2001 to continue] - -Details: -{ truncation: { truncated: true, truncatedBy: "lines", outputLines: 2000, totalLines: 5000 } } -``` - -**Hit byte limit (< 2000 lines, 30KB):** -``` -LLM sees: -[lines 1-500 content] - -[Showing lines 1-500 of 5000 (30KB limit). Use offset=501 to continue] - -Details: -{ truncation: { truncated: true, truncatedBy: "bytes", outputLines: 500, totalLines: 5000 } } -``` - -**With offset, hit line limit (e.g., offset=1000):** -``` -LLM sees: -[lines 1000-2999 content] - -[Showing lines 1000-2999 of 5000. Use offset=3000 to continue] - -Details: -{ truncation: { truncatedBy: "lines", ... } } -``` - -**With offset, hit byte limit (e.g., offset=1000, 30KB after 500 lines):** -``` -LLM sees: -[lines 1000-1499 content] - -[Showing lines 1000-1499 of 5000 (30KB limit). 
Use offset=1500 to continue] - -Details: -{ truncation: { truncatedBy: "bytes", outputLines: 500, ... } } -``` - -**With offset, first line at offset > 30KB (e.g., offset=1000, line 1000 is 50KB):** -``` -LLM sees: -[Line 1000 is 50KB, exceeds 30KB limit. Use bash: sed -n '1000p' file | head -c 30000] - -Details: -{ truncation: { truncated: true, truncatedBy: "bytes", outputLines: 0 } } -``` - ---- - -## bash - -Tail truncation (last N lines). Writes full output to temp file if truncated. - -### Scenarios - -**Hit line limit (2000 lines):** -``` -LLM sees: -[lines 48001-50000 content] - -[Showing lines 48001-50000 of 50000. Full output: /tmp/pi-bash-xxx.log] - -Details: -{ truncation: { truncated: true, truncatedBy: "lines", outputLines: 2000, totalLines: 50000 }, fullOutputPath: "/tmp/..." } -``` - -**Hit byte limit (< 2000 lines, 30KB):** -``` -LLM sees: -[lines 49501-50000 content] - -[Showing lines 49501-50000 of 50000 (30KB limit). Full output: /tmp/pi-bash-xxx.log] - -Details: -{ truncation: { truncatedBy: "bytes", ... }, fullOutputPath: "/tmp/..." } -``` - -**Last line alone > 30KB (edge case, partial OK here):** -``` -LLM sees: -[last 30KB of final line] - -[Showing last 30KB of line 50000 (line is 100KB). Full output: /tmp/pi-bash-xxx.log] - -Details: -{ truncation: { truncatedBy: "bytes", lastLinePartial: true }, fullOutputPath: "/tmp/..." } -``` - ---- - -## grep - -Head truncation. Primary limit: 100 matches. Each match line truncated to 500 chars. - -### Scenarios - -**Hit match limit (100 matches):** -``` -LLM sees: -file.ts:10: matching content here... -file.ts:25: another match... -... - -[100 matches limit reached. Use limit=200 for more, or refine pattern] - -Details: -{ matchLimitReached: 100 } -``` - -**Hit byte limit (< 100 matches, 30KB):** -``` -LLM sees: -[matches that fit in 30KB] - -[30KB limit reached (50 of 100+ matches shown)] - -Details: -{ truncation: { truncatedBy: "bytes", ... 
} } -``` - -**Match lines truncated (any line > 500 chars):** -``` -LLM sees: -file.ts:10: very long matching content that exceeds 500 chars gets cut off here... [truncated] -file.ts:25: normal match - -[Some lines truncated to 500 chars. Use read tool to see full lines] - -Details: -{ linesTruncated: true } -``` - ---- - -## find - -Head truncation. Primary limit: 1000 results. File paths only (never > 30KB each). - -### Scenarios - -**Hit result limit (1000 results):** -``` -LLM sees: -src/file1.ts -src/file2.ts -[998 more paths] - -[1000 results limit reached. Use limit=2000 for more, or refine pattern] - -Details: -{ resultLimitReached: 1000 } -``` - -**Hit byte limit (unlikely, < 1000 results, 30KB):** -``` -LLM sees: -[paths that fit] - -[30KB limit reached] - -Details: -{ truncation: { truncatedBy: "bytes", ... } } -``` - ---- - -## ls - -Head truncation. Primary limit: 500 entries. Entry names only (never > 30KB each). - -### Scenarios - -**Hit entry limit (500 entries):** -``` -LLM sees: -.gitignore -README.md -src/ -[497 more entries] - -[500 entries limit reached. Use limit=1000 for more] - -Details: -{ entryLimitReached: 500 } -``` - -**Hit byte limit (unlikely):** -``` -LLM sees: -[entries that fit] - -[30KB limit reached] - -Details: -{ truncation: { truncatedBy: "bytes", ... } } -``` - ---- - -## TUI Display - -`tool-execution.ts` reads `details.truncation` and related fields to display truncation notices in warning color. The LLM text content and TUI display show the same information. diff --git a/packages/coding-agent/docs/undercompaction.md b/packages/coding-agent/docs/undercompaction.md deleted file mode 100644 index 3de29719..00000000 --- a/packages/coding-agent/docs/undercompaction.md +++ /dev/null @@ -1,313 +0,0 @@ -# Under-Compaction Analysis - -## Problem Statement - -Auto-compaction triggers too late, causing context window overflows that result in failed LLM calls with `stopReason == "length"`. 
- -## Architecture Overview - -### Event Flow - -``` -User prompt - │ - ▼ -agent.prompt() - │ - ▼ -agentLoop() in packages/ai/src/agent/agent-loop.ts - │ - ├─► streamAssistantResponse() - │ │ - │ ▼ - │ LLM provider (Anthropic, OpenAI, etc.) - │ │ - │ ▼ - │ Events: message_start → message_update* → message_end - │ │ - │ ▼ - │ AssistantMessage with usage stats (input, output, cacheRead, cacheWrite) - │ - ├─► If assistant has tool calls: - │ │ - │ ▼ - │ executeToolCalls() - │ │ - │ ├─► tool_execution_start (toolCallId, toolName, args) - │ │ - │ ├─► tool.execute() runs (read, bash, write, edit, etc.) - │ │ - │ ├─► tool_execution_end (toolCallId, toolName, result, isError) - │ │ - │ └─► message_start + message_end for ToolResultMessage - │ - └─► Loop continues until no more tool calls - │ - ▼ - agent_end -``` - -### Token Usage Reporting - -Token usage is ONLY available in `AssistantMessage.usage` after the LLM responds: - -```typescript -// From packages/ai/src/types.ts -export interface Usage { - input: number; // Tokens in the request - output: number; // Tokens generated - cacheRead: number; // Cached tokens read - cacheWrite: number; // Cached tokens written - cost: Cost; -} -``` - -The `input` field represents the total context size sent to the LLM, which includes: -- System prompt -- All conversation messages -- All tool results from previous calls - -### Current Compaction Check - -Both TUI (`tui-renderer.ts`) and RPC (`main.ts`) modes check compaction identically: - -```typescript -// In agent.subscribe() callback: -if (event.type === "message_end") { - // ... 
- if (event.message.role === "assistant") { - await checkAutoCompaction(); - } -} - -async function checkAutoCompaction() { - // Get last non-aborted assistant message - const messages = agent.state.messages; - let lastAssistant = findLastNonAbortedAssistant(messages); - if (!lastAssistant) return; - - const contextTokens = calculateContextTokens(lastAssistant.usage); - const contextWindow = agent.state.model.contextWindow; - - if (!shouldCompact(contextTokens, contextWindow, settings)) return; - - // Trigger compaction... -} -``` - -**The check happens on `message_end` for assistant messages only.** - -## The Under-Compaction Problem - -### Failure Scenario - -``` -Context window: 200,000 tokens -Reserve tokens: 16,384 (default) -Threshold: 200,000 - 16,384 = 183,616 - -Turn N: - 1. Assistant message received, usage shows 180,000 tokens - 2. shouldCompact(180000, 200000, settings) → 180000 > 183616 → FALSE - 3. Tool executes: `cat large-file.txt` → outputs 100KB (~25,000 tokens) - 4. Context now effectively 205,000 tokens, but we don't know this - 5. Next LLM call fails: context exceeds 200,000 window -``` - -The problem occurs when: -1. Context is below threshold (so compaction doesn't trigger) -2. A tool adds enough content to push it over the window limit -3. We only discover this when the next LLM call fails - -### Root Cause - -1. **Token counts are retrospective**: We only learn the context size AFTER the LLM processes it -2. **Tool results are blind spots**: When a tool executes and returns a large result, we don't know how many tokens it adds until the next LLM call -3. 
**No estimation before submission**: We submit the context and hope it fits - -## Current Tool Output Limits - -| Tool | Our Limit | Worst Case | -|------|-----------|------------| -| bash | 10MB per stream | 20MB (~5M tokens) | -| read | 2000 lines × 2000 chars | 4MB (~1M tokens) | -| write | Byte count only | Minimal | -| edit | Diff output | Variable | - -## How Other Tools Handle This - -### SST/OpenCode - -**Tool Output Limits (during execution):** - -| Tool | Limit | Details | -|------|-------|---------| -| bash | 30KB chars | `MAX_OUTPUT_LENGTH = 30_000`, truncates with notice | -| read | 2000 lines × 2000 chars/line | No total cap, theoretically 4MB | -| grep | 100 matches, 2000 chars/line | Truncates with notice | -| ls | 100 files | Truncates with notice | -| glob | 100 results | Truncates with notice | -| webfetch | 5MB | `MAX_RESPONSE_SIZE` | - -**Overflow Detection:** -- `isOverflow()` runs BEFORE each turn (not during) -- Uses last LLM-reported token count: `tokens.input + tokens.cache.read + tokens.output` -- Triggers if `count > context - maxOutput` -- Does NOT detect overflow from tool results in current turn - -**Recovery - Pruning:** -- `prune()` runs AFTER each turn completes -- Walks backwards through completed tool results -- Keeps last 40k tokens of tool outputs (`PRUNE_PROTECT`) -- Removes content from older tool results (marks `time.compacted`) -- Only prunes if savings > 20k tokens (`PRUNE_MINIMUM`) -- Token estimation: `chars / 4` - -**Recovery - Compaction:** -- Triggered when `isOverflow()` returns true before a turn -- LLM generates summary of conversation -- Replaces old messages with summary - -**Gap:** No mid-turn protection. A single read returning 4MB would overflow. The 30KB bash limit is their primary practical protection. 
- -### OpenAI/Codex - -**Tool Output Limits (during execution):** - -| Tool | Limit | Details | -|------|-------|---------| -| shell/exec | 10k tokens or 10k bytes | Per-model `TruncationPolicy`, user-configurable | -| read_file | 2000 lines, 500 chars/line | `MAX_LINE_LENGTH = 500`, ~1MB max | -| grep_files | 100 matches | Default limit | -| list_dir | Configurable | BFS with depth limits | - -**Truncation Policy:** -- Per-model family setting: `TruncationPolicy::Bytes(10_000)` or `TruncationPolicy::Tokens(10_000)` -- User can override via `tool_output_token_limit` config -- Applied to ALL tool outputs uniformly via `truncate_function_output_items_with_policy()` -- Preserves beginning and end, removes middle with `"…N tokens truncated…"` marker - -**Overflow Detection:** -- After each successful turn: `if total_usage_tokens >= auto_compact_token_limit { compact() }` -- Per-model thresholds (e.g., 180k for 200k context window) -- `ContextWindowExceeded` error caught and handled - -**Recovery - Compaction:** -- If tokens exceed threshold after turn, triggers `run_inline_auto_compact_task()` -- During compaction, if `ContextWindowExceeded`: removes oldest history item and retries -- Loop: `history.remove_first_item()` until it fits -- Notifies user: "Trimmed N older conversation item(s)" - -**Recovery - Turn Error:** -- On `ContextWindowExceeded` during normal turn: marks tokens as full, returns error to user -- Does NOT auto-retry the failed turn -- User must manually continue - -**Gap:** Still no mid-turn protection, but aggressive 10k token truncation on all tool outputs prevents most issues in practice. 
- -### Comparison - -| Feature | pi-coding-agent | OpenCode | Codex | -|---------|-----------------|----------|-------| -| Bash limit | 10MB | 30KB | ~40KB (10k tokens) | -| Read limit | 2000×2000 (4MB) | 2000×2000 (4MB) | 2000×500 (1MB) | -| Truncation policy | None | Per-tool | Per-model, uniform | -| Token estimation | None | chars/4 | chars/4 | -| Pre-turn check | No | Yes (last tokens) | Yes (threshold) | -| Mid-turn check | No | No | No | -| Post-turn pruning | No | Yes (removes old tool output) | No | -| Overflow recovery | No | Compaction | Trim oldest + compact | - -**Key insight:** None of these tools protect against mid-turn overflow. Their practical protection is aggressive static limits on tool output, especially bash. OpenCode's 30KB bash limit vs our 10MB is the critical difference. - -## Recommended Solution - -### Phase 1: Static Limits (immediate) - -Add hard limits to tool outputs matching industry practice: - -```typescript -// packages/coding-agent/src/tools/limits.ts -export const MAX_TOOL_OUTPUT_CHARS = 30_000; // ~7.5k tokens, matches OpenCode bash -export const MAX_TOOL_OUTPUT_NOTICE = "\n\n...(truncated, output exceeded limit)..."; -``` - -Apply to all tools: -- bash: 10MB → 30KB -- read: Add 100KB total output cap -- edit: Cap diff output - -### Phase 2: Post-Tool Estimation - -After `tool_execution_end`, estimate and flag: - -```typescript -let needsCompactionAfterTurn = false; - -agent.subscribe(async (event) => { - if (event.type === "tool_execution_end") { - const resultChars = extractTextLength(event.result); - const estimatedTokens = Math.ceil(resultChars / 4); - - const lastUsage = getLastAssistantUsage(agent.state.messages); - if (lastUsage) { - const current = calculateContextTokens(lastUsage); - const projected = current + estimatedTokens; - const threshold = agent.state.model.contextWindow - settings.reserveTokens; - if (projected > threshold) { - needsCompactionAfterTurn = true; - } - } - } - - if (event.type === "turn_end" && 
needsCompactionAfterTurn) { - needsCompactionAfterTurn = false; - await triggerCompaction(); - } -}); -``` - -### Phase 3: Overflow Recovery (like Codex) - -Handle `stopReason === "length"` gracefully: - -```typescript -if (event.type === "message_end" && event.message.role === "assistant") { - if (event.message.stopReason === "length") { - // Context overflow occurred - await triggerCompaction(); - // Optionally: retry the turn - } -} -``` - -During compaction, if it also overflows, trim oldest messages: - -```typescript -async function compactWithRetry() { - while (true) { - try { - await compact(); - break; - } catch (e) { - if (isContextOverflow(e) && messages.length > 1) { - messages.shift(); // Remove oldest - continue; - } - throw e; - } - } -} -``` - -## Summary - -The under-compaction problem occurs because: -1. We only check context size after assistant messages -2. Tool results can add arbitrary amounts of content -3. We discover overflows only when the next LLM call fails - -The fix requires: -1. Aggressive static limits on tool output (immediate safety net) -2. Token estimation after tool execution (proactive detection) -3. 
Graceful handling of overflow errors (fallback recovery) diff --git a/packages/coding-agent/src/core/hooks/loader.ts b/packages/coding-agent/src/core/hooks/loader.ts index 8ed97d97..f4af4988 100644 --- a/packages/coding-agent/src/core/hooks/loader.ts +++ b/packages/coding-agent/src/core/hooks/loader.ts @@ -3,13 +3,36 @@ */ import * as fs from "node:fs"; +import { createRequire } from "node:module"; import * as os from "node:os"; import * as path from "node:path"; +import { fileURLToPath } from "node:url"; import type { Attachment } from "@mariozechner/pi-agent-core"; import { createJiti } from "jiti"; import { getAgentDir } from "../../config.js"; import type { HookAPI, HookFactory } from "./types.js"; +// Create require function to resolve module paths at runtime +const require = createRequire(import.meta.url); + +// Lazily computed aliases - resolved at runtime to handle global installs +let _aliases: Record<string, string> | null = null; +function getAliases(): Record<string, string> { + if (_aliases) return _aliases; + + const __dirname = path.dirname(fileURLToPath(import.meta.url)); + const packageIndex = path.resolve(__dirname, "../..", "index.js"); + + _aliases = { + "@mariozechner/pi-coding-agent": packageIndex, + "@mariozechner/pi-coding-agent/hooks": path.resolve(__dirname, "index.js"), + "@mariozechner/pi-tui": require.resolve("@mariozechner/pi-tui"), + "@mariozechner/pi-ai": require.resolve("@mariozechner/pi-ai"), + "@sinclair/typebox": require.resolve("@sinclair/typebox"), + }; + return _aliases; +} + /** * Generic handler function type. */ @@ -117,7 +140,11 @@ async function loadHook(hookPath: string, cwd: string): Promise<{ hook: LoadedHo try { // Create jiti instance for TypeScript/ESM loading - const jiti = createJiti(import.meta.url); + // Use aliases to resolve package imports since hooks are loaded from user directories + // (e.g.
~/.pi/agent/hooks) but import from packages installed with pi-coding-agent + const jiti = createJiti(import.meta.url, { + alias: getAliases(), + }); // Import the module const module = await jiti.import(resolvedPath, { default: true });