# 6. Observability & Tracing - System Design

> **Priority**: 🟡 HIGH — Debugging is critical
> **Complexity**: Medium
> **Effort Estimate**: 4-6 hours

---

## Overview

Every eval run produces a **trace** capturing what Claude did and why. No black boxes. When a test fails, you can see:

- What files Claude analyzed
- What questions it asked
- What specs it generated
- The reasoning behind each decision

---

## Architecture

```
┌──────────────────────────────────────────────────────────────┐
│                    Observability Pipeline                    │
├──────────────────────────────────────────────────────────────┤
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    │
│  │ Claude Agent │───▶│    Tracer    │───▶│ Trace Store  │    │
│  │    Hooks     │    │ (collector)  │    │   (.json)    │    │
│  └──────────────┘    └──────────────┘    └──────┬───────┘    │
│                                                 │            │
│                                                 ▼            │
│                                         ┌──────────────┐     │
│                                         │ Trace Viewer │     │
│                                         │ (Promptfoo)  │     │
│                                         └──────────────┘     │
└──────────────────────────────────────────────────────────────┘
```

---

## Core Types

```typescript
interface EvalTrace {
  id: string;            // UUID
  evalId: string;        // Links to EvalSpec
  startedAt: string;
  completedAt: string;
  duration: number;      // ms
  status: 'success' | 'partial' | 'failed';

  introspection: {
    filesAnalyzed: string[];
    totalFunctions: number;
    totalClasses: number;
    duration: number;
  };

  analysis: {
    promptTokens: number;
    completionTokens: number;
    toolCalls: ToolCall[];
    questionsAsked: Question[];
    decisions: Decision[];
  };

  generation: {
    scenariosGenerated: number;
    filesWritten: string[];
  };

  execution: {
    testsPassed: number;
    testsFailed: number;
    testsSkipped: number;
    failures: TestFailure[];
  };

  errors: TraceError[];
}

interface ToolCall {
  timestamp: string;
  tool: string;
  input: any;
  output: any;
  duration: number;
}

interface Question {
  timestamp: string;
  question: string;
  answer: string;
}

interface Decision {
  timestamp: string;
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  subject: string;       // What was decided about
  reasoning: string;     // Why
  confidence: number;    // 0-1
}

interface TestFailure {
  scenarioId: string;
  error: string;
  stack?: string;
  expected?: any;
  actual?: any;
}

interface TraceError {
  timestamp: string;
  message: string;
  stack?: string;
  context?: string;
}
```

---

## Hook-Based Collection

Use Claude Agent SDK hooks to capture everything:

```typescript
import { ClaudeAgentOptions } from '@anthropic-ai/claude-agent-sdk';
import { Tracer } from './tracer';

function createTracedOptions(tracer: Tracer): Partial<ClaudeAgentOptions> {
  return {
    hooks: {
      preToolUse: [{
        hooks: [async (input) => {
          tracer.recordToolStart(input.tool_name, input.tool_input);
          return { continue_: true };
        }]
      }],
      postToolUse: [{
        hooks: [async (input) => {
          tracer.recordToolEnd(input.tool_name, input.tool_output);
          return {};
        }]
      }],
    },
  };
}
```

---

## Tracer Implementation

```typescript
class Tracer {
  private trace: EvalTrace;
  private currentToolCall?: { name: string; input: any; startTime: number };

  constructor(evalId: string) {
    this.trace = {
      id: crypto.randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '',
      duration: 0,
      status: 'success',
      introspection: { filesAnalyzed: [], totalFunctions: 0, totalClasses: 0, duration: 0 },
      analysis: { promptTokens: 0, completionTokens: 0, toolCalls: [], questionsAsked: [], decisions: [] },
      generation: { scenariosGenerated: 0, filesWritten: [] },
      execution: { testsPassed: 0, testsFailed: 0, testsSkipped: 0, failures: [] },
      errors: [],
    };
  }

  recordToolStart(name: string, input: any): void {
    this.currentToolCall = { name, input, startTime: Date.now() };
  }

  recordToolEnd(name: string, output: any): void {
    if (this.currentToolCall?.name === name) {
      this.trace.analysis.toolCalls.push({
        timestamp: new Date().toISOString(),
        tool: name,
        input: this.currentToolCall.input,
        output,
        duration: Date.now() - this.currentToolCall.startTime,
      });
      this.currentToolCall = undefined;
    }
  }

  recordQuestion(question: any, answer: string): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      answer,
      timestamp: new Date().toISOString(),
    });
  }

  recordDecision(type: Decision['type'], subject: string, reasoning: string, confidence: number): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence,
    });
  }

  recordError(error: Error, context?: string): void {
    this.trace.errors.push({
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    });
    this.trace.status = 'failed';
  }

  finalize(): EvalTrace {
    this.trace.completedAt = new Date().toISOString();
    this.trace.duration =
      new Date(this.trace.completedAt).getTime() -
      new Date(this.trace.startedAt).getTime();
    return this.trace;
  }
}
```

---

## Trace Storage

```typescript
import fs from 'node:fs/promises';
import path from 'node:path';

const TRACES_DIR = '.evaluclaude/traces';

async function saveTrace(trace: EvalTrace): Promise<string> {
  await fs.mkdir(TRACES_DIR, { recursive: true });
  const filePath = path.join(TRACES_DIR, `${trace.id}.json`);
  await fs.writeFile(filePath, JSON.stringify(trace, null, 2));
  return filePath;
}

async function loadTrace(traceId: string): Promise<EvalTrace> {
  const filePath = path.join(TRACES_DIR, `${traceId}.json`);
  const content = await fs.readFile(filePath, 'utf-8');
  return JSON.parse(content);
}

async function listTraces(evalId?: string): Promise<EvalTrace[]> {
  const files = await fs.readdir(TRACES_DIR);
  const traces = await Promise.all(
    files.filter(f => f.endsWith('.json')).map(f => loadTrace(f.replace('.json', '')))
  );
  return evalId ? traces.filter(t => t.evalId === evalId) : traces;
}
```

---

## Promptfoo Integration

Link traces to test results:

```yaml
# promptfooconfig.yaml
defaultTest:
  metadata:
    traceFile: .evaluclaude/traces/{{evalId}}.json
```

---

## Trace Viewer CLI

```typescript
// src/cli/commands/view.ts
import { Command } from 'commander';
import { loadTrace, listTraces } from '../observability/trace-store';

export const viewCommand = new Command('view')
  .description('View eval trace')
  .argument('[trace-id]', 'Specific trace ID')
  .option('--last', 'View most recent trace')
  .option('--json', 'Output raw JSON')
  .action(async (traceId, options) => {
    let trace: EvalTrace;

    if (options.last) {
      const traces = await listTraces();
      trace = traces.sort((a, b) =>
        new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime()
      )[0];
    } else {
      trace = await loadTrace(traceId);
    }

    if (options.json) {
      console.log(JSON.stringify(trace, null, 2));
    } else {
      displayTrace(trace);
    }
  });

function displayTrace(trace: EvalTrace): void {
  console.log(`\n📊 Trace: ${trace.id}`);
  console.log(`   Status: ${trace.status}`);
  console.log(`   Duration: ${trace.duration}ms`);

  console.log(`\n📂 Introspection:`);
  console.log(`   Files: ${trace.introspection.filesAnalyzed.length}`);
  console.log(`   Functions: ${trace.introspection.totalFunctions}`);

  console.log(`\n🤖 Analysis:`);
  console.log(`   Tool calls: ${trace.analysis.toolCalls.length}`);
  console.log(`   Questions: ${trace.analysis.questionsAsked.length}`);
  console.log(`   Decisions: ${trace.analysis.decisions.length}`);

  console.log(`\n🧪 Execution:`);
  console.log(`   ✅ Passed: ${trace.execution.testsPassed}`);
  console.log(`   ❌ Failed: ${trace.execution.testsFailed}`);

  if (trace.execution.failures.length > 0) {
    console.log(`\n❌ Failures:`);
    trace.execution.failures.forEach(f => {
      console.log(`   - ${f.scenarioId}: ${f.error}`);
    });
  }
}
```

---

## File Structure

```
src/observability/
├── index.ts           # Main exports
├── tracer.ts          # Hook-based collection
├── trace-store.ts     # Persist to filesystem
├── trace-viewer.ts    # Format for display
└── types.ts           # EvalTrace interface

.evaluclaude/
└── traces/
    ├── abc123.json
    ├── def456.json
    └── ...
```

---

## What Gets Traced

| Phase | Captured |
|-------|----------|
| Introspection | Files parsed, functions/classes found, duration |
| Analysis | Every tool call, questions asked, decisions made |
| Generation | Scenarios created, files written |
| Execution | Test results, failures with context |
| Errors | Any exceptions with stack traces |

---

## Dependencies

No new runtime dependencies. The tracer and store use only the Node standard library (`node:fs/promises`, `node:path`, `crypto.randomUUID`); the CLI viewer reuses `commander` and hook collection reuses `@anthropic-ai/claude-agent-sdk`, both already project dependencies.

```json
{
  "dependencies": {}
}
```

---

## Success Criteria

- [ ] Every eval run produces a trace
- [ ] Traces capture all tool calls
- [ ] Questions and answers are recorded
- [ ] Test failures link to trace
- [ ] CLI viewer displays traces clearly
- [ ] Traces stored efficiently (<1MB each)