mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-15 04:03:29 +00:00
10 KiB
10 KiB
6. Observability & Tracing - System Design
Priority: 🟡 HIGH — Debugging is critical
Complexity: Medium
Effort Estimate: 4-6 hours
Overview
Every eval run produces a trace capturing what Claude did and why. No black boxes. When a test fails, you can see:
- What files Claude analyzed
- What questions it asked
- What specs it generated
- The reasoning behind each decision
Architecture
┌─────────────────────────────────────────────────────────────────┐
│ Observability Pipeline │
├─────────────────────────────────────────────────────────────────┤
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
│ │ Claude Agent │───▶│ Tracer │───▶│ Trace Store │ │
│ │ Hooks │ │ (collector) │ │ (.json) │ │
│ └──────────────┘ └──────────────┘ └──────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Trace Viewer │ │
│ │ (Promptfoo) │ │
│ └──────────────┘ │
└─────────────────────────────────────────────────────────────────┘
Core Types
/**
 * Complete record of one eval run, persisted as JSON under
 * .evaluclaude/traces/<id>.json. Fields are grouped by pipeline phase;
 * a Tracer instance fills them in and finalize() stamps completion.
 */
interface EvalTrace {
  id: string;             // UUID
  evalId: string;         // Links to EvalSpec
  startedAt: string;      // ISO-8601, set when the Tracer is constructed
  completedAt: string;    // ISO-8601, set by finalize(); '' until then
  duration: number;       // ms, completedAt - startedAt
  status: 'success' | 'partial' | 'failed';
  // Phase 1: codebase introspection results.
  introspection: {
    filesAnalyzed: string[];
    totalFunctions: number;
    totalClasses: number;
    duration: number;     // ms spent in introspection
  };
  // Phase 2: everything captured while Claude analyzed the codebase.
  analysis: {
    promptTokens: number;
    completionTokens: number;
    toolCalls: ToolCall[];
    questionsAsked: Question[];  // Question is declared elsewhere in this module
    decisions: Decision[];
  };
  // Phase 3: spec/test generation output.
  generation: {
    scenariosGenerated: number;
    filesWritten: string[];
  };
  // Phase 4: test execution summary.
  execution: {
    testsPassed: number;
    testsFailed: number;
    testsSkipped: number;
    failures: TestFailure[];
  };
  errors: TraceError[];   // TraceError is declared elsewhere in this module
}
/**
 * A single tool invocation captured via the SDK hooks.
 * input/output are `unknown` rather than `any`: the shape depends on
 * the tool, so consumers must narrow before dereferencing.
 */
interface ToolCall {
  timestamp: string;   // ISO-8601 time the call completed
  tool: string;        // Tool name as reported by the hook
  input: unknown;      // Raw tool input — narrow before use
  output: unknown;     // Raw tool output — narrow before use
  duration: number;    // Wall-clock duration in ms
}
/** A judgement Claude recorded while analyzing the codebase. */
interface Decision {
  timestamp: string;  // ISO-8601 time the decision was recorded
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  subject: string;    // What was decided about
  reasoning: string;  // Why
  confidence: number; // 0-1
}
/**
 * One failed scenario from the execution phase.
 * expected/actual are `unknown` rather than `any`: their shape depends
 * on the assertion that failed, so consumers must narrow before use.
 */
interface TestFailure {
  scenarioId: string;  // Which generated scenario failed
  error: string;       // Human-readable failure message
  stack?: string;      // Stack trace, when available
  expected?: unknown;  // Asserted-for value — narrow before use
  actual?: unknown;    // Observed value — narrow before use
}
Hook-Based Collection
Use Claude Agent SDK hooks to capture everything:
import { ClaudeAgentOptions } from '@anthropic-ai/claude-agent-sdk';
import { Tracer } from './tracer';
/**
 * Build agent options whose hooks feed every tool invocation into the
 * given Tracer, without touching any agent logic.
 *
 * Fixes vs. the previous draft: the hook output field is `continue`
 * (not `continue_`), and the PostToolUse hook input exposes the tool's
 * result as `tool_response` (not `tool_output`) — see the Claude Agent
 * SDK hooks reference.
 */
function createTracedOptions(tracer: Tracer): Partial<ClaudeAgentOptions> {
  return {
    hooks: {
      preToolUse: [{
        hooks: [async (input) => {
          tracer.recordToolStart(input.tool_name, input.tool_input);
          // `continue: true` lets the tool call proceed.
          return { continue: true };
        }],
      }],
      postToolUse: [{
        hooks: [async (input) => {
          tracer.recordToolEnd(input.tool_name, input.tool_response);
          return {};
        }],
      }],
    },
  };
}
Tracer Implementation
/**
 * Collects a single EvalTrace over the lifetime of one eval run.
 *
 * Tool calls are matched start-to-end via a stack of pending calls, so
 * nested or interleaved invocations are recorded correctly — the
 * previous single-slot field (`currentToolCall`) silently dropped the
 * outer call whenever a second tool started before the first finished.
 */
class Tracer {
  private trace: EvalTrace;
  // In-flight tool calls, most recent last; recordToolEnd pops the
  // nearest entry with a matching tool name.
  private pendingToolCalls: { name: string; input: any; startTime: number }[] = [];

  constructor(evalId: string) {
    this.trace = {
      id: crypto.randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '',
      duration: 0,
      status: 'success',
      introspection: { filesAnalyzed: [], totalFunctions: 0, totalClasses: 0, duration: 0 },
      analysis: { promptTokens: 0, completionTokens: 0, toolCalls: [], questionsAsked: [], decisions: [] },
      generation: { scenariosGenerated: 0, filesWritten: [] },
      execution: { testsPassed: 0, testsFailed: 0, testsSkipped: 0, failures: [] },
      errors: [],
    };
  }

  /** Called from the preToolUse hook: remember when the tool started. */
  recordToolStart(name: string, input: any): void {
    this.pendingToolCalls.push({ name, input, startTime: Date.now() });
  }

  /**
   * Called from the postToolUse hook: completes the most recent pending
   * call with the same tool name. Unmatched end events are ignored,
   * matching the original behavior.
   */
  recordToolEnd(name: string, output: any): void {
    for (let i = this.pendingToolCalls.length - 1; i >= 0; i--) {
      if (this.pendingToolCalls[i].name !== name) continue;
      const [started] = this.pendingToolCalls.splice(i, 1);
      this.trace.analysis.toolCalls.push({
        timestamp: new Date().toISOString(),
        tool: name,
        input: started.input,
        output,
        duration: Date.now() - started.startTime,
      });
      return;
    }
  }

  /** Record a clarifying question Claude asked, together with its answer. */
  recordQuestion(question: any, answer: string): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      answer,
      timestamp: new Date().toISOString(),
    });
  }

  /** Record a decision Claude made, with its reasoning and confidence (0-1). */
  recordDecision(type: Decision['type'], subject: string, reasoning: string, confidence: number): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence,
    });
  }

  /** Record an exception; any recorded error marks the whole run 'failed'. */
  recordError(error: Error, context?: string): void {
    this.trace.errors.push({
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    });
    this.trace.status = 'failed';
  }

  /** Stamp completion time, compute total duration (ms), return the trace. */
  finalize(): EvalTrace {
    this.trace.completedAt = new Date().toISOString();
    this.trace.duration =
      new Date(this.trace.completedAt).getTime() - new Date(this.trace.startedAt).getTime();
    return this.trace;
  }
}
Trace Storage
// Directory where finalized traces are persisted, one JSON file per run.
const TRACES_DIR = '.evaluclaude/traces';

/**
 * Persist a finalized trace to disk as pretty-printed JSON.
 * Creates the traces directory on first use; returns the path written.
 */
async function saveTrace(trace: EvalTrace): Promise<string> {
  await fs.mkdir(TRACES_DIR, { recursive: true });
  const destination = path.join(TRACES_DIR, `${trace.id}.json`);
  const serialized = JSON.stringify(trace, null, 2);
  await fs.writeFile(destination, serialized);
  return destination;
}
/** Read and parse a stored trace by its id. */
async function loadTrace(traceId: string): Promise<EvalTrace> {
  const raw = await fs.readFile(path.join(TRACES_DIR, `${traceId}.json`), 'utf-8');
  // NOTE(review): the parsed JSON is trusted as EvalTrace without
  // validation — consider a schema check if traces may be hand-edited.
  return JSON.parse(raw);
}
/**
 * Load all stored traces, optionally filtered by evalId.
 *
 * Returns [] when the traces directory does not exist yet — previously
 * this threw ENOENT on a fresh checkout before any trace was saved.
 * Also strips the '.json' suffix instead of replacing the first
 * occurrence anywhere in the file name.
 */
async function listTraces(evalId?: string): Promise<EvalTrace[]> {
  let files: string[];
  try {
    files = await fs.readdir(TRACES_DIR);
  } catch (e: unknown) {
    if ((e as { code?: string }).code === 'ENOENT') return []; // no traces yet
    throw e;
  }
  const traces = await Promise.all(
    files
      .filter((f) => f.endsWith('.json'))
      .map((f) => loadTrace(f.slice(0, -'.json'.length)))
  );
  return evalId ? traces.filter((t) => t.evalId === evalId) : traces;
}
Promptfoo Integration
Link traces to test results:
# promptfooconfig.yaml
defaultTest:
  metadata:
    traceFile: .evaluclaude/traces/{{traceId}}.json  # traces are saved under trace.id, not evalId
Trace Viewer CLI
// src/cli/commands/view.ts
import { Command } from 'commander';
import { loadTrace, listTraces } from '../observability/trace-store';
/**
 * `view [trace-id]` — display one trace, or the most recent with --last.
 * Exits with code 1 and a clear message when no trace can be resolved
 * (previously `loadTrace(undefined)` produced a confusing path error,
 * and --last with zero traces crashed on an undefined trace).
 */
export const viewCommand = new Command('view')
  .description('View eval trace')
  .argument('[trace-id]', 'Specific trace ID')
  .option('--last', 'View most recent trace')
  .option('--json', 'Output raw JSON')
  .action(async (traceId, options) => {
    let trace: EvalTrace | undefined;
    if (options.last) {
      // Most recently started first; copy before sorting so the array
      // returned by listTraces is not mutated.
      trace = [...(await listTraces())].sort(
        (a, b) => new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime()
      )[0];
      if (!trace) {
        console.error('No traces found.');
        process.exitCode = 1;
        return;
      }
    } else if (traceId) {
      trace = await loadTrace(traceId);
    } else {
      console.error('Provide a trace ID or pass --last.');
      process.exitCode = 1;
      return;
    }
    if (options.json) {
      console.log(JSON.stringify(trace, null, 2));
    } else {
      displayTrace(trace);
    }
  });
/**
 * Pretty-print a trace to stdout, one section per pipeline phase.
 * Now also reports skipped tests and collected errors — both are
 * recorded in the trace but the previous version never displayed them.
 */
function displayTrace(trace: EvalTrace): void {
  console.log(`\n📊 Trace: ${trace.id}`);
  console.log(`  Status: ${trace.status}`);
  console.log(`  Duration: ${trace.duration}ms`);
  console.log(`\n📂 Introspection:`);
  console.log(`  Files: ${trace.introspection.filesAnalyzed.length}`);
  console.log(`  Functions: ${trace.introspection.totalFunctions}`);
  console.log(`\n🤖 Analysis:`);
  console.log(`  Tool calls: ${trace.analysis.toolCalls.length}`);
  console.log(`  Questions: ${trace.analysis.questionsAsked.length}`);
  console.log(`  Decisions: ${trace.analysis.decisions.length}`);
  console.log(`\n🧪 Execution:`);
  console.log(`  ✅ Passed: ${trace.execution.testsPassed}`);
  console.log(`  ❌ Failed: ${trace.execution.testsFailed}`);
  console.log(`  ⏭️ Skipped: ${trace.execution.testsSkipped}`);
  if (trace.execution.failures.length > 0) {
    console.log(`\n❌ Failures:`);
    trace.execution.failures.forEach((f) => {
      console.log(`  - ${f.scenarioId}: ${f.error}`);
    });
  }
  if (trace.errors.length > 0) {
    console.log(`\n🔥 Errors:`);
    trace.errors.forEach((e) => {
      console.log(`  - ${e.message}${e.context ? ` (${e.context})` : ''}`);
    });
  }
}
File Structure
src/observability/
├── index.ts # Main exports
├── tracer.ts # Hook-based collection
├── trace-store.ts # Persist to filesystem
├── trace-viewer.ts # Format for display
└── types.ts # EvalTrace interface
.evaluclaude/
└── traces/
├── abc123.json
├── def456.json
└── ...
What Gets Traced
| Phase | Captured |
|---|---|
| Introspection | Files parsed, functions/classes found, duration |
| Analysis | Every tool call, questions asked, decisions made |
| Generation | Scenarios created, files written |
| Execution | Test results, failures with context |
| Errors | Any exceptions with stack traces |
Dependencies
No new runtime dependencies: tracing uses only Node built-ins (fs, path, crypto) plus packages the project already depends on (@anthropic-ai/claude-agent-sdk, commander).
{
"dependencies": {}
}
Success Criteria
- Every eval run produces a trace
- Traces capture all tool calls
- Questions and answers are recorded
- Test failures link to trace
- CLI viewer displays traces clearly
- Traces stored efficiently (<1MB each)