mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-16 15:02:37 +00:00
iteration 0
This commit is contained in:
commit
4b24606d0e
25 changed files with 7843 additions and 0 deletions
364
docs/06-observability-tracing.md
Normal file
364
docs/06-observability-tracing.md
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
# 6. Observability & Tracing - System Design
|
||||
|
||||
> **Priority**: 🟡 HIGH — Debugging is critical
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 4-6 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Every eval run produces a **trace** capturing what Claude did and why. No black boxes. When a test fails, you can see:
|
||||
- What files Claude analyzed
|
||||
- What questions it asked
|
||||
- What specs it generated
|
||||
- The reasoning behind each decision
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
│                     Observability Pipeline                      │
├─────────────────────────────────────────────────────────────────┤
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐       │
│  │ Claude Agent │───▶│    Tracer    │───▶│ Trace Store  │       │
│  │    Hooks     │    │  (collector) │    │   (.json)    │       │
│  └──────────────┘    └──────────────┘    └──────────────┘       │
│                                                 │               │
│                                                 ▼               │
│                                          ┌──────────────┐       │
│                                          │ Trace Viewer │       │
│                                          │  (Promptfoo) │       │
│                                          └──────────────┘       │
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
/**
 * Complete record of a single eval run, serialized as one JSON document.
 *
 * The sections mirror the pipeline phases: introspection, analysis,
 * generation, execution, plus any errors raised along the way.
 */
interface EvalTrace {
  /** Unique ID of this trace (UUID). */
  id: string;
  /** Links to EvalSpec. */
  evalId: string;
  /** Timestamp string for when the run began. */
  startedAt: string;
  /** Timestamp string for when the run finished. */
  completedAt: string;
  /** Total run time in milliseconds. */
  duration: number;

  status: 'success' | 'partial' | 'failed';

  /** What the static introspection phase looked at. */
  introspection: {
    filesAnalyzed: string[];
    totalFunctions: number;
    totalClasses: number;
    duration: number;
  };

  /** What the model did while analyzing the codebase. */
  analysis: {
    promptTokens: number;
    completionTokens: number;
    toolCalls: ToolCall[];
    questionsAsked: Question[];
    decisions: Decision[];
  };

  /** What the generation phase produced. */
  generation: {
    scenariosGenerated: number;
    filesWritten: string[];
  };

  /** Outcome of running the generated tests. */
  execution: {
    testsPassed: number;
    testsFailed: number;
    testsSkipped: number;
    failures: TestFailure[];
  };

  errors: TraceError[];
}

/**
 * A question put to the user together with the answer received.
 *
 * Only `answer` and `timestamp` are fixed; the remaining fields are whatever
 * question object the caller hands to the tracer.
 * NOTE(review): pin the question fields down once the question type is designed.
 */
interface Question {
  timestamp: string;
  answer: string;
  [field: string]: unknown;
}

/** An exception captured during the run. */
interface TraceError {
  timestamp: string;
  message: string;
  stack?: string;
  /** Free-form description of what was happening when the error occurred. */
  context?: string;
}
|
||||
|
||||
/** One tool invocation made by the agent. */
interface ToolCall {
  timestamp: string;
  /** Name of the tool that was invoked. */
  tool: string;
  /** Arguments passed to the tool; shape depends on the tool, so narrow before use. */
  input: unknown;
  /** Result returned by the tool; shape depends on the tool, so narrow before use. */
  output: unknown;
  duration: number;
}
|
||||
|
||||
/** A choice the agent made during analysis, with its stated justification. */
interface Decision {
  timestamp: string;
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  /** What was decided about. */
  subject: string;
  /** Why the decision was made. */
  reasoning: string;
  /** Confidence in the decision, from 0 to 1. */
  confidence: number;
}
|
||||
|
||||
/** A single failed test scenario. */
interface TestFailure {
  /** ID of the scenario that failed. */
  scenarioId: string;
  /** Failure message. */
  error: string;
  stack?: string;
  /** Expected value, when the failure was a comparison; arbitrary shape. */
  expected?: unknown;
  /** Actual value, when the failure was a comparison; arbitrary shape. */
  actual?: unknown;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Hook-Based Collection
|
||||
|
||||
Use Claude Agent SDK hooks to capture everything:
|
||||
|
||||
```typescript
|
||||
import { ClaudeAgentOptions } from '@anthropic-ai/claude-agent-sdk';
|
||||
import { Tracer } from './tracer';
|
||||
|
||||
/**
 * Builds the agent options fragment that reports every tool invocation to `tracer`.
 *
 * The hook event keys (`PreToolUse` / `PostToolUse`), the `tool_response` field
 * and the `continue` flag follow the TypeScript Agent SDK. The `continue_`
 * spelling only exists in the Python SDK, where `continue` is a keyword.
 *
 * NOTE(review): the TypeScript SDK appears to export its options type as
 * `Options`, not `ClaudeAgentOptions` — confirm the import before implementing.
 *
 * @param tracer Collector that receives the start and end of each tool call.
 * @returns Options containing only the `hooks` entry, to be merged into the caller's options.
 */
function createTracedOptions(tracer: Tracer): Partial<ClaudeAgentOptions> {
  return {
    hooks: {
      PreToolUse: [{
        hooks: [async (input) => {
          // Hook input is a union over event types; narrow before reading tool fields.
          if (input.hook_event_name === 'PreToolUse') {
            tracer.recordToolStart(input.tool_name, input.tool_input);
          }
          return { continue: true };
        }],
      }],
      PostToolUse: [{
        hooks: [async (input) => {
          if (input.hook_event_name === 'PostToolUse') {
            tracer.recordToolEnd(input.tool_name, input.tool_response);
          }
          return {};
        }],
      }],
    },
  };
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Tracer Implementation
|
||||
|
||||
```typescript
|
||||
/**
 * Accumulates everything that happens during one eval run into an `EvalTrace`.
 *
 * Callers report events through the `record*` methods and call `finalize()`
 * once at the end to stamp the completion time and obtain the trace.
 */
class Tracer {
  private readonly trace: EvalTrace;

  /**
   * Tool calls that have started but not yet ended, keyed by tool name.
   * Each queue is oldest-first, so overlapping calls (including several calls
   * to the same tool) are each matched with their own start record.
   */
  private readonly pendingToolCalls = new Map<string, Array<{ input: any; startTime: number }>>();

  /**
   * @param evalId ID of the eval this trace belongs to.
   */
  constructor(evalId: string) {
    this.trace = {
      id: crypto.randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '',
      duration: 0,
      status: 'success',
      introspection: { filesAnalyzed: [], totalFunctions: 0, totalClasses: 0, duration: 0 },
      analysis: { promptTokens: 0, completionTokens: 0, toolCalls: [], questionsAsked: [], decisions: [] },
      generation: { scenariosGenerated: 0, filesWritten: [] },
      execution: { testsPassed: 0, testsFailed: 0, testsSkipped: 0, failures: [] },
      errors: [],
    };
  }

  /**
   * Notes that a tool call has begun.
   *
   * @param name Tool name.
   * @param input Arguments the tool was called with.
   */
  recordToolStart(name: string, input: any): void {
    const queue = this.pendingToolCalls.get(name) ?? [];
    queue.push({ input, startTime: Date.now() });
    this.pendingToolCalls.set(name, queue);
  }

  /**
   * Completes the oldest pending call to `name` and appends it to the trace.
   *
   * An end with no recorded start is still appended (with `input` undefined
   * and `duration` 0) so that the tool's output is never silently dropped.
   *
   * @param name Tool name.
   * @param output Result the tool returned.
   */
  recordToolEnd(name: string, output: any): void {
    const finishedAt = Date.now();
    const queue = this.pendingToolCalls.get(name);
    const started = queue?.shift();
    if (queue !== undefined && queue.length === 0) {
      this.pendingToolCalls.delete(name);
    }

    this.trace.analysis.toolCalls.push({
      timestamp: new Date(finishedAt).toISOString(),
      tool: name,
      input: started?.input,
      output,
      duration: started !== undefined ? finishedAt - started.startTime : 0,
    });
  }

  /**
   * Records a question together with its answer.
   *
   * @param question Question object; its fields are copied into the record as-is.
   * @param answer Answer that was given.
   */
  recordQuestion(question: any, answer: string): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      answer,
      timestamp: new Date().toISOString(),
    });
  }

  /**
   * Records a decision and the reasoning behind it.
   *
   * @param type Kind of decision.
   * @param subject What was decided about.
   * @param reasoning Why.
   * @param confidence Confidence in the decision, 0-1.
   */
  recordDecision(type: Decision['type'], subject: string, reasoning: string, confidence: number): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence,
    });
  }

  /**
   * Records an exception and marks the whole trace as failed.
   *
   * @param error The error that was raised.
   * @param context Optional description of what was happening at the time.
   */
  recordError(error: Error, context?: string): void {
    this.trace.errors.push({
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    });
    this.trace.status = 'failed';
  }

  /**
   * Stamps the completion time, computes the total duration and returns the trace.
   *
   * @returns The accumulated trace (the tracer's own object, not a copy).
   */
  finalize(): EvalTrace {
    const completedAt = new Date();
    this.trace.completedAt = completedAt.toISOString();
    this.trace.duration = completedAt.getTime() - new Date(this.trace.startedAt).getTime();
    return this.trace;
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Trace Storage
|
||||
|
||||
```typescript
|
||||
// Directory that holds one JSON file per trace. The path is relative, so it
// resolves against the working directory of the process.
const TRACES_DIR = '.evaluclaude/traces';
|
||||
|
||||
/**
 * Persists a trace as pretty-printed JSON at `<TRACES_DIR>/<trace.id>.json`,
 * creating the directory if needed.
 *
 * The JSON is first written to a sibling `.tmp` file and then renamed into
 * place, so an interrupted write cannot leave a truncated `.json` file behind
 * for readers of the directory to trip over.
 *
 * @param trace Trace to store.
 * @returns Path of the file that was written.
 */
async function saveTrace(trace: EvalTrace): Promise<string> {
  await fs.mkdir(TRACES_DIR, { recursive: true });

  const filePath = path.join(TRACES_DIR, `${trace.id}.json`);
  const tempPath = `${filePath}.tmp`;

  await fs.writeFile(tempPath, JSON.stringify(trace, null, 2));
  await fs.rename(tempPath, filePath);
  return filePath;
}
|
||||
|
||||
/**
 * Reads and parses the trace stored under the given ID.
 *
 * @param traceId ID of the trace; must be a bare file stem with no path components.
 * @returns The parsed trace. The JSON is not validated against `EvalTrace`.
 * @throws Error if `traceId` is empty or contains a path separator; file-system
 *         and JSON parse errors propagate unchanged.
 */
async function loadTrace(traceId: string): Promise<EvalTrace> {
  // The ID can come straight from the command line; keep it inside TRACES_DIR.
  if (!traceId || path.basename(traceId) !== traceId) {
    throw new Error(`Invalid trace id: ${JSON.stringify(traceId)}`);
  }

  const filePath = path.join(TRACES_DIR, `${traceId}.json`);
  const content = await fs.readFile(filePath, 'utf-8');
  return JSON.parse(content) as EvalTrace;
}
|
||||
|
||||
/**
 * Loads every stored trace, optionally restricted to one eval.
 *
 * @param evalId When given, only traces whose `evalId` matches are returned.
 * @returns The matching traces, or an empty array when the traces directory
 *          does not exist yet (nothing has been saved).
 * @throws Any other file-system error, and any error from loading a single trace.
 */
async function listTraces(evalId?: string): Promise<EvalTrace[]> {
  let files: string[];
  try {
    files = await fs.readdir(TRACES_DIR);
  } catch (err: unknown) {
    // A missing directory just means no run has saved a trace yet.
    if (typeof err === 'object' && err !== null && (err as { code?: unknown }).code === 'ENOENT') {
      return [];
    }
    throw err;
  }

  const suffix = '.json';
  const traceIds = files
    .filter(file => file.endsWith(suffix))
    // Strip only the trailing extension; `replace` would remove the first match instead.
    .map(file => file.slice(0, -suffix.length));

  const traces = await Promise.all(traceIds.map(id => loadTrace(id)));
  return evalId ? traces.filter(t => t.evalId === evalId) : traces;
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Promptfoo Integration
|
||||
|
||||
Link traces to test results:
|
||||
|
||||
```yaml
|
||||
# promptfooconfig.yaml
|
||||
defaultTest:
  metadata:
    # Trace files are named after the trace's own id (saveTrace writes
    # <trace.id>.json), not the eval id, so the link must use the trace id.
    # NOTE(review): a `traceId` variable has to be supplied for each test.
    traceFile: .evaluclaude/traces/{{traceId}}.json
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Trace Viewer CLI
|
||||
|
||||
```typescript
|
||||
// src/cli/commands/view.ts
|
||||
import { Command } from 'commander';
|
||||
import { loadTrace, listTraces } from '../observability/trace-store';
|
||||
|
||||
/**
 * `view` CLI command: shows one eval trace.
 *
 * The trace is either the one named by the optional `[trace-id]` argument or,
 * with `--last`, the stored trace with the latest `startedAt`. With `--json`
 * the raw trace is printed; otherwise a summary is shown.
 *
 * When there is nothing to show (no ID and no `--last`, or `--last` with no
 * stored traces) a message is written to stderr and the exit code is set to 1.
 *
 * NOTE(review): this file is src/cli/commands/view.ts, so the import of
 * '../observability/trace-store' resolves under src/cli/. The File Structure
 * section places the module at src/observability/, which would need
 * '../../observability/trace-store'. `EvalTrace` is also not imported here.
 */
export const viewCommand = new Command('view')
  .description('View eval trace')
  .argument('[trace-id]', 'Specific trace ID')
  .option('--last', 'View most recent trace')
  .option('--json', 'Output raw JSON')
  .action(async (traceId: string | undefined, options: { last?: boolean; json?: boolean }) => {
    let trace: EvalTrace | undefined;

    if (options.last) {
      const traces = await listTraces();
      // Single pass for the newest trace; on equal timestamps the earlier entry wins.
      trace = traces.reduce<EvalTrace | undefined>(
        (latest, candidate) =>
          latest === undefined ||
          new Date(candidate.startedAt).getTime() > new Date(latest.startedAt).getTime()
            ? candidate
            : latest,
        undefined,
      );
      if (trace === undefined) {
        console.error('No traces found.');
        process.exitCode = 1;
        return;
      }
    } else if (traceId) {
      trace = await loadTrace(traceId);
    } else {
      console.error('Specify a trace ID or use --last.');
      process.exitCode = 1;
      return;
    }

    if (options.json) {
      console.log(JSON.stringify(trace, null, 2));
    } else {
      displayTrace(trace);
    }
  });
|
||||
|
||||
/**
 * Prints a human-readable summary of a trace to stdout: status and duration,
 * then per-phase counts, then each test failure and each recorded error.
 *
 * @param trace Trace to summarize.
 */
function displayTrace(trace: EvalTrace): void {
  const { introspection, analysis, execution, errors } = trace;

  console.log(`\n📊 Trace: ${trace.id}`);
  console.log(` Status: ${trace.status}`);
  console.log(` Duration: ${trace.duration}ms`);

  console.log(`\n📂 Introspection:`);
  console.log(` Files: ${introspection.filesAnalyzed.length}`);
  console.log(` Functions: ${introspection.totalFunctions}`);
  console.log(` Classes: ${introspection.totalClasses}`);

  console.log(`\n🤖 Analysis:`);
  console.log(` Tool calls: ${analysis.toolCalls.length}`);
  console.log(` Questions: ${analysis.questionsAsked.length}`);
  console.log(` Decisions: ${analysis.decisions.length}`);

  console.log(`\n🧪 Execution:`);
  console.log(` ✅ Passed: ${execution.testsPassed}`);
  console.log(` ❌ Failed: ${execution.testsFailed}`);
  console.log(` ⏭️ Skipped: ${execution.testsSkipped}`);

  if (execution.failures.length > 0) {
    console.log(`\n❌ Failures:`);
    for (const failure of execution.failures) {
      console.log(` - ${failure.scenarioId}: ${failure.error}`);
    }
  }

  // A trace marked failed is only explainable if the recorded errors are shown too.
  if (errors.length > 0) {
    console.log(`\n💥 Errors:`);
    for (const err of errors) {
      const where = err.context ? `[${err.context}] ` : '';
      console.log(` - ${where}${err.message}`);
    }
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/observability/
├── index.ts            # Main exports
├── tracer.ts           # Hook-based collection
├── trace-store.ts      # Persist to filesystem
├── trace-viewer.ts     # Format for display
└── types.ts            # EvalTrace interface

.evaluclaude/
└── traces/
    ├── abc123.json
    ├── def456.json
    └── ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What Gets Traced
|
||||
|
||||
| Phase | Captured |
|-------|----------|
| Introspection | Files parsed, functions/classes found, duration |
| Analysis | Every tool call, questions asked, decisions made |
| Generation | Scenarios created, files written |
| Execution | Test results, failures with context |
| Errors | Any exceptions with stack traces |
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"dependencies": {}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Every eval run produces a trace
|
||||
- [ ] Traces capture all tool calls
|
||||
- [ ] Questions and answers are recorded
|
||||
- [ ] Test failures link to trace
|
||||
- [ ] CLI viewer displays traces clearly
|
||||
- [ ] Traces stored efficiently (<1MB each)
|
||||
Loading…
Add table
Add a link
Reference in a new issue