mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-17 20:05:07 +00:00
iteration 0
This commit is contained in:
commit
4b24606d0e
25 changed files with 7843 additions and 0 deletions
133
docs/00-tree-sitter-introspector.md
Normal file
133
docs/00-tree-sitter-introspector.md
Normal file
|
|
@@ -0,0 +1,133 @@
|
|||
# 0. Tree-Sitter Introspector - System Design
|
||||
|
||||
> **Priority**: 🔴 FOUNDATIONAL — Build this first
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 6-10 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Tree-Sitter Introspector parses Python and TypeScript codebases locally using tree-sitter AST parsing, extracting structured metadata (functions, classes, imports) **without** sending raw code to Claude. This saves tokens, is faster, and produces reliable structured data.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Introspector Module │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ File Scanner │───▶│ Tree-Sitter │───▶│ Summarizer │ │
|
||||
│ │ (glob/git) │ │ Parsers │ │ │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ ▼ ▼ ▼ │
|
||||
│ File list + Per-file AST RepoSummary │
|
||||
│ metadata extracts JSON │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface RepoSummary {
|
||||
languages: ('python' | 'typescript')[];
|
||||
root: string;
|
||||
analyzedAt: string;
|
||||
files: FileInfo[];
|
||||
modules: ModuleInfo[];
|
||||
config: ConfigInfo;
|
||||
git?: GitInfo;
|
||||
}
|
||||
|
||||
interface ModuleInfo {
|
||||
path: string;
|
||||
exports: ExportInfo[];
|
||||
imports: string[];
|
||||
complexity: 'low' | 'medium' | 'high';
|
||||
}
|
||||
|
||||
interface ExportInfo {
|
||||
name: string;
|
||||
kind: 'function' | 'class' | 'constant' | 'type';
|
||||
signature?: string;
|
||||
docstring?: string;
|
||||
lineNumber: number;
|
||||
isAsync?: boolean;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Implementation Details
|
||||
|
||||
### Tree-Sitter Queries (Python)
|
||||
|
||||
```typescript
|
||||
const FUNCTION_QUERY = `
|
||||
(function_definition
|
||||
name: (identifier) @name
|
||||
parameters: (parameters) @params
|
||||
return_type: (type)? @return_type
|
||||
) @func
|
||||
`;
|
||||
|
||||
const CLASS_QUERY = `
|
||||
(class_definition
|
||||
name: (identifier) @name
|
||||
body: (block) @body
|
||||
) @class
|
||||
`;
|
||||
```
|
||||
|
||||
### Git-Aware Incremental
|
||||
|
||||
```typescript
|
||||
async function getChangedFiles(since: string): Promise<string[]> {
|
||||
const { stdout } = await exec(`git diff --name-only ${since}`);
|
||||
return stdout.split('\n').filter(f => /\.(py|ts|tsx)$/.test(f));
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/introspector/
|
||||
├── index.ts # Main entry point
|
||||
├── types.ts # TypeScript interfaces
|
||||
├── scanner.ts # File discovery
|
||||
├── parsers/
|
||||
│ ├── python.ts # Python tree-sitter queries
|
||||
│ └── typescript.ts # TS tree-sitter queries
|
||||
├── git.ts # Git integration
|
||||
└── summarizer.ts # Combine into RepoSummary
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"tree-sitter": "^0.21.0",
|
||||
"tree-sitter-python": "^0.21.0",
|
||||
"tree-sitter-typescript": "^0.21.0",
|
||||
"glob": "^10.3.0"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Parses Python files (functions, classes, imports)
|
||||
- [ ] Parses TypeScript files (functions, classes, imports)
|
||||
- [ ] Handles 1000+ file repos in <10 seconds
|
||||
- [ ] Incremental mode only parses changed files
|
||||
- [ ] Gracefully handles syntax errors
|
||||
142
docs/01-codebase-analyzer-prompt.md
Normal file
142
docs/01-codebase-analyzer-prompt.md
Normal file
|
|
@@ -0,0 +1,142 @@
|
|||
# 1. Codebase Analyzer Prompt - System Design
|
||||
|
||||
> **Priority**: 🟡 HIGH — Core LLM logic
|
||||
> **Complexity**: High (prompt engineering)
|
||||
> **Effort Estimate**: 8-12 hours (iterative refinement)
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Codebase Analyzer takes structured `RepoSummary` from the introspector and generates `EvalSpec` JSON defining what tests to create. Key insight: **Claude generates specs, not code**. Test code is deterministically rendered from specs.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Codebase Analyzer Agent │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ RepoSummary │───▶│ Claude Agent │───▶│ EvalSpec │ │
|
||||
│ │ JSON │ │ SDK │ │ JSON │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │AskUserQuestion│ │
|
||||
│ │ (optional) │ │
|
||||
│ └──────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface EvalSpec {
|
||||
version: '1.0';
|
||||
repo: { name: string; languages: string[]; analyzedAt: string };
|
||||
scenarios: EvalScenario[];
|
||||
grading: {
|
||||
deterministic: DeterministicGrade[];
|
||||
rubrics: RubricGrade[];
|
||||
};
|
||||
metadata: {
|
||||
generatedBy: string;
|
||||
totalTokens: number;
|
||||
questionsAsked: number;
|
||||
confidence: 'low' | 'medium' | 'high';
|
||||
};
|
||||
}
|
||||
|
||||
interface EvalScenario {
|
||||
id: string; // "auth-login-success"
|
||||
name: string;
|
||||
description: string;
|
||||
target: {
|
||||
module: string;
|
||||
function: string;
|
||||
type: 'function' | 'method' | 'class';
|
||||
};
|
||||
category: 'unit' | 'integration' | 'edge-case' | 'negative';
|
||||
priority: 'critical' | 'high' | 'medium' | 'low';
|
||||
setup?: { fixtures: string[]; mocks: MockSpec[] };
|
||||
input: { args: Record<string, any>; kwargs?: Record<string, any> };
|
||||
assertions: Assertion[];
|
||||
tags: string[];
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Prompt Architecture (Three-Part)
|
||||
|
||||
### 1. System Prompt
|
||||
- Defines Claude's identity as codebase analyzer
|
||||
- Constraints: functional tests only, no syntax checks, ask don't assume
|
||||
|
||||
### 2. Developer Prompt
|
||||
- Contains EvalSpec JSON schema
|
||||
- Formatting rules (snake_case, kebab-case IDs)
|
||||
- Assertion type reference
|
||||
|
||||
### 3. User Prompt (Template)
|
||||
- Injects RepoSummary JSON
|
||||
- User context about what to evaluate
|
||||
- Instructions for output format
|
||||
|
||||
---
|
||||
|
||||
## Key Implementation
|
||||
|
||||
```typescript
|
||||
async function generateEvalSpec(options: GenerateOptions): Promise<EvalSpec> {
|
||||
const agentOptions: ClaudeAgentOptions = {
|
||||
systemPrompt: await loadPrompt('analyzer-system.md'),
|
||||
permissionMode: options.interactive ? 'default' : 'dontAsk',
|
||||
canUseTool: async ({ toolName, input }) => {
|
||||
if (toolName === 'AskUserQuestion' && options.onQuestion) {
|
||||
const answer = await options.onQuestion(input);
|
||||
return { behavior: 'allow', updatedInput: { ...input, answers: { [input.question]: answer } } };
|
||||
}
|
||||
return { behavior: 'deny' };
|
||||
},
|
||||
outputFormat: { type: 'json_schema', json_schema: { name: 'EvalSpec', schema: EVAL_SPEC_SCHEMA } },
|
||||
};
|
||||
|
||||
for await (const msg of query(prompt, agentOptions)) {
|
||||
if (msg.type === 'result') return msg.output as EvalSpec;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/analyzer/
|
||||
├── index.ts # Main entry point
|
||||
├── types.ts # EvalSpec types
|
||||
├── spec-generator.ts # Claude Agent SDK integration
|
||||
├── validator.ts # JSON schema validation
|
||||
└── prompt-builder.ts # Builds prompts from templates
|
||||
|
||||
prompts/
|
||||
├── analyzer-system.md
|
||||
├── analyzer-developer.md
|
||||
└── analyzer-user.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Generates valid EvalSpec JSON for Python repos
|
||||
- [ ] Generates valid EvalSpec JSON for TypeScript repos
|
||||
- [ ] Asks 2-3 clarifying questions on complex repos
|
||||
- [ ] <10k tokens per analysis
|
||||
- [ ] 100% assertion coverage (every scenario has assertions)
|
||||
159
docs/02-synchronous-claude-session.md
Normal file
159
docs/02-synchronous-claude-session.md
Normal file
|
|
@@ -0,0 +1,159 @@
|
|||
# 2. Synchronous Claude Session with Questions - System Design
|
||||
|
||||
> **Priority**: 🟡 HIGH — Interactive UX
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 4-6 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Handles **interactive communication** between Claude and the user during eval generation. When Claude calls `AskUserQuestion`, we display it in CLI, collect the answer, and return it to Claude.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Claude Session Manager │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Claude Agent │◀──────────────────▶│ Question │ │
|
||||
│ │ SDK │ AskUserQuestion │ Handler │ │
|
||||
│ └──────────────┘ └──────────────┘ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ Result CLI/stdin │
|
||||
│ (EvalSpec) (inquirer) │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Session Modes
|
||||
|
||||
| Mode | Usage | Behavior |
|
||||
|------|-------|----------|
|
||||
| `interactive` | Local dev | Full CLI prompts via inquirer |
|
||||
| `non-interactive` | CI/CD | Deny questions, use defaults |
|
||||
| `auto-answer` | Scripted | Use provided default answers |
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface Question {
|
||||
header: string;
|
||||
question: string;
|
||||
options?: QuestionOption[];
|
||||
multiSelect?: boolean;
|
||||
freeText?: boolean;
|
||||
defaultValue?: string;
|
||||
}
|
||||
|
||||
interface SessionOptions {
|
||||
interactive: boolean;
|
||||
defaultAnswers?: Record<string, string>;
|
||||
timeout?: number;
|
||||
}
|
||||
|
||||
type SessionMode = 'interactive' | 'non-interactive' | 'auto-answer';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Implementation
|
||||
|
||||
```typescript
|
||||
class ClaudeSession {
|
||||
async run<T>(systemPrompt: string, userPrompt: string, outputSchema?: object): Promise<T> {
|
||||
const agentOptions: ClaudeAgentOptions = {
|
||||
systemPrompt,
|
||||
permissionMode: this.getPermissionMode(),
|
||||
canUseTool: this.createToolHandler(),
|
||||
outputFormat: outputSchema ? { type: 'json_schema', json_schema: { name: 'Output', schema: outputSchema } } : undefined,
|
||||
};
|
||||
|
||||
for await (const msg of query(userPrompt, agentOptions)) {
|
||||
if (msg.type === 'result') return msg.output as T;
|
||||
}
|
||||
}
|
||||
|
||||
private async handleAskUserQuestion(input: any) {
|
||||
if (this.mode === 'non-interactive') {
|
||||
return { behavior: 'deny', message: 'Interactive questions not allowed in CI' };
|
||||
}
|
||||
|
||||
const answers: Record<string, string> = {};
|
||||
for (const question of input.questions) {
|
||||
answers[question.question] = await promptCLI(question);
|
||||
}
|
||||
return { behavior: 'allow', updatedInput: { questions: input.questions, answers } };
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## CLI Adapter (inquirer)
|
||||
|
||||
```typescript
|
||||
async function promptSelect(question: Question): Promise<string> {
|
||||
const { answer } = await inquirer.prompt([{
|
||||
type: 'list',
|
||||
name: 'answer',
|
||||
message: question.question,
|
||||
choices: question.options!.map(opt => ({ name: `${opt.label} - ${opt.description}`, value: opt.label })),
|
||||
}]);
|
||||
return answer;
|
||||
}
|
||||
```
|
||||
|
||||
**User sees:**
|
||||
```
|
||||
┌─ Priority ────────────────────────
|
||||
│ I found 47 utility functions. Which should I prioritize?
|
||||
|
||||
? Select an option:
|
||||
❯ all - Test all 47 functions
|
||||
top-10 - Focus on 10 most-used
|
||||
critical - Only critical path functions
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/session/
|
||||
├── index.ts # Main exports
|
||||
├── types.ts # TypeScript interfaces
|
||||
├── client.ts # Claude SDK wrapper
|
||||
├── question-handler.ts # AskUserQuestion logic
|
||||
├── cli-adapter.ts # Terminal UI (inquirer)
|
||||
├── modes.ts # Mode detection
|
||||
└── persistence.ts # Save/resume session
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"@anthropic-ai/claude-agent-sdk": "^0.1.0",
|
||||
"inquirer": "^9.2.0"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Interactive mode works in terminal
|
||||
- [ ] Non-interactive mode works in CI
|
||||
- [ ] Auto-answer mode uses provided defaults
|
||||
- [ ] Session state can be saved and resumed
|
||||
- [ ] Ctrl+C exits cleanly
|
||||
157
docs/03-test-renderers.md
Normal file
157
docs/03-test-renderers.md
Normal file
|
|
@@ -0,0 +1,157 @@
|
|||
# 3. Test Renderers - System Design
|
||||
|
||||
> **Priority**: 🟢 MEDIUM — Deterministic layer
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 8-12 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Test Renderers **deterministically transform** `EvalSpec` JSON into runnable test files. Key insight:
|
||||
- **Claude generates specs** (what to test, inputs, assertions)
|
||||
- **Renderers generate code** (deterministic, templated, no LLM)
|
||||
|
||||
This makes tests reliable, debuggable, and version-controllable.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Renderer Pipeline │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ EvalSpec │───▶│ Renderer │───▶│ Test Files │ │
|
||||
│ │ JSON │ │ (per-lang) │ │ (.py/.ts) │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │
|
||||
│ Supported: pytest (Python) | vitest (TS) | jest (TS) │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface RenderOptions {
|
||||
outputDir: string;
|
||||
framework: 'pytest' | 'vitest' | 'jest';
|
||||
includeFixtures: boolean;
|
||||
generateMocks: boolean;
|
||||
}
|
||||
|
||||
interface RenderResult {
|
||||
files: GeneratedFile[];
|
||||
stats: { scenarioCount: number; fileCount: number; assertionCount: number };
|
||||
}
|
||||
|
||||
interface GeneratedFile {
|
||||
path: string;
|
||||
content: string;
|
||||
scenarios: string[]; // Which scenario IDs
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Assertion Mapping
|
||||
|
||||
| EvalSpec Type | Python (pytest) | TypeScript (vitest) |
|
||||
|---------------|-----------------|---------------------|
|
||||
| `equals` | `assert result == expected` | `expect(result).toBe(expected)` |
|
||||
| `contains` | `assert key in result` | `expect(result).toContain(key)` |
|
||||
| `matches` | `assert re.match(pattern, result)` | `expect(result).toMatch(pattern)` |
|
||||
| `throws` | `pytest.raises(ExceptionType)` | `expect(() => fn()).toThrow()` |
|
||||
| `type` | `assert isinstance(result, Type)` | `expect(typeof result).toBe('type')` |
|
||||
|
||||
---
|
||||
|
||||
## Example Transformation
|
||||
|
||||
**EvalSpec scenario:**
|
||||
```json
|
||||
{
|
||||
"id": "auth-login-success",
|
||||
"target": { "module": "src/auth/login.py", "function": "login" },
|
||||
"input": { "args": { "username": "test", "password": "valid" } },
|
||||
"assertions": [
|
||||
{ "type": "type", "target": "return", "expected": "dict" },
|
||||
{ "type": "contains", "target": "return", "expected": "token" }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Generated pytest:**
|
||||
```python
|
||||
def test_auth_login_success():
|
||||
"""Verify login returns JWT on valid credentials"""
|
||||
result = login("test", "valid")
|
||||
assert isinstance(result, dict)
|
||||
assert "token" in result
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/renderers/
|
||||
├── index.ts # Registry + main export
|
||||
├── types.ts # Interfaces
|
||||
├── base.ts # Abstract base renderer
|
||||
├── python/
|
||||
│ ├── pytest-renderer.ts
|
||||
│ ├── assertions.ts
|
||||
│ └── templates/
|
||||
│ └── test-file.py.hbs
|
||||
├── typescript/
|
||||
│ ├── vitest-renderer.ts
|
||||
│ ├── jest-renderer.ts
|
||||
│ └── assertions.ts
|
||||
└── utils/
|
||||
└── template-engine.ts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Incremental Rendering
|
||||
|
||||
```typescript
|
||||
async function renderIncremental(
|
||||
spec: EvalSpec,
|
||||
options: RenderOptions,
|
||||
changedFiles: string[]
|
||||
): Promise<RenderResult> {
|
||||
const filteredSpec = {
|
||||
...spec,
|
||||
scenarios: spec.scenarios.filter(s =>
|
||||
changedFiles.some(f => s.target.module.includes(f))
|
||||
),
|
||||
};
|
||||
return renderSpec(filteredSpec, options);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"handlebars": "^4.7.8"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Pytest renderer generates valid Python test files
|
||||
- [ ] Vitest renderer generates valid TypeScript test files
|
||||
- [ ] Generated tests pass linting
|
||||
- [ ] All assertion types are supported
|
||||
- [ ] Mocks and fixtures correctly generated
|
||||
- [ ] Incremental rendering works
|
||||
269
docs/04-functional-test-execution.md
Normal file
269
docs/04-functional-test-execution.md
Normal file
|
|
@@ -0,0 +1,269 @@
|
|||
# 4. Functional Test Execution & Grading - System Design
|
||||
|
||||
> **Priority**: 🟢 MEDIUM — Runtime layer
|
||||
> **Complexity**: Medium-High
|
||||
> **Effort Estimate**: 6-10 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Executes generated tests in a **sandboxed environment** and produces structured results. Tests run in isolation to prevent accidental side effects. Results feed into Promptfoo for aggregation and UI.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Test Execution Pipeline │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Test Files │───▶│ Sandbox │───▶│ Results │ │
|
||||
│ │ (.py/.ts) │ │ Runner │ │ JSON │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ pytest/ │ │ Promptfoo │ │
|
||||
│ │ vitest │ │ Integration │ │
|
||||
│ └──────────────┘ └──────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface ExecutionOptions {
|
||||
framework: 'pytest' | 'vitest' | 'jest';
|
||||
sandbox: boolean;
|
||||
timeout: number; // ms per test
|
||||
parallel: boolean;
|
||||
filter?: string[]; // Run specific test IDs
|
||||
}
|
||||
|
||||
interface ExecutionResult {
|
||||
summary: {
|
||||
total: number;
|
||||
passed: number;
|
||||
failed: number;
|
||||
skipped: number;
|
||||
duration: number;
|
||||
};
|
||||
tests: TestResult[];
|
||||
errors: string[];
|
||||
}
|
||||
|
||||
interface TestResult {
|
||||
id: string; // Maps to EvalScenario.id
|
||||
name: string;
|
||||
status: 'passed' | 'failed' | 'skipped' | 'error';
|
||||
duration: number;
|
||||
assertions: {
|
||||
passed: number;
|
||||
failed: number;
|
||||
details: AssertionResult[];
|
||||
};
|
||||
error?: { message: string; stack?: string };
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sandbox Configuration
|
||||
|
||||
```typescript
|
||||
const SANDBOX_CONFIG = {
|
||||
enabled: true,
|
||||
autoAllowBashIfSandboxed: true,
|
||||
network: {
|
||||
allowLocalBinding: true,
|
||||
allowOutbound: false, // No external network
|
||||
},
|
||||
filesystem: {
|
||||
readOnly: ['/'],
|
||||
writable: ['/tmp', './test-output'],
|
||||
},
|
||||
env: {
|
||||
inherit: ['PATH', 'HOME'],
|
||||
set: { CI: 'true', NODE_ENV: 'test' },
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Runner Implementations
|
||||
|
||||
### Pytest Runner
|
||||
|
||||
```typescript
|
||||
async function runPytest(testDir: string, options: ExecutionOptions): Promise<ExecutionResult> {
|
||||
const args = [
|
||||
'-v',
|
||||
'--tb=short',
|
||||
'--json-report',
|
||||
'--json-report-file=results.json',
|
||||
options.parallel ? '-n auto' : '',
|
||||
options.filter?.map(f => `-k ${f}`).join(' ') || '',
|
||||
].filter(Boolean);
|
||||
|
||||
const { exitCode, stdout, stderr } = await exec(
|
||||
`pytest ${args.join(' ')} ${testDir}`,
|
||||
{ timeout: options.timeout, cwd: testDir }
|
||||
);
|
||||
|
||||
const report = JSON.parse(await fs.readFile('results.json', 'utf-8'));
|
||||
return parseJsonReport(report);
|
||||
}
|
||||
```
|
||||
|
||||
### Vitest Runner
|
||||
|
||||
```typescript
|
||||
async function runVitest(testDir: string, options: ExecutionOptions): Promise<ExecutionResult> {
|
||||
const args = [
|
||||
'run',
|
||||
'--reporter=json',
|
||||
'--outputFile=results.json',
|
||||
options.filter?.length ? `--testNamePattern="${options.filter.join('|')}"` : '',
|
||||
].filter(Boolean);
|
||||
|
||||
const { exitCode } = await exec(
|
||||
`npx vitest ${args.join(' ')}`,
|
||||
{ timeout: options.timeout, cwd: testDir }
|
||||
);
|
||||
|
||||
const report = JSON.parse(await fs.readFile('results.json', 'utf-8'));
|
||||
return parseVitestReport(report);
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Promptfoo Integration
|
||||
|
||||
### Custom Provider (`providers/test-runner.py`)
|
||||
|
||||
```python
|
||||
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||
"""Runs tests and returns structured results."""
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
test_dir = options.get('test_dir', './tests')
|
||||
framework = options.get('framework', 'pytest')
|
||||
|
||||
if framework == 'pytest':
|
||||
result = subprocess.run(
|
||||
['pytest', '--json-report', '--json-report-file=/tmp/results.json', test_dir],
|
||||
capture_output=True, text=True, timeout=300
|
||||
)
|
||||
with open('/tmp/results.json') as f:
|
||||
report = json.load(f)
|
||||
|
||||
return {
|
||||
'output': json.dumps({
|
||||
'passed': report['summary']['passed'],
|
||||
'failed': report['summary']['failed'],
|
||||
'tests': report['tests'],
|
||||
}),
|
||||
'error': None,
|
||||
}
|
||||
```
|
||||
|
||||
### Promptfoo Config
|
||||
|
||||
```yaml
|
||||
providers:
|
||||
- id: file://providers/test-runner.py
|
||||
label: functional-tests
|
||||
config:
|
||||
test_dir: .evaluclaude/tests
|
||||
framework: pytest
|
||||
timeout: 300
|
||||
|
||||
tests:
|
||||
- vars:
|
||||
scenario_id: auth-login-success
|
||||
assert:
|
||||
- type: python
|
||||
value: |
|
||||
import json
|
||||
result = json.loads(output)
|
||||
result['passed'] > 0 and result['failed'] == 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/runners/
|
||||
├── index.ts # Main entry + registry
|
||||
├── types.ts # Interfaces
|
||||
├── sandbox.ts # Isolation wrapper
|
||||
├── pytest-runner.ts # Python test execution
|
||||
├── vitest-runner.ts # Vitest execution
|
||||
├── jest-runner.ts # Jest execution
|
||||
└── result-parser.ts # Normalize results
|
||||
|
||||
providers/
|
||||
└── test-runner.py # Promptfoo provider
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Result Parsing
|
||||
|
||||
```typescript
|
||||
function parseJsonReport(report: any): ExecutionResult {
|
||||
return {
|
||||
summary: {
|
||||
total: report.summary.total,
|
||||
passed: report.summary.passed,
|
||||
failed: report.summary.failed,
|
||||
skipped: report.summary.skipped || 0,
|
||||
duration: report.duration,
|
||||
},
|
||||
tests: report.tests.map((t: any) => ({
|
||||
id: extractScenarioId(t.nodeid),
|
||||
name: t.nodeid,
|
||||
status: t.outcome,
|
||||
duration: t.call?.duration || 0,
|
||||
assertions: { passed: 0, failed: 0, details: [] },
|
||||
error: t.call?.crash ? { message: t.call.crash.message } : undefined,
|
||||
})),
|
||||
errors: [],
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"dependencies": {}
|
||||
}
|
||||
```
|
||||
|
||||
**Test framework deps (installed in target repo):**
|
||||
- `pytest`, `pytest-json-report`, `pytest-xdist` (Python)
|
||||
- `vitest` (TypeScript)
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Pytest tests run and produce JSON results
|
||||
- [ ] Vitest tests run and produce JSON results
|
||||
- [ ] Sandbox prevents network/filesystem escape
|
||||
- [ ] Results map back to EvalScenario IDs
|
||||
- [ ] Promptfoo integration works
|
||||
- [ ] Parallel execution supported
|
||||
305
docs/05-llm-rubric-graders.md
Normal file
305
docs/05-llm-rubric-graders.md
Normal file
|
|
@@ -0,0 +1,305 @@
|
|||
# 5. LLM Rubric Graders - System Design
|
||||
|
||||
> **Priority**: 🟢 MEDIUM — Subjective quality layer
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 4-6 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
LLM Rubric Graders use Claude to evaluate **subjective quality** that deterministic tests can't measure:
|
||||
- Code readability
|
||||
- Error message helpfulness
|
||||
- Documentation quality
|
||||
- API design consistency
|
||||
|
||||
These complement functional tests with human-like judgment.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ LLM Grading Pipeline │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Output │───▶│ Rubric │───▶│ Grading │ │
|
||||
│ │ (code/ │ │ + Claude │ │ Result │ │
|
||||
│ │ text) │ │ │ │ │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │
|
||||
│ Uses Promptfoo │
|
||||
│ llm-rubric assertion │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface Rubric {
|
||||
name: string;
|
||||
description: string;
|
||||
criteria: RubricCriterion[];
|
||||
passingThreshold: number; // 0-1
|
||||
}
|
||||
|
||||
interface RubricCriterion {
|
||||
name: string;
|
||||
description: string;
|
||||
weight: number; // Relative weight
|
||||
examples?: {
|
||||
good: string;
|
||||
bad: string;
|
||||
};
|
||||
}
|
||||
|
||||
interface RubricGradingResult {
|
||||
pass: boolean;
|
||||
score: number; // 0-1
|
||||
reason: string;
|
||||
criterionScores: {
|
||||
name: string;
|
||||
score: number;
|
||||
feedback: string;
|
||||
}[];
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rubric Examples
|
||||
|
||||
### Code Quality Rubric (`rubrics/code-quality.yaml`)
|
||||
|
||||
```yaml
|
||||
name: code-quality
|
||||
description: Evaluates generated code for quality and maintainability
|
||||
passingThreshold: 0.7
|
||||
|
||||
criteria:
|
||||
- name: readability
|
||||
weight: 0.3
|
||||
description: Code is easy to read and understand
|
||||
examples:
|
||||
good: "Clear variable names, logical flow, proper indentation"
|
||||
bad: "Single-letter variables, deeply nested logic, inconsistent style"
|
||||
|
||||
- name: correctness
|
||||
weight: 0.4
|
||||
description: Code correctly implements the intended behavior
|
||||
examples:
|
||||
good: "Handles edge cases, correct algorithm, proper error handling"
|
||||
bad: "Missing edge cases, off-by-one errors, swallowed exceptions"
|
||||
|
||||
- name: efficiency
|
||||
weight: 0.2
|
||||
description: Code uses appropriate data structures and algorithms
|
||||
examples:
|
||||
good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
|
||||
bad: "O(n²) when O(n) is possible, creates objects in tight loops"
|
||||
|
||||
- name: maintainability
|
||||
weight: 0.1
|
||||
description: Code is easy to modify and extend
|
||||
examples:
|
||||
good: "Single responsibility, low coupling, clear interfaces"
|
||||
bad: "God functions, tight coupling, magic numbers"
|
||||
```
|
||||
|
||||
### Error Messages Rubric (`rubrics/error-messages.yaml`)
|
||||
|
||||
```yaml
|
||||
name: error-messages
|
||||
description: Evaluates quality of error messages
|
||||
passingThreshold: 0.6
|
||||
|
||||
criteria:
|
||||
- name: clarity
|
||||
weight: 0.4
|
||||
description: Error message clearly explains what went wrong
|
||||
|
||||
- name: actionability
|
||||
weight: 0.4
|
||||
description: Error message suggests how to fix the problem
|
||||
|
||||
- name: context
|
||||
weight: 0.2
|
||||
description: Error message includes relevant context (file, line, values)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Promptfoo Integration
|
||||
|
||||
### Using `llm-rubric` Assertion
|
||||
|
||||
```yaml
|
||||
# promptfooconfig.yaml
|
||||
tests:
|
||||
- vars:
|
||||
code_output: "{{generated_code}}"
|
||||
assert:
|
||||
- type: llm-rubric
|
||||
value: |
|
||||
Evaluate this code for quality:
|
||||
|
||||
{{code_output}}
|
||||
|
||||
Score on:
|
||||
1. Readability (0-10)
|
||||
2. Correctness (0-10)
|
||||
3. Efficiency (0-10)
|
||||
4. Maintainability (0-10)
|
||||
|
||||
Provide overall score and specific feedback.
|
||||
threshold: 0.7
|
||||
```
|
||||
|
||||
### Custom Python Grader
|
||||
|
||||
```python
|
||||
# graders/rubric_grader.py
|
||||
import json
|
||||
from anthropic import Anthropic
|
||||
|
||||
def get_assert(output: str, context: dict) -> dict:
|
||||
"""Grade output using LLM rubric."""
|
||||
rubric = context.get('config', {}).get('rubric', 'code-quality')
|
||||
rubric_def = load_rubric(rubric)
|
||||
|
||||
client = Anthropic()
|
||||
|
||||
prompt = f"""
|
||||
You are evaluating code quality against this rubric:
|
||||
|
||||
{json.dumps(rubric_def, indent=2)}
|
||||
|
||||
Code to evaluate:
|
||||
```
|
||||
{output}
|
||||
```
|
||||
|
||||
For each criterion, provide:
|
||||
1. Score (0-1)
|
||||
2. Brief feedback
|
||||
|
||||
Return JSON:
|
||||
{{
|
||||
"scores": {{"criterion_name": {{"score": 0.8, "feedback": "..."}}}},
|
||||
"overall": 0.75,
|
||||
"summary": "..."
|
||||
}}
|
||||
"""
|
||||
|
||||
response = client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=1024,
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
)
|
||||
|
||||
result = json.loads(response.content[0].text)
|
||||
|
||||
return {
|
||||
"pass": result["overall"] >= rubric_def["passingThreshold"],
|
||||
"score": result["overall"],
|
||||
"reason": result["summary"],
|
||||
"namedScores": {k: v["score"] for k, v in result["scores"].items()},
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Calibration
|
||||
|
||||
LLM graders need calibration to ensure consistency:
|
||||
|
||||
```typescript
|
||||
interface CalibrationSet {
|
||||
rubric: string;
|
||||
examples: CalibrationExample[];
|
||||
}
|
||||
|
||||
interface CalibrationExample {
|
||||
input: string;
|
||||
expectedScore: number;
|
||||
expectedFeedback: string[];
|
||||
}
|
||||
|
||||
async function calibrate(rubric: Rubric, examples: CalibrationExample[]): Promise<CalibrationResult> {
|
||||
const results = await Promise.all(
|
||||
examples.map(ex => gradeWithRubric(ex.input, rubric))
|
||||
);
|
||||
|
||||
const agreement = results.filter((r, i) =>
|
||||
Math.abs(r.score - examples[i].expectedScore) < 0.1
|
||||
).length / results.length;
|
||||
|
||||
return {
|
||||
agreement,
|
||||
drift: results.map((r, i) => r.score - examples[i].expectedScore),
|
||||
needsAdjustment: agreement < 0.8,
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/graders/
|
||||
├── llm/
|
||||
│ ├── index.ts # Main entry
|
||||
│ ├── provider.ts # Promptfoo custom provider
|
||||
│ ├── rubric-loader.ts # Load YAML rubrics
|
||||
│ └── grader.ts # Core grading logic
|
||||
└── calibration/
|
||||
├── calibrator.ts # Calibration runner
|
||||
└── examples/ # Calibration datasets
|
||||
|
||||
rubrics/
|
||||
├── code-quality.yaml
|
||||
├── error-messages.yaml
|
||||
├── documentation.yaml
|
||||
└── api-design.yaml
|
||||
|
||||
graders/
|
||||
└── rubric_grader.py # Python grader for Promptfoo
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## When to Use LLM vs Deterministic
|
||||
|
||||
| Use LLM Graders | Use Deterministic |
|
||||
|-----------------|-------------------|
|
||||
| Subjective quality | Pass/fail assertions |
|
||||
| Style/readability | Type checking |
|
||||
| Helpfulness | Value equality |
|
||||
| Consistency | Error presence |
|
||||
| User experience | Performance thresholds |
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"js-yaml": "^4.1.0"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Rubrics load from YAML files
|
||||
- [ ] LLM grader produces consistent scores
|
||||
- [ ] Calibration detects drift
|
||||
- [ ] Integrates with Promptfoo `llm-rubric`
|
||||
- [ ] Custom Python grader works
|
||||
- [ ] >80% agreement with human judgment
|
||||
364
docs/06-observability-tracing.md
Normal file
364
docs/06-observability-tracing.md
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
# 6. Observability & Tracing - System Design
|
||||
|
||||
> **Priority**: 🟡 HIGH — Debugging is critical
|
||||
> **Complexity**: Medium
|
||||
> **Effort Estimate**: 4-6 hours
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Every eval run produces a **trace** capturing what Claude did and why. No black boxes. When a test fails, you can see:
|
||||
- What files Claude analyzed
|
||||
- What questions it asked
|
||||
- What specs it generated
|
||||
- The reasoning behind each decision
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Observability Pipeline │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Claude Agent │───▶│ Tracer │───▶│ Trace Store │ │
|
||||
│ │ Hooks │ │ (collector) │ │ (.json) │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │ Trace Viewer │ │
|
||||
│ │ (Promptfoo) │ │
|
||||
│ └──────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Core Types
|
||||
|
||||
```typescript
|
||||
interface EvalTrace {
|
||||
id: string; // UUID
|
||||
evalId: string; // Links to EvalSpec
|
||||
startedAt: string;
|
||||
completedAt: string;
|
||||
duration: number; // ms
|
||||
|
||||
status: 'success' | 'partial' | 'failed';
|
||||
|
||||
introspection: {
|
||||
filesAnalyzed: string[];
|
||||
totalFunctions: number;
|
||||
totalClasses: number;
|
||||
duration: number;
|
||||
};
|
||||
|
||||
analysis: {
|
||||
promptTokens: number;
|
||||
completionTokens: number;
|
||||
toolCalls: ToolCall[];
|
||||
questionsAsked: Question[];
|
||||
decisions: Decision[];
|
||||
};
|
||||
|
||||
generation: {
|
||||
scenariosGenerated: number;
|
||||
filesWritten: string[];
|
||||
};
|
||||
|
||||
execution: {
|
||||
testsPassed: number;
|
||||
testsFailed: number;
|
||||
testsSkipped: number;
|
||||
failures: TestFailure[];
|
||||
};
|
||||
|
||||
errors: TraceError[];
|
||||
}
|
||||
|
||||
interface ToolCall {
|
||||
timestamp: string;
|
||||
tool: string;
|
||||
input: any;
|
||||
output: any;
|
||||
duration: number;
|
||||
}
|
||||
|
||||
interface Decision {
|
||||
timestamp: string;
|
||||
type: 'include' | 'exclude' | 'prioritize' | 'question';
|
||||
subject: string; // What was decided about
|
||||
reasoning: string; // Why
|
||||
confidence: number; // 0-1
|
||||
}
|
||||
|
||||
interface TestFailure {
|
||||
scenarioId: string;
|
||||
error: string;
|
||||
stack?: string;
|
||||
expected?: any;
|
||||
actual?: any;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Hook-Based Collection
|
||||
|
||||
Use Claude Agent SDK hooks to capture everything:
|
||||
|
||||
```typescript
|
||||
import { ClaudeAgentOptions } from '@anthropic-ai/claude-agent-sdk';
|
||||
import { Tracer } from './tracer';
|
||||
|
||||
function createTracedOptions(tracer: Tracer): Partial<ClaudeAgentOptions> {
|
||||
return {
|
||||
hooks: {
|
||||
preToolUse: [{
|
||||
hooks: [async (input) => {
|
||||
tracer.recordToolStart(input.tool_name, input.tool_input);
|
||||
return { continue_: true };
|
||||
}]
|
||||
}],
|
||||
postToolUse: [{
|
||||
hooks: [async (input) => {
|
||||
tracer.recordToolEnd(input.tool_name, input.tool_output);
|
||||
return {};
|
||||
}]
|
||||
}],
|
||||
},
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Tracer Implementation
|
||||
|
||||
```typescript
|
||||
class Tracer {
|
||||
private trace: EvalTrace;
|
||||
private currentToolCall?: { name: string; input: any; startTime: number };
|
||||
|
||||
constructor(evalId: string) {
|
||||
this.trace = {
|
||||
id: crypto.randomUUID(),
|
||||
evalId,
|
||||
startedAt: new Date().toISOString(),
|
||||
completedAt: '',
|
||||
duration: 0,
|
||||
status: 'success',
|
||||
introspection: { filesAnalyzed: [], totalFunctions: 0, totalClasses: 0, duration: 0 },
|
||||
analysis: { promptTokens: 0, completionTokens: 0, toolCalls: [], questionsAsked: [], decisions: [] },
|
||||
generation: { scenariosGenerated: 0, filesWritten: [] },
|
||||
execution: { testsPassed: 0, testsFailed: 0, testsSkipped: 0, failures: [] },
|
||||
errors: [],
|
||||
};
|
||||
}
|
||||
|
||||
recordToolStart(name: string, input: any): void {
|
||||
this.currentToolCall = { name, input, startTime: Date.now() };
|
||||
}
|
||||
|
||||
recordToolEnd(name: string, output: any): void {
|
||||
if (this.currentToolCall?.name === name) {
|
||||
this.trace.analysis.toolCalls.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
tool: name,
|
||||
input: this.currentToolCall.input,
|
||||
output,
|
||||
duration: Date.now() - this.currentToolCall.startTime,
|
||||
});
|
||||
this.currentToolCall = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
recordQuestion(question: any, answer: string): void {
|
||||
this.trace.analysis.questionsAsked.push({
|
||||
...question,
|
||||
answer,
|
||||
timestamp: new Date().toISOString(),
|
||||
});
|
||||
}
|
||||
|
||||
recordDecision(type: Decision['type'], subject: string, reasoning: string, confidence: number): void {
|
||||
this.trace.analysis.decisions.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
type,
|
||||
subject,
|
||||
reasoning,
|
||||
confidence,
|
||||
});
|
||||
}
|
||||
|
||||
recordError(error: Error, context?: string): void {
|
||||
this.trace.errors.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
message: error.message,
|
||||
stack: error.stack,
|
||||
context,
|
||||
});
|
||||
this.trace.status = 'failed';
|
||||
}
|
||||
|
||||
finalize(): EvalTrace {
|
||||
this.trace.completedAt = new Date().toISOString();
|
||||
this.trace.duration = new Date(this.trace.completedAt).getTime() - new Date(this.trace.startedAt).getTime();
|
||||
return this.trace;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Trace Storage
|
||||
|
||||
```typescript
|
||||
const TRACES_DIR = '.evaluclaude/traces';
|
||||
|
||||
async function saveTrace(trace: EvalTrace): Promise<string> {
|
||||
await fs.mkdir(TRACES_DIR, { recursive: true });
|
||||
const filePath = path.join(TRACES_DIR, `${trace.id}.json`);
|
||||
await fs.writeFile(filePath, JSON.stringify(trace, null, 2));
|
||||
return filePath;
|
||||
}
|
||||
|
||||
async function loadTrace(traceId: string): Promise<EvalTrace> {
|
||||
const filePath = path.join(TRACES_DIR, `${traceId}.json`);
|
||||
const content = await fs.readFile(filePath, 'utf-8');
|
||||
return JSON.parse(content);
|
||||
}
|
||||
|
||||
async function listTraces(evalId?: string): Promise<EvalTrace[]> {
|
||||
const files = await fs.readdir(TRACES_DIR);
|
||||
const traces = await Promise.all(
|
||||
files.filter(f => f.endsWith('.json')).map(f => loadTrace(f.replace('.json', '')))
|
||||
);
|
||||
return evalId ? traces.filter(t => t.evalId === evalId) : traces;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Promptfoo Integration
|
||||
|
||||
Link traces to test results:
|
||||
|
||||
```yaml
# promptfooconfig.yaml
defaultTest:
  metadata:
    traceFile: .evaluclaude/traces/{{evalId}}.json
```
|
||||
|
||||
---
|
||||
|
||||
## Trace Viewer CLI
|
||||
|
||||
```typescript
|
||||
// src/cli/commands/view.ts
|
||||
import { Command } from 'commander';
|
||||
import { loadTrace, listTraces } from '../observability/trace-store';
|
||||
|
||||
export const viewCommand = new Command('view')
|
||||
.description('View eval trace')
|
||||
.argument('[trace-id]', 'Specific trace ID')
|
||||
.option('--last', 'View most recent trace')
|
||||
.option('--json', 'Output raw JSON')
|
||||
.action(async (traceId, options) => {
|
||||
let trace: EvalTrace;
|
||||
|
||||
if (options.last) {
|
||||
const traces = await listTraces();
|
||||
trace = traces.sort((a, b) =>
|
||||
new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime()
|
||||
)[0];
|
||||
} else {
|
||||
trace = await loadTrace(traceId);
|
||||
}
|
||||
|
||||
if (options.json) {
|
||||
console.log(JSON.stringify(trace, null, 2));
|
||||
} else {
|
||||
displayTrace(trace);
|
||||
}
|
||||
});
|
||||
|
||||
function displayTrace(trace: EvalTrace): void {
|
||||
console.log(`\n📊 Trace: ${trace.id}`);
|
||||
console.log(` Status: ${trace.status}`);
|
||||
console.log(` Duration: ${trace.duration}ms`);
|
||||
console.log(`\n📂 Introspection:`);
|
||||
console.log(` Files: ${trace.introspection.filesAnalyzed.length}`);
|
||||
console.log(` Functions: ${trace.introspection.totalFunctions}`);
|
||||
console.log(`\n🤖 Analysis:`);
|
||||
console.log(` Tool calls: ${trace.analysis.toolCalls.length}`);
|
||||
console.log(` Questions: ${trace.analysis.questionsAsked.length}`);
|
||||
console.log(` Decisions: ${trace.analysis.decisions.length}`);
|
||||
console.log(`\n🧪 Execution:`);
|
||||
console.log(` ✅ Passed: ${trace.execution.testsPassed}`);
|
||||
console.log(` ❌ Failed: ${trace.execution.testsFailed}`);
|
||||
|
||||
if (trace.execution.failures.length > 0) {
|
||||
console.log(`\n❌ Failures:`);
|
||||
trace.execution.failures.forEach(f => {
|
||||
console.log(` - ${f.scenarioId}: ${f.error}`);
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
src/observability/
|
||||
├── index.ts # Main exports
|
||||
├── tracer.ts # Hook-based collection
|
||||
├── trace-store.ts # Persist to filesystem
|
||||
├── trace-viewer.ts # Format for display
|
||||
└── types.ts # EvalTrace interface
|
||||
|
||||
.evaluclaude/
|
||||
└── traces/
|
||||
├── abc123.json
|
||||
├── def456.json
|
||||
└── ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What Gets Traced
|
||||
|
||||
| Phase | Captured |
|
||||
|-------|----------|
|
||||
| Introspection | Files parsed, functions/classes found, duration |
|
||||
| Analysis | Every tool call, questions asked, decisions made |
|
||||
| Generation | Scenarios created, files written |
|
||||
| Execution | Test results, failures with context |
|
||||
| Errors | Any exceptions with stack traces |
|
||||
|
||||
---
|
||||
|
||||
## Dependencies
|
||||
|
||||
```json
|
||||
{
|
||||
"dependencies": {}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [ ] Every eval run produces a trace
|
||||
- [ ] Traces capture all tool calls
|
||||
- [ ] Questions and answers are recorded
|
||||
- [ ] Test failures link to trace
|
||||
- [ ] CLI viewer displays traces clearly
|
||||
- [ ] Traces stored efficiently (<1MB each)
|
||||
Loading…
Add table
Add a link
Reference in a new issue