ui polish

This commit is contained in:
Harivansh Rathi 2026-01-11 20:38:57 -05:00
parent ff5300f4e0
commit 69c08c9d6b
12 changed files with 1430 additions and 308 deletions

202
README.md Normal file
View file

@ -0,0 +1,202 @@
# evaluclaude
> **Zero-to-evals in one command.** Claude analyzes your codebase and generates functional tests.
![Version](https://img.shields.io/badge/version-0.1.0-blue)
![Node](https://img.shields.io/badge/node-%3E%3D18.0.0-green)
![License](https://img.shields.io/badge/license-MIT-brightgreen)
## What is this?
**evaluclaude** is a CLI tool that uses Claude to understand your codebase and generate real, runnable functional tests. Unlike traditional test generators that produce boilerplate, evaluclaude:
- **Parses your code** with tree-sitter (no LLM tokens wasted on structure)
- **Asks smart questions** to understand your testing priorities
- **Generates specs, not code** — deterministic renderers create the actual tests
- **Full observability** — every run produces a trace you can inspect
## Quick Start
```bash
# Install
npm install -g evaluclaude-harness
# Run the full pipeline
evaluclaude pipeline .
# Or step by step
evaluclaude intro . # Introspect codebase
evaluclaude analyze . -o spec.json -i # Generate spec (interactive)
evaluclaude render spec.json # Create test files
evaluclaude run # Execute tests
```
## How It Works
```
┌─────────────────────────────────────────────────────────┐
│ evaluclaude pipeline │
├─────────────────────────────────────────────────────────┤
│ │
│ 1. INTROSPECT Parse code with tree-sitter │
│ 📂 → 📋 Extract functions, classes │
│ │
│ 2. ANALYZE Claude generates EvalSpec │
│ 📋 → 🧠 Asks clarifying questions │
│ │
│ 3. RENDER Deterministic code generation │
│ 🧠 → 📄 pytest / vitest / jest │
│ │
│ 4. RUN Execute in sandbox │
│ 📄 → 🧪 Collect results + traces │
│ │
└─────────────────────────────────────────────────────────┘
```
## Commands
### Core Pipeline
| Command | Description |
|---------|-------------|
| `pipeline [path]` | Run the full pipeline: introspect → analyze → render → run |
| `intro [path]` | Introspect codebase with tree-sitter |
| `analyze [path]` | Generate EvalSpec with Claude |
| `render <spec>` | Render EvalSpec to test files |
| `run [test-dir]` | Execute tests and collect results |
### Grading & Rubrics
| Command | Description |
|---------|-------------|
| `grade <input>` | Grade output using LLM rubric |
| `rubrics` | List available rubrics |
| `calibrate` | Calibrate rubric against examples |
### Observability
| Command | Description |
|---------|-------------|
| `view [trace-id]` | View trace details |
| `traces` | List all traces |
| `ui` | Launch Promptfoo dashboard |
| `eval` | Run Promptfoo evaluations |
## Examples
### Analyze a Python project interactively
```bash
evaluclaude analyze ./my-python-project -i -o spec.json
```
Claude will ask questions like:
- "I see 3 database models. Which is the core domain object?"
- "Found 47 utility functions. Want me to prioritize the most-used ones?"
### Focus on specific modules
```bash
evaluclaude pipeline . --focus auth,payments --max-scenarios 20
```
### View test results in browser
```bash
evaluclaude run --export-promptfoo
evaluclaude ui
```
### Skip steps in the pipeline
```bash
# Use existing spec, just run tests
evaluclaude pipeline . --skip-analyze --skip-render
# Generate tests without running
evaluclaude pipeline . --skip-run
```
## Configuration
### Environment Variables
| Variable | Description |
|----------|-------------|
| `ANTHROPIC_API_KEY` | Your Anthropic API key |
### Output Structure
```
.evaluclaude/
├── spec.json # Generated EvalSpec
├── traces/ # Execution traces
│ └── trace-xxx.json
├── results/ # Test results
│ └── run-xxx.json
└── promptfooconfig.yaml  # Promptfoo config (with --export-promptfoo)
```
## Rubrics
Create custom grading rubrics in YAML:
```yaml
# rubrics/my-rubric.yaml
name: my-rubric
description: Custom quality checks
passingThreshold: 0.7
criteria:
- name: correctness
description: Code produces correct results
weight: 0.5
- name: clarity
description: Code is clear and readable
weight: 0.3
- name: efficiency
description: Code is reasonably efficient
weight: 0.2
```
Use it:
```bash
evaluclaude grade output.txt -r my-rubric
```
## Architecture
evaluclaude follows key principles:
1. **Tree-sitter for introspection** — Never send raw code to Claude for structure extraction
2. **Claude generates specs, not code** — EvalSpec JSON is LLM output; test code is deterministic
3. **Functional tests only** — Every test must invoke actual code, no syntax checks
4. **Full observability** — Every eval run produces an inspectable trace
## Supported Languages
| Language | Parser | Test Framework |
|----------|--------|----------------|
| Python | tree-sitter-python | pytest |
| TypeScript | tree-sitter-typescript | vitest, jest |
| JavaScript | tree-sitter-typescript | vitest, jest |
## Development
```bash
# Build
npm run build
# Run in dev mode
npm run dev
# Run tests
npm test
# Type check
npm run typecheck
```
## License
MIT

View file

@ -3,6 +3,18 @@ import * as path from 'node:path';
import * as fs from 'node:fs/promises'; import * as fs from 'node:fs/promises';
import { analyze } from '../../introspector/index.js'; import { analyze } from '../../introspector/index.js';
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js'; import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
import {
style,
icons,
header,
step,
keyValue,
Spinner,
formatError,
nextSteps,
box,
BANNER_MINIMAL,
} from '../theme.js';
interface StructuredQuestion { interface StructuredQuestion {
questions: { questions: {
@ -19,7 +31,6 @@ interface StructuredQuestion {
async function handleQuestion(questionData: string): Promise<string> { async function handleQuestion(questionData: string): Promise<string> {
const { default: inquirer } = await import('inquirer'); const { default: inquirer } = await import('inquirer');
// Try to parse as structured question
let parsed: StructuredQuestion | null = null; let parsed: StructuredQuestion | null = null;
try { try {
parsed = JSON.parse(questionData); parsed = JSON.parse(questionData);
@ -31,29 +42,27 @@ async function handleQuestion(questionData: string): Promise<string> {
const answers: string[] = []; const answers: string[] = [];
for (const q of parsed.questions) { for (const q of parsed.questions) {
console.log(`\n🤖 ${q.header || 'Question'}:\n`); console.log(`\n${style.highlight(icons.brain)} ${style.bold(q.header || 'Question')}:\n`);
if (q.options && q.options.length > 0) { if (q.options && q.options.length > 0) {
// Render as selection
const choices = q.options.map(opt => ({ const choices = q.options.map(opt => ({
name: opt.description ? `${opt.label} - ${opt.description}` : opt.label, name: opt.description ? `${style.bold(opt.label)} ${style.dim('─')} ${opt.description}` : opt.label,
value: opt.label, value: opt.label,
})); }));
const { selection } = await inquirer.prompt([{ const { selection } = await inquirer.prompt([{
type: q.multiSelect ? 'checkbox' : 'list', type: q.multiSelect ? 'checkbox' : 'list',
name: 'selection', name: 'selection',
message: q.question, message: style.info(q.question),
choices, choices,
}]); }]);
answers.push(Array.isArray(selection) ? selection.join(', ') : selection); answers.push(Array.isArray(selection) ? selection.join(', ') : selection);
} else { } else {
// Plain text input
const { answer } = await inquirer.prompt([{ const { answer } = await inquirer.prompt([{
type: 'input', type: 'input',
name: 'answer', name: 'answer',
message: q.question, message: style.info(q.question),
}]); }]);
answers.push(answer); answers.push(answer);
} }
@ -66,7 +75,7 @@ async function handleQuestion(questionData: string): Promise<string> {
const { answer } = await inquirer.prompt([{ const { answer } = await inquirer.prompt([{
type: 'input', type: 'input',
name: 'answer', name: 'answer',
message: `🤖 Claude asks: ${questionData}`, message: `${style.highlight(icons.brain)} ${style.bold('Claude asks:')} ${questionData}`,
}]); }]);
return answer; return answer;
@ -80,20 +89,40 @@ export const analyzeCommand = new Command('analyze')
.option('--focus <modules>', 'Comma-separated list of modules/functions to focus on') .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
.option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10') .option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
.option('--quiet', 'Suppress progress messages') .option('--quiet', 'Suppress progress messages')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude analyze .')} ${style.dim('Analyze current directory')}
${style.command('evaluclaude analyze ./src -o spec.json')} ${style.dim('Save output to file')}
${style.command('evaluclaude analyze . -i')} ${style.dim('Interactive mode with questions')}
${style.command('evaluclaude analyze . --focus auth,api')} ${style.dim('Focus on specific modules')}
${style.command('evaluclaude analyze . --max-scenarios 20')} ${style.dim('Generate more scenarios')}
`)
.action(async (repoPath: string, options: AnalyzeOptions) => { .action(async (repoPath: string, options: AnalyzeOptions) => {
const absolutePath = path.resolve(repoPath); const absolutePath = path.resolve(repoPath);
const log = options.quiet ? () => {} : console.log; const quiet = options.quiet;
log(`\n🔬 Analyzing codebase: ${absolutePath}\n`); if (!quiet) {
console.log(`\n${BANNER_MINIMAL}\n`);
console.log(header('Analyze Codebase'));
console.log(keyValue('Path', style.path(absolutePath)));
console.log();
}
try { try {
log('Step 1: Running tree-sitter introspection...'); // Step 1: Tree-sitter introspection
const introSpinner = quiet ? null : new Spinner('Running tree-sitter introspection...');
introSpinner?.start();
const repoSummary = await analyze({ const repoSummary = await analyze({
root: absolutePath, root: absolutePath,
onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`), onProgress: quiet ? undefined : (msg) => introSpinner?.update(`Introspecting: ${msg}`),
}); });
log(`\nStep 2: Generating EvalSpec with Claude...\n`); introSpinner?.succeed('Tree-sitter introspection complete');
// Step 2: Claude analysis
const claudeSpinner = quiet ? null : new Spinner('Generating EvalSpec with Claude...');
claudeSpinner?.start();
const focus = options.focus?.split(',').map(s => s.trim()); const focus = options.focus?.split(',').map(s => s.trim());
const maxScenarios = parseInt(options.maxScenarios, 10); const maxScenarios = parseInt(options.maxScenarios, 10);
@ -101,6 +130,9 @@ export const analyzeCommand = new Command('analyze')
let result; let result;
if (options.interactive) { if (options.interactive) {
claudeSpinner?.stop();
console.log(`\n${style.info(icons.info)} ${style.bold('Interactive mode enabled')}\n`);
result = await generateEvalSpecInteractive( result = await generateEvalSpecInteractive(
repoSummary, repoSummary,
handleQuestion, handleQuestion,
@ -112,26 +144,51 @@ export const analyzeCommand = new Command('analyze')
focus, focus,
maxScenarios, maxScenarios,
}); });
claudeSpinner?.succeed('EvalSpec generated with Claude');
} }
const { spec, tokensUsed, questionsAsked } = result; const { spec, tokensUsed, questionsAsked } = result;
log('\n✅ EvalSpec generated successfully!'); // Results summary
log(` Scenarios: ${spec.scenarios.length}`); if (!quiet) {
log(` Tokens used: ${tokensUsed}`); console.log();
log(` Questions asked: ${questionsAsked}`); console.log(`${style.success(icons.success)} ${style.bold('EvalSpec generated successfully!')}`);
log(` Confidence: ${spec.metadata.confidence}`); console.log();
console.log(` ${style.primary(box.vertical)} ${keyValue('Scenarios', style.number(String(spec.scenarios.length)))}`);
console.log(` ${style.primary(box.vertical)} ${keyValue('Tokens used', style.number(String(tokensUsed)))}`);
console.log(` ${style.primary(box.vertical)} ${keyValue('Questions asked', style.number(String(questionsAsked)))}`);
console.log(` ${style.primary(box.vertical)} ${keyValue('Confidence', style.highlight(spec.metadata.confidence))}`);
}
const json = JSON.stringify(spec, null, 2); const json = JSON.stringify(spec, null, 2);
if (options.output) { if (options.output) {
await fs.writeFile(options.output, json); await fs.writeFile(options.output, json);
log(`\n📄 Written to: ${options.output}`); if (!quiet) {
console.log();
console.log(`${style.success(icons.success)} Written to: ${style.path(options.output)}`);
console.log(nextSteps([
{ command: `evaluclaude render ${options.output}`, description: 'Render tests from the spec' },
{ command: `evaluclaude pipeline . -o ./tests`, description: 'Run the full pipeline' },
]));
}
} else { } else {
console.log('\n' + json); console.log('\n' + json);
if (!quiet) {
console.log(nextSteps([
{ command: 'evaluclaude analyze . -o spec.json', description: 'Save the spec to a file' },
{ command: 'evaluclaude render spec.json', description: 'Then render tests from it' },
]));
}
} }
} catch (error) { } catch (error) {
console.error('\n❌ Error:', error instanceof Error ? error.message : error); const message = error instanceof Error ? error.message : String(error);
console.error(formatError(message, [
'Check that the path exists and contains source files',
'Ensure ANTHROPIC_API_KEY is set in your environment',
'Try running with --quiet to see raw errors',
'Use evaluclaude intro <path> to verify introspection works',
]));
process.exit(1); process.exit(1);
} }
}); });

View file

@ -2,6 +2,7 @@ import { Command } from 'commander';
import { readFileSync, existsSync } from 'fs'; import { readFileSync, existsSync } from 'fs';
import { gradeWithRubric, loadAllRubrics, analyzeCalibration, calibrate } from '../../graders/index.js'; import { gradeWithRubric, loadAllRubrics, analyzeCalibration, calibrate } from '../../graders/index.js';
import type { CalibrationExample } from '../../graders/types.js'; import type { CalibrationExample } from '../../graders/types.js';
import { style, icons, Spinner, formatError, progressBar, subheader, keyValue } from '../theme.js';
export const gradeCommand = new Command('grade') export const gradeCommand = new Command('grade')
.description('Grade output using LLM rubric') .description('Grade output using LLM rubric')
@ -9,6 +10,12 @@ export const gradeCommand = new Command('grade')
.option('-r, --rubric <name>', 'Rubric name or path', 'code-quality') .option('-r, --rubric <name>', 'Rubric name or path', 'code-quality')
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics') .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
.option('--json', 'Output result as JSON', false) .option('--json', 'Output result as JSON', false)
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude grade output.txt')} ${style.dim('Grade file with default rubric')}
${style.command('evaluclaude grade output.txt -r safety')} ${style.dim('Use specific rubric')}
${style.command('evaluclaude grade "inline text" --json')} ${style.dim('Grade string, output JSON')}
`)
.action(async (input: string, options) => { .action(async (input: string, options) => {
try { try {
let content: string; let content: string;
@ -19,29 +26,48 @@ export const gradeCommand = new Command('grade')
content = input; content = input;
} }
console.log(`Grading with rubric: ${options.rubric}`); const spinner = new Spinner(`Grading with rubric ${style.highlight(options.rubric)}...`);
spinner.start();
const result = await gradeWithRubric(content, options.rubric, { const result = await gradeWithRubric(content, options.rubric, {
rubricsDir: options.rubricsDir, rubricsDir: options.rubricsDir,
}); });
if (options.json) { if (options.json) {
spinner.stop();
console.log(JSON.stringify(result, null, 2)); console.log(JSON.stringify(result, null, 2));
return; return;
} }
console.log(`\n${result.pass ? '✅ PASS' : '❌ FAIL'}`); if (result.pass) {
console.log(`Score: ${(result.score * 100).toFixed(1)}%`); spinner.succeed(`Graded with rubric ${style.highlight(options.rubric)}`);
console.log(`\nSummary: ${result.reason}`); } else {
spinner.fail(`Graded with rubric ${style.highlight(options.rubric)}`);
}
console.log();
console.log(result.pass
? `${style.success(icons.passed)} ${style.bold(style.success('PASS'))}`
: `${style.error(icons.failed)} ${style.bold(style.error('FAIL'))}`);
console.log(keyValue('Score', style.number(`${(result.score * 100).toFixed(1)}%`)));
console.log();
console.log(keyValue('Summary', result.reason));
console.log('\nCriterion Scores:'); console.log(subheader('Criterion Scores'));
for (const cs of result.criterionScores) { for (const cs of result.criterionScores) {
const bar = '█'.repeat(Math.round(cs.score * 10)) + '░'.repeat(10 - Math.round(cs.score * 10)); const bar = progressBar(cs.score, 1, 20);
console.log(` ${cs.name}: ${bar} ${(cs.score * 100).toFixed(0)}%`); console.log(` ${style.bold(cs.name)}: ${bar}`);
console.log(` ${cs.feedback}`); console.log(` ${style.dim(cs.feedback)}`);
} }
} catch (error) { } catch (error) {
console.error('Error grading:', error instanceof Error ? error.message : error); console.error(formatError(
error instanceof Error ? error.message : String(error),
[
'Check that the rubric exists in the rubrics directory',
'Ensure ANTHROPIC_API_KEY is set',
`Run ${style.command('evaluclaude rubrics')} to list available rubrics`,
]
));
process.exit(1); process.exit(1);
} }
}); });
@ -49,26 +75,44 @@ export const gradeCommand = new Command('grade')
export const listRubricsCommand = new Command('rubrics') export const listRubricsCommand = new Command('rubrics')
.description('List available rubrics') .description('List available rubrics')
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics') .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude rubrics')} ${style.dim('List all rubrics')}
${style.command('evaluclaude rubrics --rubrics-dir ./my-rubrics')} ${style.dim('Use custom directory')}
`)
.action(async (options) => { .action(async (options) => {
try { try {
const rubrics = loadAllRubrics(options.rubricsDir); const rubrics = loadAllRubrics(options.rubricsDir);
if (rubrics.size === 0) { if (rubrics.size === 0) {
console.log(`No rubrics found in ${options.rubricsDir}`); console.log(formatError(
`No rubrics found in ${style.path(options.rubricsDir)}`,
[
'Create rubric YAML files in the rubrics directory',
'Use --rubrics-dir to specify a different location',
]
));
return; return;
} }
console.log(`Available rubrics (${rubrics.size}):\n`); console.log(subheader(`Available Rubrics (${style.number(String(rubrics.size))})`));
console.log();
for (const [name, rubric] of rubrics) { for (const [name, rubric] of rubrics) {
console.log(`📋 ${name}`); console.log(`${icons.spec} ${style.bold(style.primary(name))}`);
console.log(` ${rubric.description}`); console.log(keyValue('Description', rubric.description, 1));
console.log(` Threshold: ${(rubric.passingThreshold * 100).toFixed(0)}%`); console.log(keyValue('Threshold', style.number(`${(rubric.passingThreshold * 100).toFixed(0)}%`), 1));
console.log(` Criteria: ${rubric.criteria.map(c => c.name).join(', ')}`); console.log(keyValue('Criteria', rubric.criteria.map(c => style.highlight(c.name)).join(', '), 1));
console.log(''); console.log();
} }
} catch (error) { } catch (error) {
console.error('Error listing rubrics:', error instanceof Error ? error.message : error); console.error(formatError(
error instanceof Error ? error.message : String(error),
[
'Check that the rubrics directory exists',
'Ensure rubric files are valid YAML',
]
));
process.exit(1); process.exit(1);
} }
}); });
@ -78,24 +122,49 @@ export const calibrateCommand = new Command('calibrate')
.argument('<rubric>', 'Rubric name or path') .argument('<rubric>', 'Rubric name or path')
.argument('<examples>', 'Path to calibration examples JSON') .argument('<examples>', 'Path to calibration examples JSON')
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics') .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude calibrate code-quality examples.json')} ${style.dim('Calibrate with examples')}
${style.bold('Examples file format:')}
${style.dim('[')}
${style.dim('{ "content": "...", "expectedPass": true, "expectedScore": 0.8 },')}
${style.dim('{ "content": "...", "expectedPass": false }')}
${style.dim(']')}
`)
.action(async (rubricName: string, examplesPath: string, options) => { .action(async (rubricName: string, examplesPath: string, options) => {
try { try {
if (!existsSync(examplesPath)) { if (!existsSync(examplesPath)) {
console.error(`Examples file not found: ${examplesPath}`); console.error(formatError(
`Examples file not found: ${style.path(examplesPath)}`,
[
'Check that the file path is correct',
'Ensure the file exists and is readable',
]
));
process.exit(1); process.exit(1);
} }
const examples: CalibrationExample[] = JSON.parse(readFileSync(examplesPath, 'utf-8')); const examples: CalibrationExample[] = JSON.parse(readFileSync(examplesPath, 'utf-8'));
console.log(`Calibrating rubric '${rubricName}' with ${examples.length} examples...`); const spinner = new Spinner(`Calibrating rubric ${style.highlight(rubricName)} with ${style.number(String(examples.length))} examples...`);
spinner.start();
const result = await calibrate(rubricName, examples, { const result = await calibrate(rubricName, examples, {
rubricsDir: options.rubricsDir, rubricsDir: options.rubricsDir,
}); });
spinner.succeed(`Calibration complete for ${style.highlight(rubricName)}`);
console.log('\n' + analyzeCalibration(result)); console.log('\n' + analyzeCalibration(result));
} catch (error) { } catch (error) {
console.error('Error calibrating:', error instanceof Error ? error.message : error); console.error(formatError(
error instanceof Error ? error.message : String(error),
[
'Check that the rubric exists',
'Ensure the examples file is valid JSON',
'Ensure ANTHROPIC_API_KEY is set',
]
));
process.exit(1); process.exit(1);
} }
}); });

View file

@ -1,6 +1,7 @@
import { Command } from 'commander'; import { Command } from 'commander';
import * as path from 'node:path'; import * as path from 'node:path';
import { analyze, treeToString } from '../../introspector/index.js'; import { analyze, treeToString } from '../../introspector/index.js';
import { style, icons, header, subheader, keyValue, Spinner, formatError, nextSteps, box } from '../theme.js';
export const introCommand = new Command('intro') export const introCommand = new Command('intro')
.description('Introspect a codebase and output its structure (tree-sitter analysis)') .description('Introspect a codebase and output its structure (tree-sitter analysis)')
@ -9,21 +10,35 @@ export const introCommand = new Command('intro')
.option('--json', 'Output as JSON (default)') .option('--json', 'Output as JSON (default)')
.option('--summary', 'Output a human-readable summary instead of JSON') .option('--summary', 'Output a human-readable summary instead of JSON')
.option('--tree', 'Show file tree structure') .option('--tree', 'Show file tree structure')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude intro')} ${style.dim('Analyze current directory')}
${style.command('evaluclaude intro ./my-project')} ${style.dim('Analyze specific path')}
${style.command('evaluclaude intro . --summary')} ${style.dim('Human-readable summary')}
${style.command('evaluclaude intro . --tree')} ${style.dim('Show file tree')}
${style.command('evaluclaude intro . -o out.json')} ${style.dim('Save to file')}
`)
.action(async (repoPath: string, options: { output?: string; json?: boolean; summary?: boolean; tree?: boolean }) => { .action(async (repoPath: string, options: { output?: string; json?: boolean; summary?: boolean; tree?: boolean }) => {
const absolutePath = path.resolve(repoPath); const absolutePath = path.resolve(repoPath);
console.log(`\n🔍 Analyzing: ${absolutePath}\n`); console.log(header('Introspecting Codebase'));
console.log(keyValue('Path', style.path(absolutePath)));
console.log('');
const spinner = new Spinner('Analyzing codebase with tree-sitter...');
spinner.start();
try { try {
const summary = await analyze({ const summary = await analyze({
root: absolutePath, root: absolutePath,
onProgress: (msg) => console.log(` ${msg}`), onProgress: (msg) => spinner.update(msg),
}); });
spinner.succeed('Analysis complete');
console.log(''); console.log('');
if (options.tree && summary.tree) { if (options.tree && summary.tree) {
console.log('📁 File Tree:\n'); console.log(subheader(`${icons.folder} File Tree`));
console.log(treeToString(summary.tree)); console.log(treeToString(summary.tree));
console.log(''); console.log('');
} else if (options.summary) { } else if (options.summary) {
@ -34,85 +49,96 @@ export const introCommand = new Command('intro')
if (options.output) { if (options.output) {
const fs = await import('node:fs/promises'); const fs = await import('node:fs/promises');
await fs.writeFile(options.output, json); await fs.writeFile(options.output, json);
console.log(`📄 Written to: ${options.output}`); console.log(`${style.success(icons.success)} Written to: ${style.path(options.output)}`);
} else { } else {
console.log(json); console.log(json);
} }
} }
console.log(nextSteps([
{ command: 'evaluclaude analyze .', description: 'Generate EvalSpec with Claude' },
{ command: 'evaluclaude intro . --summary', description: 'View human-readable summary' },
]));
} catch (error) { } catch (error) {
console.error('❌ Error analyzing repository:', error); spinner.fail('Analysis failed');
console.error(formatError(
error instanceof Error ? error.message : 'Unknown error analyzing repository',
[
'Check that the path exists and is accessible',
'Ensure the directory contains source files',
'Try running with --tree to see the file structure',
]
));
process.exit(1); process.exit(1);
} }
}); });
function printHumanSummary(summary: import('../../introspector/types.js').RepoSummary): void { function printHumanSummary(summary: import('../../introspector/types.js').RepoSummary): void {
console.log('📊 Repository Summary'); console.log(subheader(`${icons.trace} Repository Summary`));
console.log('─'.repeat(50)); console.log(keyValue('Root', style.path(summary.root)));
console.log(`📁 Root: ${summary.root}`); console.log(keyValue('Analyzed', summary.analyzedAt));
console.log(`🗓️ Analyzed: ${summary.analyzedAt}`); console.log(keyValue('Languages', summary.languages.join(', ') || style.muted('none detected')));
console.log(`🔤 Languages: ${summary.languages.join(', ') || 'none detected'}`);
console.log('\n📂 Files:'); console.log(subheader(`${icons.folder} Files`));
console.log(` Total: ${summary.files.length}`); console.log(keyValue('Total', style.number(String(summary.files.length)), 1));
console.log(` Source: ${summary.files.filter(f => f.role === 'source').length}`); console.log(keyValue('Source', style.number(String(summary.files.filter(f => f.role === 'source').length)), 1));
console.log(` Test: ${summary.files.filter(f => f.role === 'test').length}`); console.log(keyValue('Test', style.number(String(summary.files.filter(f => f.role === 'test').length)), 1));
console.log(` Config: ${summary.files.filter(f => f.role === 'config').length}`); console.log(keyValue('Config', style.number(String(summary.files.filter(f => f.role === 'config').length)), 1));
console.log('\n📦 Modules:'); console.log(subheader(`${icons.code} Modules`));
console.log(` Total: ${summary.modules.length}`); console.log(keyValue('Total', style.number(String(summary.modules.length)), 1));
const totalExports = summary.modules.reduce((sum, m) => sum + m.exports.length, 0); const totalExports = summary.modules.reduce((sum, m) => sum + m.exports.length, 0);
const functions = summary.modules.flatMap(m => m.exports.filter(e => e.kind === 'function')); const functions = summary.modules.flatMap(m => m.exports.filter(e => e.kind === 'function'));
const classes = summary.modules.flatMap(m => m.exports.filter(e => e.kind === 'class')); const classes = summary.modules.flatMap(m => m.exports.filter(e => e.kind === 'class'));
console.log(` Functions: ${functions.length}`); console.log(keyValue('Functions', style.number(String(functions.length)), 1));
console.log(` Classes: ${classes.length}`); console.log(keyValue('Classes', style.number(String(classes.length)), 1));
console.log(` Total exports: ${totalExports}`); console.log(keyValue('Total exports', style.number(String(totalExports)), 1));
if (summary.config.python) { if (summary.config.python) {
console.log('\n🐍 Python:'); console.log(subheader(`${icons.python} Python`));
console.log(` Test framework: ${summary.config.python.testFramework}`); console.log(keyValue('Test framework', summary.config.python.testFramework, 1));
console.log(` pyproject.toml: ${summary.config.python.pyprojectToml ? '✓' : '✗'}`); console.log(keyValue('pyproject.toml', summary.config.python.pyprojectToml ? style.success(icons.success) : style.error(icons.error), 1));
console.log(` setup.py: ${summary.config.python.setupPy ? '✓' : '✗'}`); console.log(keyValue('setup.py', summary.config.python.setupPy ? style.success(icons.success) : style.error(icons.error), 1));
} }
if (summary.config.typescript) { if (summary.config.typescript) {
console.log('\n📘 TypeScript:'); console.log(subheader(`${icons.typescript} TypeScript`));
console.log(` Test framework: ${summary.config.typescript.testFramework}`); console.log(keyValue('Test framework', summary.config.typescript.testFramework, 1));
console.log(` package.json: ${summary.config.typescript.packageJson ? '✓' : '✗'}`); console.log(keyValue('package.json', summary.config.typescript.packageJson ? style.success(icons.success) : style.error(icons.error), 1));
console.log(` tsconfig.json: ${summary.config.typescript.tsconfig ? '✓' : '✗'}`); console.log(keyValue('tsconfig.json', summary.config.typescript.tsconfig ? style.success(icons.success) : style.error(icons.error), 1));
} }
if (summary.git) { if (summary.git) {
console.log('\n📌 Git:'); console.log(subheader(`${icons.gear} Git`));
console.log(` Branch: ${summary.git.branch}`); console.log(keyValue('Branch', summary.git.branch, 1));
console.log(` Commit: ${summary.git.currentCommit.slice(0, 8)}`); console.log(keyValue('Commit', style.muted(summary.git.currentCommit.slice(0, 8)), 1));
if (summary.git.recentCommits && summary.git.recentCommits.length > 0) { if (summary.git.recentCommits && summary.git.recentCommits.length > 0) {
console.log('\n📜 Recent Commits:'); console.log(subheader(`${icons.file} Recent Commits`));
for (const commit of summary.git.recentCommits.slice(0, 5)) { for (const commit of summary.git.recentCommits.slice(0, 5)) {
const date = new Date(commit.date).toLocaleDateString(); const date = new Date(commit.date).toLocaleDateString();
console.log(` ${commit.shortHash} ${date} - ${commit.message.slice(0, 50)}${commit.message.length > 50 ? '...' : ''}`); console.log(` ${style.muted(commit.shortHash)} ${style.dim(date)} ${box.horizontal} ${commit.message.slice(0, 50)}${commit.message.length > 50 ? '...' : ''}`);
} }
} }
if (summary.git.fileHistory && summary.git.fileHistory.length > 0) { if (summary.git.fileHistory && summary.git.fileHistory.length > 0) {
console.log('\n🔥 Most Active Files (by commit count):'); console.log(subheader(`${icons.lightning} Most Active Files`));
for (const file of summary.git.fileHistory.slice(0, 5)) { for (const file of summary.git.fileHistory.slice(0, 5)) {
console.log(` ${file.path} (${file.commitCount} commits)`); console.log(` ${style.path(file.path)} ${style.dim(`(${style.number(String(file.commitCount))} commits)`)}`);
} }
} }
} }
// Show top modules by export count
const topModules = [...summary.modules] const topModules = [...summary.modules]
.sort((a, b) => b.exports.length - a.exports.length) .sort((a, b) => b.exports.length - a.exports.length)
.slice(0, 5); .slice(0, 5);
if (topModules.length > 0) { if (topModules.length > 0) {
console.log('\n🏆 Top modules by exports:'); console.log(subheader(`${icons.sparkle} Top Modules by Exports`));
for (const mod of topModules) { for (const mod of topModules) {
console.log(` ${mod.path}: ${mod.exports.length} exports`); console.log(` ${style.path(mod.path)}: ${style.number(String(mod.exports.length))} exports`);
} }
} }
} }

View file

@ -8,6 +8,19 @@ import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/i
import { createTracer, saveTrace } from '../../observability/index.js'; import { createTracer, saveTrace } from '../../observability/index.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
import {
style,
icons,
header,
step,
keyValue,
resultBox,
nextSteps,
Spinner,
formatError,
BANNER,
box
} from '../theme.js';
const EVALUCLAUDE_DIR = '.evaluclaude'; const EVALUCLAUDE_DIR = '.evaluclaude';
@ -26,29 +39,49 @@ interface PipelineOptions {
} }
export const pipelineCommand = new Command('pipeline') export const pipelineCommand = new Command('pipeline')
.description('Run the full eval generation pipeline: introspect → analyze → render → run') .description('Run the complete eval pipeline: introspect → analyze → render → run')
.argument('[path]', 'Path to the repository to analyze', '.') .argument('[path]', 'Path to the repository to analyze', '.')
.option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude') .option('-o, --output <dir>', 'Output directory for artifacts', '.evaluclaude')
.option('-i, --interactive', 'Enable interactive mode with clarifying questions') .option('-i, --interactive', 'Enable interactive mode with clarifying questions')
.option('--focus <modules>', 'Comma-separated list of modules/functions to focus on') .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
.option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10') .option('--max-scenarios <n>', 'Maximum number of test scenarios', '10')
.option('--test-dir <dir>', 'Directory for generated tests', './tests/generated') .option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)') .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
.option('--skip-analyze', 'Skip analysis, use existing spec') .option('--skip-analyze', 'Skip analysis, use existing spec')
.option('--skip-render', 'Skip rendering, use existing tests') .option('--skip-render', 'Skip rendering, use existing tests')
.option('--skip-run', 'Skip test execution') .option('--skip-run', 'Skip test execution')
.option('--promptfoo', 'Generate Promptfoo configuration for UI viewing') .option('--promptfoo', 'Generate Promptfoo configuration')
.option('--quiet', 'Suppress progress messages') .option('--quiet', 'Suppress progress messages')
.addHelpText('after', `
${style.bold('Examples:')}
${style.dim('# Analyze current directory')}
$ evaluclaude pipeline .
${style.dim('# Interactive mode with focus on specific modules')}
$ evaluclaude pipeline ./my-project -i --focus auth,payments
${style.dim('# Generate tests without running them')}
$ evaluclaude pipeline . --skip-run
${style.dim('# Use existing spec and run tests')}
$ evaluclaude pipeline . --skip-analyze
`)
.action(async (repoPath: string, options: PipelineOptions) => { .action(async (repoPath: string, options: PipelineOptions) => {
const absolutePath = resolve(repoPath); const absolutePath = resolve(repoPath);
const log = options.quiet ? () => {} : console.log; const quiet = options.quiet;
const outputDir = options.output || EVALUCLAUDE_DIR; const outputDir = options.output || EVALUCLAUDE_DIR;
console.log('\n🚀 Evaluclaude Pipeline'); // Print header
console.log('═'.repeat(50)); console.log(BANNER);
console.log(` Repository: ${absolutePath}`); console.log(style.primary(box.dHorizontal.repeat(55)));
console.log(` Output: ${outputDir}`); console.log(` ${icons.folder} ${style.bold('Repository:')} ${style.path(absolutePath)}`);
console.log('═'.repeat(50) + '\n'); console.log(` ${icons.file} ${style.bold('Output:')} ${style.path(outputDir)}`);
if (options.interactive) {
console.log(` ${icons.brain} ${style.bold('Mode:')} ${style.highlight('Interactive')}`);
}
console.log(style.primary(box.dHorizontal.repeat(55)));
console.log('');
// Ensure output directories exist // Ensure output directories exist
mkdirSync(outputDir, { recursive: true }); mkdirSync(outputDir, { recursive: true });
@ -65,23 +98,30 @@ export const pipelineCommand = new Command('pipeline')
// Step 1: Introspection + Analysis // Step 1: Introspection + Analysis
if (options.skipAnalyze && existsSync(specPath)) { if (options.skipAnalyze && existsSync(specPath)) {
log('📋 Using existing EvalSpec...'); console.log(step(1, 'Using existing EvalSpec', 'done'));
spec = JSON.parse(readFileSync(specPath, 'utf-8')); spec = JSON.parse(readFileSync(specPath, 'utf-8'));
log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`); console.log(` ${style.dim('└─')} Loaded ${style.number(String(spec.scenarios.length))} scenarios from ${style.path(specPath)}`);
console.log('');
} else { } else {
log('🔬 Step 1: Introspecting codebase...'); console.log(step(1, 'Introspecting codebase...', 'running'));
let spinner: Spinner | null = null;
if (!quiet) {
spinner = new Spinner('Parsing files with tree-sitter...');
spinner.start();
}
try { try {
const repoSummary = await analyze({ const repoSummary = await analyze({
root: absolutePath, root: absolutePath,
onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`), onProgress: quiet ? undefined : (msg) => spinner?.update(msg),
}); });
log(` Files: ${repoSummary.files.length}`); spinner?.succeed(`Analyzed ${style.number(String(repoSummary.files.length))} files`);
log(` Languages: ${repoSummary.languages.join(', ')}`); console.log(` ${style.dim('└─')} Languages: ${repoSummary.languages.map(l => style.info(l)).join(', ')}`);
log(''); console.log('');
log('🤖 Step 2: Generating EvalSpec with Claude...\n'); console.log(step(2, 'Generating EvalSpec with Claude...', 'running'));
const focus = options.focus?.split(',').map(s => s.trim()); const focus = options.focus?.split(',').map(s => s.trim());
const maxScenarios = parseInt(options.maxScenarios, 10); const maxScenarios = parseInt(options.maxScenarios, 10);
@ -93,21 +133,30 @@ export const pipelineCommand = new Command('pipeline')
result = await generateEvalSpecInteractive( result = await generateEvalSpecInteractive(
repoSummary, repoSummary,
async (question: string) => { async (question: string) => {
console.log('');
const { answer } = await inquirer.prompt([{ const { answer } = await inquirer.prompt([{
type: 'input', type: 'input',
name: 'answer', name: 'answer',
message: `🤖 Claude asks: ${question}`, message: `${icons.brain} ${style.highlight('Claude asks:')} ${question}`,
prefix: '',
}]); }]);
return answer; return answer;
}, },
{ focus, maxScenarios } { focus, maxScenarios }
); );
} else { } else {
if (!quiet) {
spinner = new Spinner('Claude is analyzing the codebase...');
spinner.start();
}
result = await generateEvalSpec(repoSummary, { result = await generateEvalSpec(repoSummary, {
interactive: false, interactive: false,
focus, focus,
maxScenarios, maxScenarios,
}); });
spinner?.succeed('EvalSpec generated');
} }
spec = result.spec; spec = result.spec;
@ -115,19 +164,29 @@ export const pipelineCommand = new Command('pipeline')
// Save the spec // Save the spec
writeFileSync(specPath, JSON.stringify(spec, null, 2)); writeFileSync(specPath, JSON.stringify(spec, null, 2));
log(`\n✅ EvalSpec generated!`); console.log(` ${style.dim('├─')} Scenarios: ${style.number(String(spec.scenarios.length))}`);
log(` Scenarios: ${spec.scenarios.length}`); console.log(` ${style.dim('├─')} Tokens: ${style.number(String(result.tokensUsed))}`);
log(` Tokens: ${result.tokensUsed}`); console.log(` ${style.dim('└─')} Saved: ${style.path(specPath)}`);
log(` Saved: ${specPath}\n`); console.log('');
} catch (error) { } catch (error) {
console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error); spinner?.fail('Analysis failed');
console.error(formatError(
error instanceof Error ? error.message : String(error),
['Check that ANTHROPIC_API_KEY is set', 'Verify the path exists and contains source files']
));
process.exit(1); process.exit(1);
} }
} }
// Step 2: Render tests // Step 2: Render tests
if (!options.skipRender) { if (!options.skipRender) {
log('📝 Step 3: Rendering test files...'); console.log(step(3, 'Rendering test files...', 'running'));
let spinner: Spinner | null = null;
if (!quiet) {
spinner = new Spinner('Generating test code...');
spinner.start();
}
try { try {
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec); const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
@ -140,20 +199,31 @@ export const pipelineCommand = new Command('pipeline')
dryRun: false, dryRun: false,
}); });
log(` Framework: ${framework}`); spinner?.succeed(`Generated ${style.number(String(renderResult.stats.fileCount))} test files`);
log(` Files: ${renderResult.stats.fileCount}`); console.log(` ${style.dim('├─')} Framework: ${style.info(framework)}`);
log(` Scenarios: ${renderResult.stats.scenarioCount}`); console.log(` ${style.dim('├─')} Scenarios: ${style.number(String(renderResult.stats.scenarioCount))}`);
log(` Assertions: ${renderResult.stats.assertionCount}`); console.log(` ${style.dim('├─')} Assertions: ${style.number(String(renderResult.stats.assertionCount))}`);
log(` Output: ${options.testDir}\n`); console.log(` ${style.dim('└─')} Output: ${style.path(options.testDir)}`);
console.log('');
} catch (error) { } catch (error) {
console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error); spinner?.fail('Rendering failed');
console.error(formatError(
error instanceof Error ? error.message : String(error),
['Verify the EvalSpec is valid JSON', 'Check the output directory is writable']
));
process.exit(1); process.exit(1);
} }
} }
// Step 3: Run tests // Step 3: Run tests
if (!options.skipRun) { if (!options.skipRun) {
log('🧪 Step 4: Running tests...\n'); console.log(step(4, 'Running tests...', 'running'));
let spinner: Spinner | null = null;
if (!quiet) {
spinner = new Spinner('Executing test suite...');
spinner.start();
}
try { try {
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec); const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
@ -202,23 +272,41 @@ export const pipelineCommand = new Command('pipeline')
const trace = tracer.finalize(); const trace = tracer.finalize();
const tracePath = await saveTrace(trace); const tracePath = await saveTrace(trace);
log(formatResults(result)); spinner?.stop();
log(`📊 Trace saved: ${tracePath}`);
log(` View with: evaluclaude view ${trace.id}\n`); // Show results box
console.log('');
console.log(resultBox({
passed: result.summary.passed,
failed: result.summary.failed,
skipped: result.summary.skipped,
duration: result.summary.duration,
}));
console.log('');
console.log(` ${icons.trace} Trace: ${style.path(tracePath)}`);
console.log(` ${style.dim('└─')} View: ${style.command(`evaluclaude view ${trace.id}`)}`);
console.log('');
// Save results // Save results
const resultsPath = join(resultsDir, `run-${Date.now()}.json`); const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
writeFileSync(resultsPath, JSON.stringify(result, null, 2)); writeFileSync(resultsPath, JSON.stringify(result, null, 2));
} catch (error) { } catch (error) {
console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error); spinner?.fail('Test execution failed');
console.error(formatError(
error instanceof Error ? error.message : String(error),
['Check the test framework is installed', 'Verify the test directory exists']
));
process.exit(1); process.exit(1);
} }
} }
// Step 4: Generate Promptfoo config // Step 4: Generate Promptfoo config
if (options.promptfoo) { if (options.promptfoo) {
log('📦 Step 5: Generating Promptfoo configuration...'); console.log(step(5, 'Generating Promptfoo configuration...', 'running'));
const spinner = new Spinner('Creating Promptfoo config...');
spinner.start();
try { try {
const configPath = join(outputDir, 'promptfooconfig.yaml'); const configPath = join(outputDir, 'promptfooconfig.yaml');
@ -235,23 +323,27 @@ export const pipelineCommand = new Command('pipeline')
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
log(` Config: ${configPath}`); spinner.succeed('Promptfoo config created');
log(` Provider: ${providerPath}`); console.log(` ${style.dim('├─')} Config: ${style.path(configPath)}`);
log(`\n Launch UI with: evaluclaude ui\n`); console.log(` ${style.dim('└─')} Provider: ${style.path(providerPath)}`);
console.log('');
} catch (error) { } catch (error) {
console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error); spinner.fail('Promptfoo config generation failed');
console.error(formatError(error instanceof Error ? error.message : String(error)));
} }
} }
console.log('═'.repeat(50)); // Final summary
console.log('✅ Pipeline complete!'); console.log(style.success(box.dHorizontal.repeat(55)));
console.log('═'.repeat(50)); console.log(` ${icons.sparkle} ${style.success(style.bold('Pipeline complete!'))}`);
console.log(`\nNext steps:`); console.log(style.success(box.dHorizontal.repeat(55)));
console.log(` View traces: evaluclaude view --last`);
console.log(` List all traces: evaluclaude traces`); console.log(nextSteps([
if (options.promptfoo) { { command: 'evaluclaude view --last', description: 'View the latest trace' },
console.log(` Launch UI: evaluclaude ui`); { command: 'evaluclaude traces', description: 'List all traces' },
console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`); ...(options.promptfoo ? [
} { command: 'evaluclaude ui', description: 'Launch the dashboard UI' },
console.log(''); { command: `evaluclaude eval --spec ${specPath}`, description: 'Run Promptfoo evaluations' },
] : []),
]));
}); });

View file

@ -2,6 +2,7 @@ import { Command } from 'commander';
import { readFileSync, existsSync } from 'fs'; import { readFileSync, existsSync } from 'fs';
import { renderSpec, detectFramework, type Framework } from '../../renderers/index.js'; import { renderSpec, detectFramework, type Framework } from '../../renderers/index.js';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
import { style, icons, Spinner, formatError, nextSteps, keyValue } from '../theme.js';
export const renderCommand = new Command('render') export const renderCommand = new Command('render')
.description('Render EvalSpec JSON into runnable test files') .description('Render EvalSpec JSON into runnable test files')
@ -11,19 +12,41 @@ export const renderCommand = new Command('render')
.option('--fixtures', 'Generate fixture stubs', false) .option('--fixtures', 'Generate fixture stubs', false)
.option('--mocks', 'Generate mock stubs', false) .option('--mocks', 'Generate mock stubs', false)
.option('--dry-run', 'Preview without writing files', false) .option('--dry-run', 'Preview without writing files', false)
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude render spec.json')} ${style.dim('Render with auto-detected framework')}
${style.command('evaluclaude render spec.json -f vitest')} ${style.dim('Use Vitest framework')}
${style.command('evaluclaude render spec.json --dry-run')} ${style.dim('Preview output without writing')}
${style.command('evaluclaude render spec.json --fixtures')} ${style.dim('Include fixture stubs')}
`)
.action(async (specPath: string, options) => { .action(async (specPath: string, options) => {
try { try {
if (!existsSync(specPath)) { if (!existsSync(specPath)) {
console.error(`Error: Spec file not found: ${specPath}`); console.error(formatError(`Spec file not found: ${style.path(specPath)}`, [
'Check that the spec file exists',
'Run `evaluclaude analyze` to generate a spec file first',
'Verify the path is correct',
]));
process.exit(1); process.exit(1);
} }
const specContent = readFileSync(specPath, 'utf-8'); const specContent = readFileSync(specPath, 'utf-8');
const spec: EvalSpec = JSON.parse(specContent); let spec: EvalSpec;
try {
spec = JSON.parse(specContent);
} catch {
console.error(formatError('Invalid JSON in spec file', [
'Ensure the file contains valid JSON',
'Check for syntax errors in the spec file',
]));
process.exit(1);
}
const framework = (options.framework as Framework) || detectFramework(spec); const framework = (options.framework as Framework) || detectFramework(spec);
console.log(`Rendering ${spec.scenarios.length} scenarios with ${framework}...`); const spinner = new Spinner(`Rendering ${style.number(String(spec.scenarios.length))} scenarios with ${style.highlight(framework)}...`);
spinner.start();
const result = await renderSpec(spec, { const result = await renderSpec(spec, {
outputDir: options.output, outputDir: options.output,
@ -33,29 +56,44 @@ export const renderCommand = new Command('render')
dryRun: options.dryRun, dryRun: options.dryRun,
}); });
spinner.succeed(`Rendered ${style.number(String(spec.scenarios.length))} scenarios with ${style.highlight(framework)}`);
if (options.dryRun) { if (options.dryRun) {
console.log('\n--- DRY RUN ---\n'); console.log(`\n${style.warning('DRY RUN')} ${style.dim('─ Preview only, no files written')}\n`);
for (const file of result.files) { for (const file of result.files) {
console.log(`📄 ${file.path}`); console.log(`${icons.file} ${style.path(file.path)}`);
console.log('---'); console.log(style.dim('─'.repeat(50)));
console.log(file.content); console.log(style.muted(file.content));
console.log('---\n'); console.log(style.dim('─'.repeat(50)) + '\n');
} }
} }
console.log(`\n✅ Rendered ${result.stats.scenarioCount} scenarios`); console.log(`\n${style.success(icons.check)} ${style.bold('Render complete')}`);
console.log(` 📁 ${result.stats.fileCount} test files`); console.log(keyValue(` ${icons.spec} Scenarios`, style.number(String(result.stats.scenarioCount)), 0));
console.log(` 🔍 ${result.stats.assertionCount} assertions`); console.log(keyValue(` ${icons.file} Test files`, style.number(String(result.stats.fileCount)), 0));
console.log(keyValue(` ${icons.magnify} Assertions`, style.number(String(result.stats.assertionCount)), 0));
if (result.stats.skippedCount > 0) { if (result.stats.skippedCount > 0) {
console.log(` ⏭️ ${result.stats.skippedCount} scenarios skipped (LLM rubric assertions)`); console.log(keyValue(` ${icons.skipped} Skipped`, `${style.number(String(result.stats.skippedCount))} ${style.dim('(LLM rubric assertions)')}`, 0));
} }
if (!options.dryRun) { if (!options.dryRun) {
console.log(`\n📂 Output: ${options.output}`); console.log(`\n${icons.folder} ${style.label('Output:')} ${style.path(options.output)}`);
console.log(nextSteps([
{ command: `evaluclaude run ${options.output}`, description: 'Run the generated tests' },
{ command: `evaluclaude render ${specPath} --dry-run`, description: 'Preview changes before writing' },
]));
} }
} catch (error) { } catch (error) {
console.error('Error rendering spec:', error instanceof Error ? error.message : error); console.error(formatError(
error instanceof Error ? error.message : String(error),
[
'Check that the spec file is valid',
'Ensure the output directory is writable',
'Try running with --dry-run to debug',
]
));
process.exit(1); process.exit(1);
} }
}); });

View file

@ -1,6 +1,5 @@
import { Command } from 'commander'; import { Command } from 'commander';
import { existsSync, readFileSync } from 'fs'; import { existsSync, readFileSync } from 'fs';
import { join } from 'path';
import { import {
runTests, runTests,
formatResults, formatResults,
@ -12,6 +11,17 @@ import {
import { createTracer, saveTrace } from '../../observability/index.js'; import { createTracer, saveTrace } from '../../observability/index.js';
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js'; import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
import {
style,
icons,
Spinner,
formatError,
nextSteps,
keyValue,
resultBox,
section,
formatDuration
} from '../theme.js';
export const runCommand = new Command('run') export const runCommand = new Command('run')
.description('Run generated tests and collect results') .description('Run generated tests and collect results')
@ -28,24 +38,37 @@ export const runCommand = new Command('run')
.option('--no-trace', 'Disable execution tracing') .option('--no-trace', 'Disable execution tracing')
.option('--export-promptfoo', 'Export results in Promptfoo format', false) .option('--export-promptfoo', 'Export results in Promptfoo format', false)
.option('-w, --watch', 'Watch mode (rerun on changes)', false) .option('-w, --watch', 'Watch mode (rerun on changes)', false)
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude run')} ${style.dim('Run tests from ./tests/generated')}
${style.command('evaluclaude run ./my-tests')} ${style.dim('Run tests from custom directory')}
${style.command('evaluclaude run -f pytest')} ${style.dim('Use pytest framework')}
${style.command('evaluclaude run --spec eval-spec.json')} ${style.dim('Map results to EvalSpec')}
${style.command('evaluclaude run --export-promptfoo')} ${style.dim('Export for Promptfoo UI')}
${style.command('evaluclaude run --no-sandbox')} ${style.dim('Disable sandboxing')}
`)
.action(async (testDir: string, options) => { .action(async (testDir: string, options) => {
try { try {
console.log(`\n🧪 Running tests from ${testDir}...\n`); console.log(`\n${icons.test} ${style.bold('Running tests from')} ${style.path(testDir)}\n`);
if (!existsSync(testDir)) { if (!existsSync(testDir)) {
console.error(`Error: Test directory not found: ${testDir}`); console.log(formatError(`Test directory not found: ${testDir}`, [
`Create the directory: ${style.command(`mkdir -p ${testDir}`)}`,
`Generate tests first: ${style.command('evaluclaude render <spec>')}`,
'Check the path is correct'
]));
process.exit(1); process.exit(1);
} }
const framework: TestFramework = options.framework || detectTestFramework(testDir); const framework: TestFramework = options.framework || detectTestFramework(testDir);
console.log(` Framework: ${framework}`); console.log(keyValue('Framework', style.info(framework), 1));
console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`); console.log(keyValue('Sandbox', options.sandbox ? style.success('enabled') : style.warning('disabled'), 1));
console.log(` Timeout: ${options.timeout}ms`); console.log(keyValue('Timeout', style.number(`${options.timeout}ms`), 1));
let spec: EvalSpec | undefined; let spec: EvalSpec | undefined;
if (options.spec && existsSync(options.spec)) { if (options.spec && existsSync(options.spec)) {
spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec; spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`); console.log(keyValue('Spec', `${style.path(options.spec)} ${style.muted(`(${spec.scenarios.length} scenarios)`)}`, 1));
} }
const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null; const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
@ -66,7 +89,8 @@ export const runCommand = new Command('run')
}); });
} }
console.log('\n Running tests...\n'); const spinner = new Spinner('Running tests...');
spinner.start();
const startTime = Date.now(); const startTime = Date.now();
const result = await runTests( const result = await runTests(
@ -75,6 +99,14 @@ export const runCommand = new Command('run')
options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
); );
const duration = Date.now() - startTime;
if (result.summary.failed > 0) {
spinner.fail(`Tests completed with ${style.error(`${result.summary.failed} failures`)}`);
} else {
spinner.succeed(`Tests completed in ${style.number(formatDuration(duration))}`);
}
if (tracer) { if (tracer) {
tracer.recordExecution({ tracer.recordExecution({
testsPassed: result.summary.passed, testsPassed: result.summary.passed,
@ -94,13 +126,20 @@ export const runCommand = new Command('run')
} }
} }
console.log(formatResults(result)); console.log('\n' + resultBox({
passed: result.summary.passed,
failed: result.summary.failed,
skipped: result.summary.skipped,
duration,
}));
if (spec) { if (spec) {
const mappedResults = mapResultsToScenarios(result, spec); const mappedResults = mapResultsToScenarios(result, spec);
console.log(`\n📊 Scenario Coverage:`); console.log(section('Scenario Coverage'));
console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`); console.log(keyValue('Covered', `${style.success(String(mappedResults.covered))}/${style.number(String(spec.scenarios.length))}`, 1));
console.log(` Unmapped: ${mappedResults.unmapped}`); if (mappedResults.unmapped > 0) {
console.log(keyValue('Unmapped', style.warning(String(mappedResults.unmapped)), 1));
}
} }
if (options.output) { if (options.output) {
@ -108,31 +147,40 @@ export const runCommand = new Command('run')
const { dirname } = await import('path'); const { dirname } = await import('path');
mkdirSync(dirname(options.output), { recursive: true }); mkdirSync(dirname(options.output), { recursive: true });
writeFileSync(options.output, JSON.stringify(result, null, 2)); writeFileSync(options.output, JSON.stringify(result, null, 2));
console.log(`\n📁 Results saved to: ${options.output}`); console.log(`\n${icons.folder} Results saved to: ${style.path(options.output)}`);
} }
// Export to Promptfoo format for UI viewing
if (options.exportPromptfoo) { if (options.exportPromptfoo) {
const exportPath = await exportToPromptfooFormat(result, spec, { const exportPath = await exportToPromptfooFormat(result, spec, {
outputDir: '.evaluclaude/results', outputDir: '.evaluclaude/results',
evalId: `eval-${Date.now()}`, evalId: `eval-${Date.now()}`,
}); });
console.log(`\n📦 Promptfoo results exported: ${exportPath}`); console.log(`\n${icons.spec} Promptfoo results exported: ${style.path(exportPath)}`);
console.log(` View with: evaluclaude ui`);
} }
if (tracer) { if (tracer) {
const trace = tracer.finalize(); const trace = tracer.finalize();
const tracePath = await saveTrace(trace); const tracePath = await saveTrace(trace);
console.log(`\n📊 Trace saved: ${tracePath}`); console.log(`\n${icons.trace} Trace saved: ${style.path(tracePath)}`);
console.log(` View with: evaluclaude view ${trace.id}`);
} }
console.log(nextSteps([
{ command: 'evaluclaude view <trace-id>', description: 'View execution trace' },
{ command: 'evaluclaude ui', description: 'Launch interactive results viewer' },
]));
if (result.summary.failed > 0) { if (result.summary.failed > 0) {
process.exit(1); process.exit(1);
} }
} catch (error) { } catch (error) {
console.error('Error running tests:', error instanceof Error ? error.message : error); console.log(formatError(
error instanceof Error ? error.message : String(error),
[
'Check that the test directory exists and contains valid tests',
'Ensure the test framework is installed',
`Run with ${style.command('--no-sandbox')} if sandbox is causing issues`
]
));
process.exit(1); process.exit(1);
} }
}); });

View file

@ -4,6 +4,7 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname, resolve as resolvePath } from 'path'; import { join, dirname, resolve as resolvePath } from 'path';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
import { style, icons, Spinner, formatError, nextSteps, header, keyValue } from '../theme.js';
const EVALUCLAUDE_DIR = '.evaluclaude'; const EVALUCLAUDE_DIR = '.evaluclaude';
const CONFIG_FILE = 'promptfooconfig.yaml'; const CONFIG_FILE = 'promptfooconfig.yaml';
@ -15,6 +16,16 @@ export const uiCommand = new Command('ui')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file') .option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('--generate', 'Regenerate Promptfoo config from spec') .option('--generate', 'Regenerate Promptfoo config from spec')
.option('--no-open', 'Do not auto-open browser') .option('--no-open', 'Do not auto-open browser')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude ui')} Launch UI with existing results
${style.command('evaluclaude ui -p 8080')} Use custom port
${style.command('evaluclaude ui -s spec.json --generate')} Generate config and launch
${style.bold('Workflow:')}
1. Run ${style.command('evaluclaude run --export-promptfoo')} to generate results
2. Run ${style.command('evaluclaude ui')} to view them in the dashboard
`)
.action(async (options) => { .action(async (options) => {
try { try {
const port = parseInt(options.port, 10); const port = parseInt(options.port, 10);
@ -23,10 +34,15 @@ export const uiCommand = new Command('ui')
// If spec provided with --generate, create/update Promptfoo config // If spec provided with --generate, create/update Promptfoo config
if (options.spec && options.generate) { if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...'); const spinner = new Spinner('Generating Promptfoo configuration...');
spinner.start();
if (!existsSync(options.spec)) { if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`); spinner.fail('Spec file not found');
console.log(formatError(`Spec file not found: ${style.path(options.spec)}`, [
`Check the file path and try again`,
`Generate a spec with: ${style.command('evaluclaude analyze <path>')}`,
]));
process.exit(1); process.exit(1);
} }
@ -42,17 +58,20 @@ export const uiCommand = new Command('ui')
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`); spinner.succeed('Promptfoo configuration generated');
console.log(` Provider: ${providerPath}`); console.log(keyValue('Config', style.path(configPath), 1));
console.log(keyValue('Provider', style.path(providerPath), 1));
} }
// Check for existing config, create default if missing // Check for existing config, create default if missing
if (!existsSync(configPath)) { if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.'); console.log(`\n${style.warning(icons.warning)} No Promptfoo config found.`);
console.log(' Creating default configuration...\n');
const spinner = new Spinner('Creating default configuration...');
spinner.start();
await createDefaultConfig(configPath, providerPath); await createDefaultConfig(configPath, providerPath);
console.log(` Created: ${configPath}`); spinner.succeed('Default configuration created');
console.log(keyValue('Created', style.path(configPath), 1));
} }
// Check for results to display // Check for results to display
@ -60,19 +79,27 @@ export const uiCommand = new Command('ui')
const latestResults = join(resultsDir, 'latest.json'); const latestResults = join(resultsDir, 'latest.json');
if (!existsSync(latestResults)) { if (!existsSync(latestResults)) {
console.log('\n⚠ No evaluation results found.'); console.log(formatError('No evaluation results found.', [
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n'); `Run ${style.command('evaluclaude run --export-promptfoo')} first to generate results`,
console.log(' Or run the full pipeline:'); `Or run the full pipeline: ${style.command('evaluclaude pipeline <path> --promptfoo')}`,
console.log(' evaluclaude pipeline <path> --promptfoo\n'); ]));
} }
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`); console.log(header('Launching Promptfoo UI'));
console.log(` Results: ${latestResults}\n`); console.log(keyValue('Port', style.number(String(port)), 1));
console.log(keyValue('Results', style.path(latestResults), 1));
console.log('');
const spinner = new Spinner(`${icons.rocket} Starting Promptfoo UI...`);
spinner.start();
// Use promptfoo view with the results file // Use promptfoo view with the results file
await launchPromptfooView(port, latestResults, options.open); await launchPromptfooView(port, latestResults, options.open, spinner);
} catch (error) { } catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error); console.log(formatError(
error instanceof Error ? error.message : String(error),
['Check the console output for more details']
));
process.exit(1); process.exit(1);
} }
}); });
@ -85,6 +112,17 @@ export const evalCommand = new Command('eval')
.option('--view', 'Launch UI after evaluation', false) .option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000') .option('-p, --port <port>', 'Port for UI', '3000')
.option('--no-cache', 'Disable Promptfoo caching', false) .option('--no-cache', 'Disable Promptfoo caching', false)
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude eval -s spec.json')} Run evals from spec
${style.command('evaluclaude eval -c config.yaml')} Run with custom config
${style.command('evaluclaude eval -s spec.json --view')} Run and launch UI
${style.bold('Workflow:')}
1. Generate spec: ${style.command('evaluclaude analyze <path> -o spec.json')}
2. Run evals: ${style.command('evaluclaude eval -s spec.json')}
3. View results: ${style.command('evaluclaude ui')}
`)
.action(async (options) => { .action(async (options) => {
try { try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE); const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
@ -92,10 +130,15 @@ export const evalCommand = new Command('eval')
// Generate config from spec if provided // Generate config from spec if provided
if (options.spec) { if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...'); const spinner = new Spinner('Generating Promptfoo configuration from spec...');
spinner.start();
if (!existsSync(options.spec)) { if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`); spinner.fail('Spec file not found');
console.log(formatError(`Spec file not found: ${style.path(options.spec)}`, [
`Check the file path and try again`,
`Generate a spec with: ${style.command('evaluclaude analyze <path>')}`,
]));
process.exit(1); process.exit(1);
} }
@ -111,34 +154,41 @@ export const evalCommand = new Command('eval')
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`); spinner.succeed('Promptfoo configuration generated');
console.log(` Provider: ${providerPath}`); console.log(keyValue('Config', style.path(configPath), 1));
console.log(` Scenarios: ${spec.scenarios.length}`); console.log(keyValue('Provider', style.path(providerPath), 1));
console.log(keyValue('Scenarios', style.number(String(spec.scenarios.length)), 1));
} }
if (!existsSync(configPath)) { if (!existsSync(configPath)) {
console.error(`\nError: Config not found: ${configPath}`); console.log(formatError(`Config not found: ${style.path(configPath)}`, [
console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.'); `Run with ${style.command('--spec <file>')} to generate from EvalSpec`,
`Or create a config manually`,
]));
process.exit(1); process.exit(1);
} }
// Ensure output directory exists // Ensure output directory exists
mkdirSync(options.output, { recursive: true }); mkdirSync(options.output, { recursive: true });
console.log('\n🧪 Running Promptfoo evaluations...'); console.log(header('Running Promptfoo Evaluations'));
console.log(` Config: ${configPath}`); console.log(keyValue('Config', style.path(configPath), 1));
console.log(` Output: ${options.output}\n`); console.log(keyValue('Output', style.path(options.output), 1));
console.log('');
const outputFile = join(options.output, `eval-${Date.now()}.json`); const outputFile = join(options.output, `eval-${Date.now()}.json`);
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache); const spinner = new Spinner(`${icons.test} Running evaluations...`);
spinner.start();
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache, spinner);
if (exitCode === 0) { if (exitCode === 0) {
console.log(`\n✅ Evaluation complete!`); spinner.succeed('Evaluation complete!');
console.log(`📁 Results: ${outputFile}`); console.log(keyValue('Results', style.path(outputFile), 1));
} else { } else {
console.log(`\n⚠ Evaluation finished with exit code ${exitCode}`); spinner.warn(`Evaluation finished with exit code ${exitCode}`);
console.log(`📁 Results: ${outputFile}`); console.log(keyValue('Results', style.path(outputFile), 1));
} }
// List traces generated during evaluation // List traces generated during evaluation
@ -147,19 +197,27 @@ export const evalCommand = new Command('eval')
const { readdirSync } = await import('fs'); const { readdirSync } = await import('fs');
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json')); const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
if (traces.length > 0) { if (traces.length > 0) {
console.log(`\n📊 Traces generated: ${traces.length}`); console.log(`\n${icons.trace} ${style.bold('Traces generated:')} ${style.number(String(traces.length))}`);
console.log(` View with: evaluclaude view --last`); console.log(style.dim(` View with: ${style.command('evaluclaude view --last')}`));
} }
} }
if (options.view) { if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`); console.log('');
await launchPromptfooUI(parseInt(options.port, 10), configPath, true); const uiSpinner = new Spinner(`${icons.rocket} Launching UI on port ${options.port}...`);
uiSpinner.start();
await launchPromptfooUI(parseInt(options.port, 10), configPath, true, uiSpinner);
} else { } else {
console.log(`\n View results: evaluclaude ui`); console.log(nextSteps([
{ command: 'evaluclaude ui', description: 'View results in dashboard' },
{ command: 'evaluclaude view --last', description: 'View latest trace' },
]));
} }
} catch (error) { } catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error); console.log(formatError(
error instanceof Error ? error.message : String(error),
['Check the console output for more details']
));
process.exit(1); process.exit(1);
} }
}); });
@ -170,7 +228,8 @@ export const evalCommand = new Command('eval')
async function launchPromptfooView( async function launchPromptfooView(
port: number, port: number,
resultsFile: string, resultsFile: string,
openBrowser: boolean openBrowser: boolean,
spinner?: Spinner
): Promise<void> { ): Promise<void> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
// Use 'promptfoo view' which opens the web UI showing results from the output directory // Use 'promptfoo view' which opens the web UI showing results from the output directory
@ -186,7 +245,11 @@ async function launchPromptfooView(
// Pass the directory containing results // Pass the directory containing results
args.push(resultsDir); args.push(resultsDir);
console.log(` Running: npx ${args.join(' ')}\n`); if (spinner) {
spinner.succeed(`Promptfoo UI starting on port ${style.number(String(port))}`);
}
console.log(style.dim(` Running: npx ${args.join(' ')}`));
console.log('');
const child = spawn('npx', args, { const child = spawn('npx', args, {
stdio: 'inherit', stdio: 'inherit',
@ -195,9 +258,10 @@ async function launchPromptfooView(
child.on('error', (error) => { child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') { if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.'); console.log(formatError('Promptfoo not found.', [
console.error(' Install with: npm install -g promptfoo'); `Install with: ${style.command('npm install -g promptfoo')}`,
console.error(' Or run: npx promptfoo --version\n'); `Or run: ${style.command('npx promptfoo --version')}`,
]));
} else { } else {
reject(error); reject(error);
} }
@ -225,7 +289,8 @@ async function launchPromptfooView(
async function launchPromptfooUI( async function launchPromptfooUI(
port: number, port: number,
configPath: string, configPath: string,
openBrowser: boolean openBrowser: boolean,
spinner?: Spinner
): Promise<void> { ): Promise<void> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const args = ['promptfoo', 'view', '--port', String(port)]; const args = ['promptfoo', 'view', '--port', String(port)];
@ -240,7 +305,11 @@ async function launchPromptfooUI(
const configDir = dirname(resolvePath(configPath)); const configDir = dirname(resolvePath(configPath));
args.push(configDir); args.push(configDir);
console.log(` Running: npx ${args.join(' ')}\n`); if (spinner) {
spinner.succeed(`Promptfoo UI starting on port ${style.number(String(port))}`);
}
console.log(style.dim(` Running: npx ${args.join(' ')}`));
console.log('');
const child = spawn('npx', args, { const child = spawn('npx', args, {
stdio: 'inherit', stdio: 'inherit',
@ -249,9 +318,10 @@ async function launchPromptfooUI(
child.on('error', (error) => { child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') { if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.'); console.log(formatError('Promptfoo not found.', [
console.error(' Install with: npm install -g promptfoo'); `Install with: ${style.command('npm install -g promptfoo')}`,
console.error(' Or run: npx promptfoo --version\n'); `Or run: ${style.command('npx promptfoo --version')}`,
]));
} else { } else {
reject(error); reject(error);
} }
@ -276,7 +346,8 @@ async function launchPromptfooUI(
async function runPromptfooEval( async function runPromptfooEval(
configPath: string, configPath: string,
outputFile: string, outputFile: string,
noCache: boolean noCache: boolean,
spinner?: Spinner
): Promise<number> { ): Promise<number> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const args = [ const args = [
@ -290,7 +361,11 @@ async function runPromptfooEval(
args.push('--no-cache'); args.push('--no-cache');
} }
console.log(` Running: npx ${args.join(' ')}\n`); if (spinner) {
spinner.stop();
}
console.log(style.dim(` Running: npx ${args.join(' ')}`));
console.log('');
const child = spawn('npx', args, { const child = spawn('npx', args, {
stdio: 'inherit', stdio: 'inherit',
@ -299,8 +374,9 @@ async function runPromptfooEval(
child.on('error', (error) => { child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') { if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.'); console.log(formatError('Promptfoo not found.', [
console.error(' Install with: npm install -g promptfoo\n'); `Install with: ${style.command('npm install -g promptfoo')}`,
]));
reject(error); reject(error);
} else { } else {
reject(error); reject(error);

View file

@ -6,6 +6,7 @@ import {
formatTrace, formatTrace,
formatTraceList formatTraceList
} from '../../observability/index.js'; } from '../../observability/index.js';
import { style, icons, formatError, nextSteps } from '../theme.js';
export const viewCommand = new Command('view') export const viewCommand = new Command('view')
.description('View evaluation traces') .description('View evaluation traces')
@ -19,6 +20,14 @@ export const viewCommand = new Command('view')
.option('--decisions', 'Show decisions made', true) .option('--decisions', 'Show decisions made', true)
.option('-n, --limit <count>', 'Limit number of traces listed', '20') .option('-n, --limit <count>', 'Limit number of traces listed', '20')
.option('--eval <eval-id>', 'Filter traces by eval ID') .option('--eval <eval-id>', 'Filter traces by eval ID')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude view')} ${style.dim('View the most recent trace')}
${style.command('evaluclaude view --list')} ${style.dim('List all available traces')}
${style.command('evaluclaude view abc123')} ${style.dim('View a specific trace by ID')}
${style.command('evaluclaude view --json')} ${style.dim('Output trace as raw JSON')}
${style.command('evaluclaude view -v')} ${style.dim('Verbose output with tool calls')}
`)
.action(async (traceId: string | undefined, options) => { .action(async (traceId: string | undefined, options) => {
try { try {
if (options.list) { if (options.list) {
@ -26,16 +35,19 @@ export const viewCommand = new Command('view')
const limited = traces.slice(0, parseInt(options.limit, 10)); const limited = traces.slice(0, parseInt(options.limit, 10));
if (traces.length === 0) { if (traces.length === 0) {
console.log('\nNo traces found.'); console.log(`\n${style.warning(`${icons.warning} No traces found.`)}`);
console.log('Run `evaluclaude run` to generate traces.\n'); console.log(nextSteps([
{ command: 'evaluclaude run', description: 'Run evals to generate traces' },
{ command: 'evaluclaude pipeline .', description: 'Run full pipeline from scratch' },
]));
return; return;
} }
console.log(formatTraceList(limited)); console.log(formatTraceList(limited));
if (traces.length > limited.length) { if (traces.length > limited.length) {
console.log(`Showing ${limited.length} of ${traces.length} traces.`); console.log(style.muted(`Showing ${limited.length} of ${traces.length} traces.`));
console.log(`Use --limit to see more.\n`); console.log(style.muted(`Use ${style.command('--limit')} to see more.\n`));
} }
return; return;
} }
@ -45,15 +57,20 @@ export const viewCommand = new Command('view')
if (options.last || !traceId) { if (options.last || !traceId) {
trace = await getLatestTrace(); trace = await getLatestTrace();
if (!trace) { if (!trace) {
console.log('\nNo traces found.'); console.log(`\n${style.warning(`${icons.warning} No traces found.`)}`);
console.log('Run `evaluclaude run` to generate traces.\n'); console.log(nextSteps([
{ command: 'evaluclaude run', description: 'Run evals to generate traces' },
{ command: 'evaluclaude pipeline .', description: 'Run full pipeline from scratch' },
]));
return; return;
} }
} else { } else {
trace = await loadTrace(traceId); trace = await loadTrace(traceId);
if (!trace) { if (!trace) {
console.error(`\nTrace not found: ${traceId}`); console.log(formatError(`Trace not found: ${style.path(traceId)}`, [
console.log('Use `evaluclaude view --list` to see available traces.\n'); `Run ${style.command('evaluclaude view --list')} to see available traces`,
`Check that the trace ID is correct`,
]));
process.exit(1); process.exit(1);
} }
} }
@ -68,7 +85,10 @@ export const viewCommand = new Command('view')
console.log(output); console.log(output);
} catch (error) { } catch (error) {
console.error('Error viewing trace:', error instanceof Error ? error.message : error); console.log(formatError(
error instanceof Error ? error.message : String(error),
['Run evaluclaude run first to generate traces']
));
process.exit(1); process.exit(1);
} }
}); });
@ -77,14 +97,36 @@ export const tracesCommand = new Command('traces')
.description('List all evaluation traces (alias for view --list)') .description('List all evaluation traces (alias for view --list)')
.option('-n, --limit <count>', 'Limit number of traces', '20') .option('-n, --limit <count>', 'Limit number of traces', '20')
.option('--eval <eval-id>', 'Filter by eval ID') .option('--eval <eval-id>', 'Filter by eval ID')
.addHelpText('after', `
${style.bold('Examples:')}
${style.command('evaluclaude traces')} ${style.dim('List all traces')}
${style.command('evaluclaude traces -n 50')} ${style.dim('Show up to 50 traces')}
${style.command('evaluclaude traces --eval X')} ${style.dim('Filter by eval ID')}
`)
.action(async (options) => { .action(async (options) => {
const traces = await listTraces(options.eval); try {
const limited = traces.slice(0, parseInt(options.limit, 10)); const traces = await listTraces(options.eval);
const limited = traces.slice(0, parseInt(options.limit, 10));
if (traces.length === 0) {
console.log('\nNo traces found.'); if (traces.length === 0) {
return; console.log(`\n${style.warning(`${icons.warning} No traces found.`)}`);
console.log(nextSteps([
{ command: 'evaluclaude run', description: 'Run evals to generate traces' },
]));
return;
}
console.log(formatTraceList(limited));
if (traces.length > limited.length) {
console.log(style.muted(`Showing ${limited.length} of ${traces.length} traces.`));
console.log(style.muted(`Use ${style.command('--limit')} to see more.\n`));
}
} catch (error) {
console.log(formatError(
error instanceof Error ? error.message : String(error),
['Run evaluclaude run first to generate traces']
));
process.exit(1);
} }
console.log(formatTraceList(limited));
}); });

View file

@ -9,13 +9,43 @@ import { runCommand } from './commands/run.js';
import { viewCommand, tracesCommand } from './commands/view.js'; import { viewCommand, tracesCommand } from './commands/view.js';
import { uiCommand, evalCommand } from './commands/ui.js'; import { uiCommand, evalCommand } from './commands/ui.js';
import { pipelineCommand } from './commands/pipeline.js'; import { pipelineCommand } from './commands/pipeline.js';
import { BANNER_MINIMAL, style, welcomeMessage, icons } from './theme.js';
const program = new Command(); const program = new Command();
program program
.name('evaluclaude') .name('evaluclaude')
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.') .description(`${BANNER_MINIMAL}\n\nClaude-powered functional test generation for any codebase.`)
.version('0.1.0'); .version('0.1.0')
.configureHelp({
sortSubcommands: true,
subcommandTerm: (cmd) => style.command(cmd.name()) + ' ' + style.dim(cmd.usage()),
})
.addHelpText('beforeAll', '')
.addHelpText('afterAll', `
${style.bold('Examples:')}
${style.dim('# Run the full pipeline on current directory')}
$ evaluclaude pipeline .
${style.dim('# Analyze a Python project interactively')}
$ evaluclaude analyze ./my-project -i -o spec.json
${style.dim('# Generate and run tests')}
$ evaluclaude render spec.json && evaluclaude run
${style.dim('# View results in browser')}
$ evaluclaude run --export-promptfoo && evaluclaude ui
${style.muted('For more info, run any command with --help')}
`);
// Add welcome command for first-time users
const welcomeCmd = new Command('welcome')
.description('Show welcome message and quick start guide')
.action(() => {
console.log(welcomeMessage());
});
// Core pipeline command - the "zero to evals" experience // Core pipeline command - the "zero to evals" experience
program.addCommand(pipelineCommand); program.addCommand(pipelineCommand);
@ -39,4 +69,16 @@ program.addCommand(tracesCommand);
program.addCommand(uiCommand); program.addCommand(uiCommand);
program.addCommand(evalCommand); program.addCommand(evalCommand);
// Utility commands
program.addCommand(welcomeCmd);
// Show welcome on no args if first time (check for .evaluclaude directory)
if (process.argv.length === 2) {
const fs = await import('fs');
if (!fs.existsSync('.evaluclaude')) {
console.log(welcomeMessage());
process.exit(0);
}
}
program.parse(process.argv); program.parse(process.argv);

357
src/cli/theme.ts Normal file
View file

@ -0,0 +1,357 @@
/**
* Evaluclaude CLI Theme
* Consistent styling, colors, and formatting for a beautiful CLI experience
*/
// ANSI escape sequences, built from their raw SGR code numbers so each
// entry is visibly tied to the terminal code it emits.
const esc = (code: number): string => `\x1b[${code}m`;

const colors = {
  reset: esc(0),
  bold: esc(1),
  dim: esc(2),
  italic: esc(3),
  underline: esc(4),

  // Foreground colors (SGR 30-37)
  black: esc(30),
  red: esc(31),
  green: esc(32),
  yellow: esc(33),
  blue: esc(34),
  magenta: esc(35),
  cyan: esc(36),
  white: esc(37),

  // Bright foreground colors (SGR 90-97)
  brightBlack: esc(90),
  brightRed: esc(91),
  brightGreen: esc(92),
  brightYellow: esc(93),
  brightBlue: esc(94),
  brightMagenta: esc(95),
  brightCyan: esc(96),
  brightWhite: esc(97),

  // Background colors (SGR 40-47)
  bgBlack: esc(40),
  bgRed: esc(41),
  bgGreen: esc(42),
  bgYellow: esc(43),
  bgBlue: esc(44),
  bgMagenta: esc(45),
  bgCyan: esc(46),
  bgWhite: esc(47),
};
// Semantic color helpers
// paint(code) returns a function wrapping text in the given ANSI code and
// always appending a reset so styles never bleed into following output.
const paint = (code: string) => (text: string): string => `${code}${text}${colors.reset}`;

export const style = {
  // Text styles
  bold: paint(colors.bold),
  dim: paint(colors.dim),
  italic: paint(colors.italic),

  // Semantic colors
  success: paint(colors.green),
  error: paint(colors.red),
  warning: paint(colors.yellow),
  info: paint(colors.cyan),
  highlight: paint(colors.brightMagenta),
  muted: paint(colors.brightBlack),

  // Accent colors
  primary: paint(colors.brightCyan),
  secondary: paint(colors.brightBlue),
  accent: paint(colors.brightMagenta),

  // Special combinations
  command: (text: string) => `${colors.bold}${colors.cyan}${text}${colors.reset}`,
  path: paint(colors.brightBlue),
  number: paint(colors.brightYellow),
  label: paint(colors.dim),
};
// Icons for consistent visual language
// Single-character glyphs keep status lines compact; emoji mark objects
// and pipeline stages.
export const icons = {
  // Status
  success: '✓',
  error: '✗',
  warning: '⚠',
  info: 'ℹ', // was an empty string (mis-encoded glyph) — restore the info symbol
  pending: '○',
  running: '◐',

  // Actions
  arrow: '→',
  arrowRight: '▸',
  bullet: '•',
  check: '✓',
  cross: '✗',

  // Objects
  folder: '📁',
  file: '📄',
  code: '💻',
  test: '🧪',
  spec: '📋',
  trace: '📊',

  // Process
  rocket: '🚀',
  gear: '⚙',
  magnify: '🔍',
  brain: '🧠',
  lightning: '⚡',
  sparkle: '✨',

  // Results
  passed: '✅',
  failed: '❌',
  skipped: '⏭️',

  // Categories
  python: '🐍',
  typescript: '📘',
  javascript: '📙',
};
// Box drawing characters
// Single-line glyphs (rounded corners) plus double-line variants, consumed
// by header(), subheader(), section(), and resultBox() below.
export const box = {
  topLeft: '╭',
  topRight: '╮',
  bottomLeft: '╰',
  bottomRight: '╯',
  horizontal: '─',
  vertical: '│',
  tLeft: '├',
  tRight: '┤',
  cross: '┼',
  // Double lines
  dHorizontal: '═',
  dVertical: '║',
  dTopLeft: '╔',
  dTopRight: '╗',
  dBottomLeft: '╚',
  dBottomRight: '╝',
};
// Banner and branding
// NOTE(review): the box border width is hand-tuned to the tagline length —
// confirm in a real terminal that the right edge still lines up if the text
// or padding ever changes (padEnd/repeat counts here ignore ANSI escapes).
export const BANNER = `
${style.primary(' ╔═══════════════════════════════════════════════════════╗')}
${style.primary(' ║')} ${style.bold(style.accent('evaluclaude'))}${style.muted(' · zero-to-evals in one command')} ${style.primary('║')}
${style.primary(' ╚═══════════════════════════════════════════════════════╝')}
`;

// One-line variant, used in --help output where the full box is too tall.
export const BANNER_MINIMAL = `${style.accent('evaluclaude')} ${style.muted('·')} ${style.dim('zero-to-evals in one command')}`;
// Common output formatters

/**
 * Render a prominent section header: the bold title between two full-width
 * double-line rules. (The old `padding` local was computed but never used;
 * removed.)
 */
export function header(title: string): string {
  const width = 60;
  const rule = style.primary(box.dHorizontal.repeat(width));
  return `\n${rule}
  ${style.bold(title)}
${rule}\n`;
}
/** Render a secondary header: bold title over a 40-char dim rule. */
export function subheader(title: string): string {
  const rule = style.dim(box.horizontal.repeat(40));
  return ['', style.bold(title), rule].join('\n');
}
/** Render an inline section divider: `---- Title ----`, padded to ~40 cols. */
export function section(title: string): string {
  const lead = box.horizontal.repeat(4);
  const tailLength = Math.max(0, 34 - title.length);
  const tail = box.horizontal.repeat(tailLength);
  return `\n${style.dim(lead)} ${style.bold(title)} ${style.dim(tail)}`;
}
/** Render a dimmed `key:` label followed by its value, optionally indented. */
export function keyValue(key: string, value: string | number, indent = 0): string {
  const prefix = ' '.repeat(indent);
  return `${prefix}${style.label(`${key}:`)} ${value}`;
}
/** Render a dim bullet point with optional indentation. */
export function bullet(text: string, indent = 0): string {
  return `${' '.repeat(indent)}${style.dim(icons.bullet)} ${text}`;
}
/**
 * Render one pipeline step line: a status marker (step number when pending,
 * spinner/check/cross otherwise) followed by the text. Completed steps are
 * dimmed.
 */
export function step(num: number, text: string, status: 'pending' | 'running' | 'done' | 'error' = 'pending'): string {
  let marker: string;
  switch (status) {
    case 'running':
      marker = style.info(`${icons.running}`);
      break;
    case 'done':
      marker = style.success(icons.success);
      break;
    case 'error':
      marker = style.error(icons.error);
      break;
    default:
      marker = style.dim(`${num}.`);
      break;
  }
  const body = status === 'done' ? style.muted(text) : text;
  return ` ${marker} ${body}`;
}
/**
 * Render a filled/empty progress bar plus a percentage label.
 * Guards the degenerate cases the old code mishandled: total === 0 produced
 * `NaN%`, and current > total made the empty-segment repeat count negative,
 * which throws a RangeError from String.repeat.
 */
export function progressBar(current: number, total: number, width = 30): string {
  const ratio = total > 0 ? current / total : 0;
  const percentage = Math.round(ratio * 100);
  // Clamp so the bar never under- or over-flows its fixed width.
  const filled = Math.min(width, Math.max(0, Math.round(ratio * width)));
  const bar = style.success('█'.repeat(filled)) + style.dim('░'.repeat(width - filled));
  return `${bar} ${style.muted(`${percentage}%`)}`;
}
/**
 * Render rows as a plain text table: each column padded to the widest cell
 * (widths taken from the first row's column count), columns separated by two
 * spaces. Returns '' for an empty row list.
 */
export function table(rows: string[][]): string {
  if (rows.length === 0) return '';

  const widths = rows[0].map((_, col) => {
    let max = 0;
    for (const row of rows) {
      const len = (row[col] || '').length;
      if (len > max) max = len;
    }
    return max;
  });

  const rendered = rows.map(row =>
    row.map((cell, col) => cell.padEnd(widths[col])).join('  ')
  );
  return rendered.join('\n');
}
// Spinner for async operations
// Renders an animated braille spinner on the current line. Call start(),
// then finish with succeed()/fail()/warn() (which print a final status line)
// or stop() (which just clears the line).
export class Spinner {
  private frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
  private frameIndex = 0;
  private intervalId: NodeJS.Timeout | null = null;
  private text: string;

  constructor(text: string) {
    this.text = text;
  }

  start(): void {
    // Guard against double-start, which would leak an extra interval.
    if (this.intervalId) return;
    process.stdout.write('\x1b[?25l'); // Hide cursor
    this.render();
    this.intervalId = setInterval(() => {
      this.frameIndex = (this.frameIndex + 1) % this.frames.length;
      this.render();
    }, 80);
  }

  private render(): void {
    // Trailing ESC[K erases to end-of-line so a shorter message leaves no
    // residue from a previous, longer one.
    process.stdout.write(`\r${style.info(this.frames[this.frameIndex])} ${this.text}\x1b[K`);
  }

  update(text: string): void {
    this.text = text;
    this.render();
  }

  succeed(text?: string): void {
    this.stop();
    console.log(`\r${style.success(icons.success)} ${text || this.text}`);
  }

  fail(text?: string): void {
    this.stop();
    console.log(`\r${style.error(icons.error)} ${text || this.text}`);
  }

  warn(text?: string): void {
    this.stop();
    console.log(`\r${style.warning(icons.warning)} ${text || this.text}`);
  }

  stop(): void {
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }
    process.stdout.write('\x1b[?25h'); // Show cursor
    // ESC[K clears the whole line regardless of width; the old fixed
    // 80-space overwrite failed on longer messages and narrow terminals.
    process.stdout.write('\r\x1b[K');
  }
}
// Result summary box
// Draws a rounded box summarizing pass/fail/skip counts and total duration.
// NOTE(review): padEnd() counts ANSI escape characters as part of string
// length, so the styled rows pad to 45 while the plain borders use repeat(38)
// (and the skipped row uses 42); the right edge may drift by a column in a
// real terminal — confirm visually before relying on the alignment.
export function resultBox(results: { passed: number; failed: number; skipped?: number; duration?: number }): string {
  const { passed, failed, skipped = 0, duration } = results;
  const total = passed + failed + skipped;
  const lines: string[] = [];
  // Top border and an empty spacer row.
  lines.push(style.primary(` ${box.topLeft}${box.horizontal.repeat(38)}${box.topRight}`));
  lines.push(style.primary(` ${box.vertical}`) + ' '.repeat(38) + style.primary(box.vertical));
  lines.push(style.primary(` ${box.vertical}`) + ` ${style.bold('Test Results')}`.padEnd(45) + style.primary(box.vertical));
  lines.push(style.primary(` ${box.vertical}`) + ' '.repeat(38) + style.primary(box.vertical));
  // Count rows; numbers right-aligned in a 4-char field.
  lines.push(style.primary(` ${box.vertical}`) + ` ${style.success(icons.passed)} Passed: ${String(passed).padStart(4)}`.padEnd(45) + style.primary(box.vertical));
  lines.push(style.primary(` ${box.vertical}`) + ` ${style.error(icons.failed)} Failed: ${String(failed).padStart(4)}`.padEnd(45) + style.primary(box.vertical));
  // Skipped row only when there is something to report.
  if (skipped > 0) {
    lines.push(style.primary(` ${box.vertical}`) + ` ${icons.skipped} Skipped: ${String(skipped).padStart(4)}`.padEnd(42) + style.primary(box.vertical));
  }
  // Separator, total, and optional duration.
  lines.push(style.primary(` ${box.vertical}`) + style.dim(` ${'─'.repeat(20)}`).padEnd(45) + style.primary(box.vertical));
  lines.push(style.primary(` ${box.vertical}`) + ` Total: ${String(total).padStart(4)}`.padEnd(45) + style.primary(box.vertical));
  if (duration !== undefined) {
    lines.push(style.primary(` ${box.vertical}`) + ` Duration: ${formatDuration(duration)}`.padEnd(45) + style.primary(box.vertical));
  }
  lines.push(style.primary(` ${box.vertical}`) + ' '.repeat(38) + style.primary(box.vertical));
  lines.push(style.primary(` ${box.bottomLeft}${box.horizontal.repeat(38)}${box.bottomRight}`));
  return lines.join('\n');
}
/**
 * Human-readable duration: "<1000ms" as ms, under a minute as one-decimal
 * seconds, otherwise "Xm Ys".
 */
export function formatDuration(ms: number): string {
  if (ms >= 60000) {
    const minutes = Math.floor(ms / 60000);
    const seconds = Math.floor((ms % 60000) / 1000);
    return `${minutes}m ${seconds}s`;
  }
  return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`;
}
// Error formatting with suggestions
/**
 * Render an error line, optionally followed by an indented list of
 * suggestions, with a trailing blank line.
 */
export function formatError(message: string, suggestions?: string[]): string {
  const out: string[] = [`\n${style.error(`${icons.error} Error:`)} ${message}`];
  if (suggestions && suggestions.length > 0) {
    out.push('');
    out.push(style.dim(' Suggestions:'));
    for (const tip of suggestions) {
      out.push(` ${style.dim(icons.arrowRight)} ${tip}`);
    }
  }
  out.push('');
  return out.join('\n');
}
// Command examples helper
/** Render an indented command, optionally followed by a dim description. */
export function commandExample(command: string, description?: string): string {
  const base = ` ${style.command(command)}`;
  return description ? `${base} ${style.dim(description)}` : base;
}
// Next steps helper
/**
 * Render a bold "Next steps:" heading followed by one commandExample() line
 * per entry and a trailing blank line.
 */
export function nextSteps(steps: { command: string; description: string }[]): string {
  // Loop variable renamed so it doesn't shadow the exported step() helper.
  const body = steps.map(entry => commandExample(entry.command, entry.description));
  return [`\n${style.bold('Next steps:')}`, ...body, ''].join('\n');
}
// Welcome message for first-time users
/**
 * Build the first-run welcome screen: banner, quick-start commands, and
 * pointers to further help. Pure string builder — performs no I/O.
 */
export function welcomeMessage(): string {
  return `
${BANNER}
${style.bold('Welcome to evaluclaude!')} ${icons.sparkle}
Generate functional tests for any codebase with the power of Claude.
${style.bold('Quick Start:')}
  ${style.command('evaluclaude pipeline .')} ${style.dim('Full pipeline: analyze → render → run')}
  ${style.command('evaluclaude intro .')} ${style.dim('Introspect codebase structure')}
  ${style.command('evaluclaude analyze .')} ${style.dim('Generate EvalSpec with Claude')}
${style.bold('Learn More:')}
  ${style.command('evaluclaude --help')} ${style.dim('Show all commands')}
  ${style.command('evaluclaude <cmd> --help')} ${style.dim('Help for specific command')}
${style.muted('Documentation: https://github.com/harivansh-afk/evaluclaude-harness')}
`;
}

View file

@ -1,5 +1,47 @@
import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js'; import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js';
// ANSI color codes for terminal styling
// NOTE(review): these helpers duplicate src/cli/theme.ts — presumably kept
// local so the observability layer has no dependency on the CLI package;
// confirm, or extract a shared module so the two palettes don't drift.
const colors = {
  reset: '\x1b[0m',
  bold: '\x1b[1m',
  dim: '\x1b[2m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  cyan: '\x1b[36m',
  magenta: '\x1b[35m',
  blue: '\x1b[34m',
  brightBlack: '\x1b[90m',
  brightCyan: '\x1b[96m',
  brightMagenta: '\x1b[95m',
  brightYellow: '\x1b[93m',
};

// Shorthand style wrappers (a subset of the CLI theme's `style` object);
// each wraps text in a color code and appends a reset.
const s = {
  bold: (t: string) => `${colors.bold}${t}${colors.reset}`,
  dim: (t: string) => `${colors.dim}${t}${colors.reset}`,
  success: (t: string) => `${colors.green}${t}${colors.reset}`,
  error: (t: string) => `${colors.red}${t}${colors.reset}`,
  warning: (t: string) => `${colors.yellow}${t}${colors.reset}`,
  info: (t: string) => `${colors.cyan}${t}${colors.reset}`,
  highlight: (t: string) => `${colors.brightMagenta}${t}${colors.reset}`,
  muted: (t: string) => `${colors.brightBlack}${t}${colors.reset}`,
  number: (t: string) => `${colors.brightYellow}${t}${colors.reset}`,
  primary: (t: string) => `${colors.brightCyan}${t}${colors.reset}`,
};

// Box-drawing glyphs used for formatTrace()'s section borders.
const box = {
  horizontal: '─',
  dHorizontal: '═',
  topLeft: '╭',
  topRight: '╮',
  bottomLeft: '╰',
  bottomRight: '╯',
  vertical: '│',
  tLeft: '├',
  tRight: '┤',
};
export interface ViewOptions { export interface ViewOptions {
json: boolean; json: boolean;
verbose: boolean; verbose: boolean;
@ -24,118 +66,129 @@ export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}
} }
const lines: string[] = []; const lines: string[] = [];
const w = 60;
// Header
lines.push(''); lines.push('');
lines.push('═'.repeat(60)); lines.push(s.primary(box.dHorizontal.repeat(w)));
lines.push(`📊 Trace: ${trace.id}`); lines.push(` 📊 ${s.bold('Trace')} ${s.muted(trace.id)}`);
lines.push('═'.repeat(60)); lines.push(s.primary(box.dHorizontal.repeat(w)));
lines.push(''); lines.push('');
lines.push(` Status: ${formatStatus(trace.status)}`); // Overview
lines.push(` Started: ${formatDate(trace.startedAt)}`); lines.push(` ${s.dim('Status:')} ${formatStatus(trace.status)}`);
lines.push(` Duration: ${formatDuration(trace.duration)}`); lines.push(` ${s.dim('Started:')} ${s.muted(formatDate(trace.startedAt))}`);
lines.push(` Eval ID: ${trace.evalId}`); lines.push(` ${s.dim('Duration:')} ${s.number(formatDuration(trace.duration))}`);
lines.push(` ${s.dim('Eval ID:')} ${s.muted(trace.evalId)}`);
lines.push(''); lines.push('');
lines.push('📂 Introspection'); // Introspection section
lines.push('─'.repeat(40)); lines.push(sectionHeader('📂 Introspection'));
lines.push(` Files analyzed: ${trace.introspection.filesAnalyzed.length}`); lines.push(kv('Files analyzed', s.number(String(trace.introspection.filesAnalyzed.length))));
lines.push(` Functions found: ${trace.introspection.totalFunctions}`); lines.push(kv('Functions found', s.number(String(trace.introspection.totalFunctions))));
lines.push(` Classes found: ${trace.introspection.totalClasses}`); lines.push(kv('Classes found', s.number(String(trace.introspection.totalClasses))));
lines.push(` Duration: ${formatDuration(trace.introspection.duration)}`); lines.push(kv('Duration', s.number(formatDuration(trace.introspection.duration))));
lines.push(''); lines.push('');
lines.push('🤖 Analysis'); // Analysis section
lines.push('─'.repeat(40)); lines.push(sectionHeader('🧠 Analysis'));
lines.push(` Tool calls: ${trace.analysis.toolCalls.length}`); lines.push(kv('Tool calls', s.number(String(trace.analysis.toolCalls.length))));
lines.push(` Questions asked: ${trace.analysis.questionsAsked.length}`); lines.push(kv('Questions asked', s.number(String(trace.analysis.questionsAsked.length))));
lines.push(` Decisions made: ${trace.analysis.decisions.length}`); lines.push(kv('Decisions made', s.number(String(trace.analysis.decisions.length))));
lines.push(` Prompt tokens: ${trace.analysis.promptTokens.toLocaleString()}`); lines.push(kv('Prompt tokens', s.number(trace.analysis.promptTokens.toLocaleString())));
lines.push(` Completion tokens: ${trace.analysis.completionTokens.toLocaleString()}`); lines.push(kv('Completion tokens', s.number(trace.analysis.completionTokens.toLocaleString())));
lines.push(''); lines.push('');
lines.push('📝 Generation'); // Generation section
lines.push('─'.repeat(40)); lines.push(sectionHeader('📝 Generation'));
lines.push(` Scenarios: ${trace.generation.scenariosGenerated}`); lines.push(kv('Scenarios', s.number(String(trace.generation.scenariosGenerated))));
lines.push(` Files written: ${trace.generation.filesWritten.length}`); lines.push(kv('Files written', s.number(String(trace.generation.filesWritten.length))));
lines.push(''); lines.push('');
lines.push('🧪 Execution'); // Execution section
lines.push('─'.repeat(40)); lines.push(sectionHeader('🧪 Execution'));
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`); lines.push(` ${s.success('✓')} Passed: ${s.success(String(trace.execution.testsPassed))}`);
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`); lines.push(` ${s.error('✗')} Failed: ${s.error(String(trace.execution.testsFailed))}`);
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped ?? 0}`); lines.push(` ${s.muted('○')} Skipped: ${s.muted(String(trace.execution.testsSkipped ?? 0))}`);
lines.push(''); lines.push('');
// Questions section
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) { if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
lines.push('❓ Questions Asked'); lines.push(sectionHeader('❓ Questions Asked'));
lines.push('─'.repeat(40));
for (const q of trace.analysis.questionsAsked) { for (const q of trace.analysis.questionsAsked) {
lines.push(formatQuestion(q)); lines.push(formatQuestion(q));
} }
lines.push(''); lines.push('');
} }
// Decisions section
if (opts.showDecisions && trace.analysis.decisions.length > 0) { if (opts.showDecisions && trace.analysis.decisions.length > 0) {
lines.push('🎯 Key Decisions'); lines.push(sectionHeader('🎯 Key Decisions'));
lines.push('─'.repeat(40));
for (const d of trace.analysis.decisions.slice(0, 10)) { for (const d of trace.analysis.decisions.slice(0, 10)) {
lines.push(formatDecision(d)); lines.push(formatDecision(d));
} }
if (trace.analysis.decisions.length > 10) { if (trace.analysis.decisions.length > 10) {
lines.push(` ... and ${trace.analysis.decisions.length - 10} more`); lines.push(` ${s.dim(`... and ${trace.analysis.decisions.length - 10} more`)}`);
} }
lines.push(''); lines.push('');
} }
// Tool calls section
if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) { if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) {
lines.push('🔧 Tool Calls'); lines.push(sectionHeader('🔧 Tool Calls'));
lines.push('─'.repeat(40));
for (const tc of trace.analysis.toolCalls.slice(0, 20)) { for (const tc of trace.analysis.toolCalls.slice(0, 20)) {
lines.push(formatToolCall(tc, opts.verbose)); lines.push(formatToolCall(tc, opts.verbose));
} }
if (trace.analysis.toolCalls.length > 20) { if (trace.analysis.toolCalls.length > 20) {
lines.push(` ... and ${trace.analysis.toolCalls.length - 20} more`); lines.push(` ${s.dim(`... and ${trace.analysis.toolCalls.length - 20} more`)}`);
} }
lines.push(''); lines.push('');
} }
// Test failures section
if (trace.execution.failures.length > 0) { if (trace.execution.failures.length > 0) {
lines.push('❌ Test Failures'); lines.push(sectionHeader('❌ Test Failures'));
lines.push('─'.repeat(40));
for (const f of trace.execution.failures) { for (const f of trace.execution.failures) {
lines.push(formatFailure(f)); lines.push(formatFailure(f));
} }
lines.push(''); lines.push('');
} }
// Errors section
if (trace.errors.length > 0) { if (trace.errors.length > 0) {
lines.push('⚠️ Errors'); lines.push(sectionHeader('⚠️ Errors'));
lines.push('─'.repeat(40));
for (const e of trace.errors) { for (const e of trace.errors) {
lines.push(` [${formatDate(e.timestamp)}]`); lines.push(` ${s.dim('[')}${s.muted(formatDate(e.timestamp))}${s.dim(']')}`);
lines.push(` ${e.message}`); lines.push(` ${s.error(e.message)}`);
if (e.context) { if (e.context) {
lines.push(` Context: ${e.context}`); lines.push(` ${s.dim('Context:')} ${e.context}`);
} }
lines.push(''); lines.push('');
} }
} }
lines.push('═'.repeat(60)); lines.push(s.primary(box.dHorizontal.repeat(w)));
lines.push(''); lines.push('');
return lines.join('\n'); return lines.join('\n');
} }
/**
 * Render a dim horizontal rule with a bold section title embedded near the
 * left edge, padded with trailing rule characters toward a fixed width.
 *
 * NOTE(review): `title.length` counts UTF-16 code units, so emoji-prefixed
 * titles consume extra padding budget and the rule ends slightly shorter.
 */
function sectionHeader(title: string): string {
  const lead = box.horizontal.repeat(3);
  const tailWidth = Math.max(0, 35 - title.length);
  const tail = box.horizontal.repeat(tailWidth);
  return `${s.dim(lead)} ${s.bold(title)} ${s.dim(tail)}`;
}
/** Render one indented key/value line with a dimmed `key:` label. */
function kv(key: string, value: string): string {
  return `  ${s.dim(key + ':')}  ${value}`;
}
function formatStatus(status: EvalTrace['status']): string { function formatStatus(status: EvalTrace['status']): string {
switch (status) { switch (status) {
case 'success': case 'success':
return '✅ Success'; return s.success('✓ Success');
case 'partial': case 'partial':
return '⚠️ Partial'; return s.warning('⚠ Partial');
case 'failed': case 'failed':
return '❌ Failed'; return s.error('✗ Failed');
default: default:
return status; return status;
} }
@ -146,12 +199,8 @@ function formatDate(iso: string): string {
} }
function formatDuration(ms: number): string { function formatDuration(ms: number): string {
if (ms < 1000) { if (ms < 1000) return `${ms}ms`;
return `${ms}ms`; if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
}
if (ms < 60000) {
return `${(ms / 1000).toFixed(1)}s`;
}
const minutes = Math.floor(ms / 60000); const minutes = Math.floor(ms / 60000);
const seconds = ((ms % 60000) / 1000).toFixed(0); const seconds = ((ms % 60000) / 1000).toFixed(0);
return `${minutes}m ${seconds}s`; return `${minutes}m ${seconds}s`;
@ -159,37 +208,39 @@ function formatDuration(ms: number): string {
function formatQuestion(q: Question): string { function formatQuestion(q: Question): string {
const lines: string[] = []; const lines: string[] = [];
lines.push(` Q: ${q.question}`); lines.push(` ${s.highlight('Q:')} ${q.question}`);
if (q.answer) { if (q.answer) {
lines.push(` A: ${q.answer}`); lines.push(` ${s.info('A:')} ${q.answer}`);
} else { } else {
lines.push(` A: (no answer)`); lines.push(` ${s.dim('A: (no answer)')}`);
} }
lines.push(''); lines.push('');
return lines.join('\n'); return lines.join('\n');
} }
/**
 * Render one analysis decision: a type icon (include ✓ / exclude ✗ /
 * other →), the subject, its reasoning, and the confidence as a percent.
 *
 * NOTE(review): reconstructed from a side-by-side diff dump; indentation
 * inside the template literal is approximate — verify against the file.
 */
function formatDecision(d: Decision): string {
  const icon =
    d.type === 'include' ? s.success('✓')
    : d.type === 'exclude' ? s.error('✗')
    : s.info('→');
  const conf = (d.confidence * 100).toFixed(0);
  return `  ${icon} ${s.dim(`[${d.type}]`)} ${d.subject}\n     ${s.dim('Reason:')} ${d.reasoning}\n     ${s.dim('Confidence:')} ${s.number(conf + '%')}\n`;
}
/**
 * Render one tool call as `tool (duration)`. In verbose mode a second line
 * shows the serialized input, truncated to 100 characters.
 *
 * Fix: the ellipsis was appended unconditionally, so short inputs rendered
 * with a misleading trailing "..."; it is now added only when the input
 * was actually truncated.
 */
function formatToolCall(tc: ToolCall, verbose: boolean): string {
  const duration = formatDuration(tc.duration);
  if (verbose) {
    const raw = JSON.stringify(tc.input);
    const input = raw.length > 100 ? `${raw.slice(0, 100)}...` : raw;
    return `  ${s.info(tc.tool)} ${s.dim(`(${duration})`)}\n     ${s.dim('Input:')} ${input}\n`;
  }
  return `  ${s.info(tc.tool)} ${s.dim(`(${duration})`)}`;
}
function formatFailure(f: TestFailure): string { function formatFailure(f: TestFailure): string {
const lines: string[] = []; const lines: string[] = [];
lines.push(` ${f.testName}`); lines.push(` ${s.error('•')} ${s.bold(f.testName)}`);
lines.push(` Scenario: ${f.scenarioId}`); lines.push(` ${s.dim('Scenario:')} ${f.scenarioId}`);
lines.push(` Error: ${f.error}`); lines.push(` ${s.dim('Error:')} ${s.error(f.error)}`);
if (f.expected !== undefined && f.actual !== undefined) { if (f.expected !== undefined && f.actual !== undefined) {
lines.push(` Expected: ${JSON.stringify(f.expected)}`); lines.push(` ${s.dim('Expected:')} ${s.success(JSON.stringify(f.expected))}`);
lines.push(` Actual: ${JSON.stringify(f.actual)}`); lines.push(` ${s.dim('Actual:')} ${s.error(JSON.stringify(f.actual))}`);
} }
lines.push(''); lines.push('');
return lines.join('\n'); return lines.join('\n');
@ -206,21 +257,43 @@ export function formatTraceList(traces: Array<{
const lines: string[] = []; const lines: string[] = [];
lines.push(''); lines.push('');
lines.push('📋 Recent Traces'); lines.push(` ${s.bold('📋 Recent Traces')}`);
lines.push('═'.repeat(80)); lines.push(s.primary(` ${box.dHorizontal.repeat(76)}`));
lines.push(''); lines.push('');
lines.push('ID Status Passed Failed Duration');
lines.push('─'.repeat(80)); // Header row
const hId = s.dim('ID'.padEnd(38));
const hStatus = s.dim('Status'.padEnd(10));
const hPassed = s.dim('Passed'.padStart(8));
const hFailed = s.dim('Failed'.padStart(8));
const hDuration = s.dim('Duration'.padStart(10));
lines.push(` ${hId}${hStatus}${hPassed}${hFailed}${hDuration}`);
lines.push(s.dim(` ${box.horizontal.repeat(76)}`));
for (const t of traces) { for (const t of traces) {
const statusIcon = t.status === 'success' ? '✅' : t.status === 'partial' ? '⚠️ ' : '❌'; const id = s.muted(t.id.slice(0, 36).padEnd(38));
const id = t.id.slice(0, 36);
const passed = String(t.testsPassed).padStart(6); let statusIcon: string;
const failed = String(t.testsFailed).padStart(6); if (t.status === 'success') {
const duration = formatDuration(t.duration).padStart(8); statusIcon = s.success('✓ Pass'.padEnd(10));
lines.push(`${id} ${statusIcon} ${passed} ${failed} ${duration}`); } else if (t.status === 'partial') {
statusIcon = s.warning('⚠ Partial'.padEnd(10));
} else {
statusIcon = s.error('✗ Fail'.padEnd(10));
}
const passed = s.success(String(t.testsPassed).padStart(8));
const failed = t.testsFailed > 0
? s.error(String(t.testsFailed).padStart(8))
: s.dim(String(t.testsFailed).padStart(8));
const duration = s.number(formatDuration(t.duration).padStart(10));
lines.push(` ${id}${statusIcon}${passed}${failed}${duration}`);
} }
lines.push(''); lines.push('');
lines.push(` ${s.dim('View a trace:')} ${s.info('evaluclaude view <trace-id>')}`);
lines.push('');
return lines.join('\n'); return lines.join('\n');
} }