improvements and promptfoo

This commit is contained in:
Harivansh Rathi 2026-01-11 20:02:30 -05:00
parent 6698c12e5b
commit ff5300f4e0
13 changed files with 1082 additions and 117 deletions

32
rubrics/code-quality.yaml Normal file
View file

@ -0,0 +1,32 @@
# Rubric: code-quality
# Scores generated code on four weighted criteria (weights sum to 1.0).
# A weighted score of at least `passingThreshold` counts as a pass.
# The good/bad examples anchor the grader's interpretation of each criterion.
name: code-quality
description: Evaluates generated code for quality and maintainability
passingThreshold: 0.7
criteria:
  # 30% — surface readability of the code.
  - name: readability
    weight: 0.3
    description: Code is easy to read and understand
    examples:
      good: "Clear variable names, logical flow, proper indentation"
      bad: "Single-letter variables, deeply nested logic, inconsistent style"
  # 40% — functional correctness carries the largest weight.
  - name: correctness
    weight: 0.4
    description: Code correctly implements the intended behavior
    examples:
      good: "Handles edge cases, correct algorithm, proper error handling"
      bad: "Missing edge cases, off-by-one errors, swallowed exceptions"
  # 20% — algorithmic and data-structure efficiency.
  - name: efficiency
    weight: 0.2
    description: Code uses appropriate data structures and algorithms
    examples:
      good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
      bad: "O(n²) when O(n) is possible, creates objects in tight loops"
  # 10% — ease of future modification.
  - name: maintainability
    weight: 0.1
    description: Code is easy to modify and extend
    examples:
      good: "Single responsibility, low coupling, clear interfaces"
      bad: "God functions, tight coupling, magic numbers"

View file

@ -0,0 +1,32 @@
# Rubric: documentation
# Scores the quality of docstrings/comments on generated code.
# Four weighted criteria (weights sum to 1.0); an overall weighted score of
# at least `passingThreshold` counts as a pass.
name: documentation
description: Evaluates quality of code documentation and docstrings
passingThreshold: 0.65
criteria:
  # 35% — all parts of the contract (args, returns, raises) are covered.
  - name: completeness
    weight: 0.35
    description: Documentation covers all parameters, return values, and exceptions
    examples:
      good: "Fully documents args, returns, raises, and includes usage example"
      bad: "Missing parameter descriptions or return type"
  # 35% — docs must match what the code actually does.
  - name: accuracy
    weight: 0.35
    description: Documentation accurately describes the function's behavior
    examples:
      good: "Description matches implementation, types are correct"
      bad: "Outdated docs that don't match current behavior"
  # 20% — worked usage examples.
  - name: examples
    weight: 0.2
    description: Includes helpful usage examples
    examples:
      good: "Shows common use cases with expected outputs"
      bad: "No examples or only trivial ones"
  # 10% — conformance to a recognized docstring style.
  - name: style
    weight: 0.1
    description: Follows project/language documentation conventions
    examples:
      good: "Uses standard docstring format (Google, NumPy, or reStructuredText)"
      bad: "Inconsistent or non-standard format"

View file

@ -0,0 +1,25 @@
# Rubric: error-messages
# Scores the quality of error messages produced by generated code.
# Three weighted criteria (weights sum to 1.0); an overall weighted score of
# at least `passingThreshold` counts as a pass.
name: error-messages
description: Evaluates quality of error messages
passingThreshold: 0.6
criteria:
  # 40% — the message explains the failure itself.
  - name: clarity
    weight: 0.4
    description: Error message clearly explains what went wrong
    examples:
      good: "Invalid email format: 'not-an-email' is missing '@' symbol"
      bad: "Error: validation failed"
  # 40% — the message tells the user what to do about it.
  - name: actionability
    weight: 0.4
    description: Error message suggests how to fix the problem
    examples:
      good: "File not found. Create the file or check the path spelling."
      bad: "ENOENT"
  # 20% — the message carries enough context to locate the failure.
  - name: context
    weight: 0.2
    description: Error message includes relevant context (file, line, values)
    examples:
      good: "TypeError at line 42 in auth.py: expected str, got int (value=123)"
      bad: "type error"

View file

@ -0,0 +1,257 @@
import { Command } from 'commander';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, resolve } from 'path';
import { analyze } from '../../introspector/index.js';
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
import type { EvalSpec } from '../../analyzer/types.js';
/** Default directory where all pipeline artifacts (spec, traces, results) are written. */
const EVALUCLAUDE_DIR = '.evaluclaude';

/**
 * Parsed CLI flags for the `pipeline` command, as populated by commander.
 * Optional fields are absent when the corresponding flag is not passed;
 * `maxScenarios` and `testDir` have commander-supplied defaults so they are
 * always present (commander delivers flag values as raw strings).
 */
interface PipelineOptions {
  /** Output directory for all artifacts; falls back to EVALUCLAUDE_DIR when unset. */
  output?: string;
  /** Enable interactive mode: Claude may ask clarifying questions via inquirer. */
  interactive?: boolean;
  /** Comma-separated list of modules/functions to focus the analysis on. */
  focus?: string;
  /** Maximum number of scenarios to generate (string — parsed with parseInt later). */
  maxScenarios: string;
  /** Directory where rendered test files are written. */
  testDir: string;
  /** Test framework override (pytest, vitest, jest); auto-detected when omitted. */
  framework?: string;
  /** Reuse an existing spec.json instead of re-running introspection + analysis. */
  skipAnalyze?: boolean;
  /** Skip rendering test files from the spec. */
  skipRender?: boolean;
  /** Skip executing the rendered tests. */
  skipRun?: boolean;
  /** Also generate a Promptfoo configuration for UI viewing. */
  promptfoo?: boolean;
  /** Suppress progress messages (the banner and final summary still print). */
  quiet?: boolean;
}
/** Test frameworks supported by the renderer and runner. */
type TestFramework = 'pytest' | 'vitest' | 'jest';

/** All values accepted for the --framework flag. */
const SUPPORTED_FRAMEWORKS: readonly TestFramework[] = ['pytest', 'vitest', 'jest'];

/**
 * Resolves the test framework for this run, exactly once.
 *
 * Validates the user-supplied --framework value instead of blindly casting an
 * arbitrary string (the previous `as` cast let typos like "mocha" flow into
 * the renderer/runner), and falls back to auto-detection from the spec when
 * the flag is omitted. Exits the process on an unsupported value.
 */
function resolveFramework(requested: string | undefined, spec: EvalSpec): TestFramework {
  if (!requested) {
    return detectRenderFramework(spec);
  }
  if ((SUPPORTED_FRAMEWORKS as readonly string[]).includes(requested)) {
    return requested as TestFramework;
  }
  console.error(
    `\n❌ Unsupported framework: ${requested} (expected one of: ${SUPPORTED_FRAMEWORKS.join(', ')})`
  );
  return process.exit(1);
}

/**
 * `evaluclaude pipeline` — the "zero to evals" command.
 *
 * Runs introspection → spec generation → test rendering → test execution,
 * with optional Promptfoo config generation. Each step can be skipped via
 * flags; artifacts are written under the output directory (spec.json,
 * traces/, results/).
 */
export const pipelineCommand = new Command('pipeline')
  .description('Run the full eval generation pipeline: introspect → analyze → render → run')
  .argument('[path]', 'Path to the repository to analyze', '.')
  .option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
  .option('-i, --interactive', 'Enable interactive mode with clarifying questions')
  .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
  .option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
  .option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('--skip-analyze', 'Skip analysis, use existing spec')
  .option('--skip-render', 'Skip rendering, use existing tests')
  .option('--skip-run', 'Skip test execution')
  .option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
  .option('--quiet', 'Suppress progress messages')
  .action(async (repoPath: string, options: PipelineOptions) => {
    const absolutePath = resolve(repoPath);
    // --quiet suppresses progress messages; the banner/summary use console.log.
    const log = options.quiet ? () => {} : console.log;
    const outputDir = options.output || EVALUCLAUDE_DIR;

    // Fail fast on a malformed --max-scenarios value instead of passing NaN
    // (or a non-positive count) into the analyzer.
    const maxScenarios = Number.parseInt(options.maxScenarios, 10);
    if (!Number.isInteger(maxScenarios) || maxScenarios <= 0) {
      console.error(`\n❌ Invalid --max-scenarios value: ${options.maxScenarios}`);
      process.exit(1);
    }

    console.log('\n🚀 Evaluclaude Pipeline');
    console.log('═'.repeat(50));
    console.log(` Repository: ${absolutePath}`);
    console.log(` Output: ${outputDir}`);
    console.log('═'.repeat(50) + '\n');

    // Ensure output directories exist before any step writes to them.
    mkdirSync(outputDir, { recursive: true });
    mkdirSync(options.testDir, { recursive: true });
    const specPath = join(outputDir, 'spec.json');
    const tracesDir = join(outputDir, 'traces');
    const resultsDir = join(outputDir, 'results');
    mkdirSync(tracesDir, { recursive: true });
    mkdirSync(resultsDir, { recursive: true });

    let spec: EvalSpec;

    // Step 1: Introspection + Analysis (or reuse a previously generated spec).
    if (options.skipAnalyze && existsSync(specPath)) {
      log('📋 Using existing EvalSpec...');
      spec = JSON.parse(readFileSync(specPath, 'utf-8'));
      log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
    } else {
      log('🔬 Step 1: Introspecting codebase...');
      try {
        const repoSummary = await analyze({
          root: absolutePath,
          onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
        });
        log(` Files: ${repoSummary.files.length}`);
        log(` Languages: ${repoSummary.languages.join(', ')}`);
        log('');
        log('🤖 Step 2: Generating EvalSpec with Claude...\n');

        const focus = options.focus?.split(',').map(s => s.trim());
        let result;
        if (options.interactive) {
          // inquirer is only needed in interactive mode, so load it lazily.
          const { default: inquirer } = await import('inquirer');
          result = await generateEvalSpecInteractive(
            repoSummary,
            async (question: string) => {
              const { answer } = await inquirer.prompt([{
                type: 'input',
                name: 'answer',
                message: `🤖 Claude asks: ${question}`,
              }]);
              return answer;
            },
            { focus, maxScenarios }
          );
        } else {
          result = await generateEvalSpec(repoSummary, {
            interactive: false,
            focus,
            maxScenarios,
          });
        }
        spec = result.spec;

        // Persist the spec so later runs can reuse it via --skip-analyze.
        writeFileSync(specPath, JSON.stringify(spec, null, 2));
        log(`\n✅ EvalSpec generated!`);
        log(` Scenarios: ${spec.scenarios.length}`);
        log(` Tokens: ${result.tokensUsed}`);
        log(` Saved: ${specPath}\n`);
      } catch (error) {
        console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }

    // Resolve the framework once for all remaining steps (the original
    // recomputed the same expression in the render, run, and promptfoo steps).
    const framework = resolveFramework(options.framework, spec);

    // Step 2: Render tests from the spec into options.testDir.
    if (!options.skipRender) {
      log('📝 Step 3: Rendering test files...');
      try {
        const renderResult = await renderSpec(spec, {
          outputDir: options.testDir,
          framework,
          includeFixtures: true,
          generateMocks: true,
          dryRun: false,
        });
        log(` Framework: ${framework}`);
        log(` Files: ${renderResult.stats.fileCount}`);
        log(` Scenarios: ${renderResult.stats.scenarioCount}`);
        log(` Assertions: ${renderResult.stats.assertionCount}`);
        log(` Output: ${options.testDir}\n`);
      } catch (error) {
        console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }

    // Step 3: Run tests, recording a trace of the whole run.
    if (!options.skipRun) {
      log('🧪 Step 4: Running tests...\n');
      try {
        const tracer = createTracer(spec.repo.name);
        tracer.recordIntrospection({
          filesAnalyzed: spec.scenarios.map(s => s.target.module),
          totalFunctions: spec.scenarios.length,
          duration: 0,
        });
        tracer.recordGeneration({
          scenariosGenerated: spec.scenarios.length,
          filesWritten: [options.testDir],
        });

        const result = await runTests(
          options.testDir,
          {
            framework,
            sandbox: true,
            timeout: 300000,
            parallel: false,
            cwd: process.cwd(),
          },
          DEFAULT_SANDBOX_CONFIG
        );

        tracer.recordExecution({
          testsPassed: result.summary.passed,
          testsFailed: result.summary.failed,
          testsSkipped: result.summary.skipped,
        });
        // Record every failing/erroring test so `evaluclaude view` can show it.
        for (const test of result.tests) {
          if (test.status === 'failed' || test.status === 'error') {
            tracer.recordTestFailure({
              scenarioId: test.id,
              testName: test.name,
              error: test.error?.message || 'Unknown error',
              stack: test.error?.stack,
            });
          }
        }

        const trace = tracer.finalize();
        const tracePath = await saveTrace(trace);
        log(formatResults(result));
        log(`📊 Trace saved: ${tracePath}`);
        log(` View with: evaluclaude view ${trace.id}\n`);

        // Save raw results alongside the trace, keyed by timestamp.
        const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
        writeFileSync(resultsPath, JSON.stringify(result, null, 2));
      } catch (error) {
        console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }

    // Step 4 (optional): Generate Promptfoo config. Failures here are
    // non-fatal — the pipeline already produced its core artifacts.
    if (options.promptfoo) {
      log('📦 Step 5: Generating Promptfoo configuration...');
      try {
        const configPath = join(outputDir, 'promptfooconfig.yaml');
        const providerPath = join(outputDir, 'providers', 'test-runner.py');
        await generatePromptfooConfig(spec, {
          testDir: options.testDir,
          outputPath: configPath,
          framework,
          includeTraceLinks: true,
          providerPath,
        });
        await generateTestProvider(providerPath);
        log(` Config: ${configPath}`);
        log(` Provider: ${providerPath}`);
        log(`\n Launch UI with: evaluclaude ui\n`);
      } catch (error) {
        console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
      }
    }

    console.log('═'.repeat(50));
    console.log('✅ Pipeline complete!');
    console.log('═'.repeat(50));
    console.log(`\nNext steps:`);
    console.log(` View traces: evaluclaude view --last`);
    console.log(` List all traces: evaluclaude traces`);
    if (options.promptfoo) {
      console.log(` Launch UI: evaluclaude ui`);
      console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
    }
    console.log('');
  });

View file

@ -10,6 +10,7 @@ import {
DEFAULT_SANDBOX_CONFIG DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js'; } from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js'; import { createTracer, saveTrace } from '../../observability/index.js';
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
export const runCommand = new Command('run') export const runCommand = new Command('run')
@ -25,6 +26,7 @@ export const runCommand = new Command('run')
.option('-o, --output <file>', 'Output results to JSON file') .option('-o, --output <file>', 'Output results to JSON file')
.option('--trace', 'Record execution trace', true) .option('--trace', 'Record execution trace', true)
.option('--no-trace', 'Disable execution tracing') .option('--no-trace', 'Disable execution tracing')
.option('--export-promptfoo', 'Export results in Promptfoo format', false)
.option('-w, --watch', 'Watch mode (rerun on changes)', false) .option('-w, --watch', 'Watch mode (rerun on changes)', false)
.action(async (testDir: string, options) => { .action(async (testDir: string, options) => {
try { try {
@ -109,6 +111,16 @@ export const runCommand = new Command('run')
console.log(`\n📁 Results saved to: ${options.output}`); console.log(`\n📁 Results saved to: ${options.output}`);
} }
// Export to Promptfoo format for UI viewing
if (options.exportPromptfoo) {
const exportPath = await exportToPromptfooFormat(result, spec, {
outputDir: '.evaluclaude/results',
evalId: `eval-${Date.now()}`,
});
console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
console.log(` View with: evaluclaude ui`);
}
if (tracer) { if (tracer) {
const trace = tracer.finalize(); const trace = tracer.finalize();
const tracePath = await saveTrace(trace); const tracePath = await saveTrace(trace);

View file

@ -1,7 +1,7 @@
import { Command } from 'commander'; import { Command } from 'commander';
import { spawn, type ChildProcess } from 'child_process'; import { spawn } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs'; import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path'; import { join, dirname, resolve as resolvePath } from 'path';
import type { EvalSpec } from '../../analyzer/types.js'; import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE); const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
// If spec provided with --generate, create/update Promptfoo config
if (options.spec && options.generate) { if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...'); console.log('\n📄 Generating Promptfoo configuration...');
@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
outputPath: configPath, outputPath: configPath,
framework: detectFramework(spec), framework: detectFramework(spec),
includeTraceLinks: true, includeTraceLinks: true,
providerPath: providerPath,
}); });
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
console.log(` Provider: ${providerPath}`); console.log(` Provider: ${providerPath}`);
} }
// Check for existing config, create default if missing
if (!existsSync(configPath)) { if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.'); console.log('\n⚠ No Promptfoo config found.');
console.log(' Run with --spec <file> --generate to create one.\n'); console.log(' Creating default configuration...\n');
console.log(' Or create one manually:');
console.log(` ${configPath}\n`);
await createDefaultConfig(configPath, providerPath); await createDefaultConfig(configPath, providerPath);
console.log(` Created default config at ${configPath}`); console.log(` Created: ${configPath}`);
}
// Check for results to display
const resultsDir = join(EVALUCLAUDE_DIR, 'results');
const latestResults = join(resultsDir, 'latest.json');
if (!existsSync(latestResults)) {
console.log('\n⚠ No evaluation results found.');
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
console.log(' Or run the full pipeline:');
console.log(' evaluclaude pipeline <path> --promptfoo\n');
} }
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`); console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
console.log(` Config: ${configPath}\n`); console.log(` Results: ${latestResults}\n`);
await launchPromptfooUI(port, configPath, options.open); // Use promptfoo view with the results file
await launchPromptfooView(port, latestResults, options.open);
} catch (error) { } catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error); console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1); process.exit(1);
@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results') .option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
.option('--view', 'Launch UI after evaluation', false) .option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000') .option('-p, --port <port>', 'Port for UI', '3000')
.option('--no-cache', 'Disable Promptfoo caching', false)
.action(async (options) => { .action(async (options) => {
try { try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE); const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
// Generate config from spec if provided
if (options.spec) { if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...'); console.log('\n📄 Generating Promptfoo configuration from spec...');
if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`);
process.exit(1);
}
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8')); const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, { await generatePromptfooConfig(spec, {
@ -84,30 +106,57 @@ export const evalCommand = new Command('eval')
outputPath: configPath, outputPath: configPath,
framework: detectFramework(spec), framework: detectFramework(spec),
includeTraceLinks: true, includeTraceLinks: true,
providerPath: providerPath,
}); });
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`);
console.log(` Provider: ${providerPath}`);
console.log(` Scenarios: ${spec.scenarios.length}`);
} }
if (!existsSync(configPath)) { if (!existsSync(configPath)) {
console.error(`Error: Config not found: ${configPath}`); console.error(`\nError: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec.'); console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.');
process.exit(1); process.exit(1);
} }
console.log('\n🧪 Running Promptfoo evaluations...\n'); // Ensure output directory exists
mkdirSync(options.output, { recursive: true });
console.log('\n🧪 Running Promptfoo evaluations...');
console.log(` Config: ${configPath}`);
console.log(` Output: ${options.output}\n`);
const outputFile = join(options.output, `eval-${Date.now()}.json`); const outputFile = join(options.output, `eval-${Date.now()}.json`);
mkdirSync(dirname(outputFile), { recursive: true });
await runPromptfooEval(configPath, outputFile); const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache);
console.log(`\n📁 Results saved: ${outputFile}`); if (exitCode === 0) {
console.log(`\n✅ Evaluation complete!`);
console.log(`📁 Results: ${outputFile}`);
} else {
console.log(`\n⚠ Evaluation finished with exit code ${exitCode}`);
console.log(`📁 Results: ${outputFile}`);
}
// List traces generated during evaluation
const tracesDir = join(EVALUCLAUDE_DIR, 'traces');
if (existsSync(tracesDir)) {
const { readdirSync } = await import('fs');
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
if (traces.length > 0) {
console.log(`\n📊 Traces generated: ${traces.length}`);
console.log(` View with: evaluclaude view --last`);
}
}
if (options.view) { if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`); console.log(`\n🚀 Launching UI on port ${options.port}...`);
await launchPromptfooUI(parseInt(options.port, 10), configPath, true); await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
} else {
console.log(`\n View results: evaluclaude ui`);
} }
} catch (error) { } catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error); console.error('Error running eval:', error instanceof Error ? error.message : error);
@ -115,6 +164,64 @@ export const evalCommand = new Command('eval')
} }
}); });
/**
* Launch Promptfoo view to display pre-computed results.
*/
async function launchPromptfooView(
port: number,
resultsFile: string,
openBrowser: boolean
): Promise<void> {
return new Promise((resolve, reject) => {
// Use 'promptfoo view' which opens the web UI showing results from the output directory
const resultsDir = dirname(resolvePath(resultsFile));
const args = ['promptfoo', 'view', '--port', String(port)];
if (openBrowser) {
args.push('-y');
} else {
args.push('-n');
}
// Pass the directory containing results
args.push(resultsDir);
console.log(` Running: npx ${args.join(' ')}\n`);
const child = spawn('npx', args, {
stdio: 'inherit',
env: { ...process.env },
});
child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.');
console.error(' Install with: npm install -g promptfoo');
console.error(' Or run: npx promptfoo --version\n');
} else {
reject(error);
}
});
child.on('close', (code) => {
if (code === 0) {
resolve();
} else {
reject(new Error(`Promptfoo exited with code ${code}`));
}
});
// Handle Ctrl+C gracefully
process.on('SIGINT', () => {
child.kill('SIGINT');
process.exit(0);
});
});
}
/**
* Launch Promptfoo with a config file (for running evals).
*/
async function launchPromptfooUI( async function launchPromptfooUI(
port: number, port: number,
configPath: string, configPath: string,
@ -129,7 +236,8 @@ async function launchPromptfooUI(
args.push('-n'); args.push('-n');
} }
const configDir = dirname(configPath); // Pass the directory containing the config
const configDir = dirname(resolvePath(configPath));
args.push(configDir); args.push(configDir);
console.log(` Running: npx ${args.join(' ')}\n`); console.log(` Running: npx ${args.join(' ')}\n`);
@ -141,7 +249,9 @@ async function launchPromptfooUI(
child.on('error', (error) => { child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') { if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo'); console.error('\n❌ Promptfoo not found.');
console.error(' Install with: npm install -g promptfoo');
console.error(' Or run: npx promptfoo --version\n');
} else { } else {
reject(error); reject(error);
} }
@ -155,6 +265,7 @@ async function launchPromptfooUI(
} }
}); });
// Handle Ctrl+C gracefully
process.on('SIGINT', () => { process.on('SIGINT', () => {
child.kill('SIGINT'); child.kill('SIGINT');
process.exit(0); process.exit(0);
@ -162,16 +273,23 @@ async function launchPromptfooUI(
}); });
} }
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> { async function runPromptfooEval(
configPath: string,
outputFile: string,
noCache: boolean
): Promise<number> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const args = [ const args = [
'promptfoo', 'promptfoo',
'eval', 'eval',
'-c', configPath, '-c', configPath,
'-o', outputFile, '-o', outputFile,
'--no-cache',
]; ];
if (noCache) {
args.push('--no-cache');
}
console.log(` Running: npx ${args.join(' ')}\n`); console.log(` Running: npx ${args.join(' ')}\n`);
const child = spawn('npx', args, { const child = spawn('npx', args, {
@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
env: { ...process.env }, env: { ...process.env },
}); });
child.on('error', reject); child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.');
console.error(' Install with: npm install -g promptfoo\n');
reject(error);
} else {
reject(error);
}
});
child.on('close', (code) => { child.on('close', (code) => {
if (code === 0) { resolve(code ?? 1);
resolve();
} else {
reject(new Error(`Promptfoo eval exited with code ${code}`));
}
}); });
}); });
} }
@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> { async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
const defaultConfig = `# Evaluclaude Promptfoo Configuration const defaultConfig = `# Evaluclaude Promptfoo Configuration
# Generated by evaluclaude # Generated by evaluclaude
#
# To populate this config from an EvalSpec:
# evaluclaude eval --spec <evalspec.json>
#
# Or run the full pipeline:
# evaluclaude analyze <path> -o spec.json
# evaluclaude render spec.json -o tests/generated
# evaluclaude eval --spec spec.json
description: "Evaluclaude functional test evaluations" description: "Evaluclaude functional test evaluations"
@ -204,12 +334,13 @@ providers:
test_dir: ./tests/generated test_dir: ./tests/generated
framework: pytest framework: pytest
timeout: 300 timeout: 300
sandbox: true
prompts: prompts:
- "{{scenario_id}}" - "{{scenario_id}}"
tests: tests:
- description: "Example test" - description: "Example test - replace with real scenarios"
vars: vars:
scenario_id: "test_example" scenario_id: "test_example"
assert: assert:
@ -219,12 +350,19 @@ tests:
result = json.loads(output) result = json.loads(output)
result.get('passed', 0) > 0 result.get('passed', 0) > 0
# Default test configuration
defaultTest:
metadata:
evaluclaude: true
tracesDir: .evaluclaude/traces
outputPath: .evaluclaude/results/promptfoo-results.json outputPath: .evaluclaude/results/promptfoo-results.json
`; `;
mkdirSync(dirname(configPath), { recursive: true }); mkdirSync(dirname(configPath), { recursive: true });
writeFileSync(configPath, defaultConfig); writeFileSync(configPath, defaultConfig);
// Also generate the provider
await generateTestProvider(providerPath); await generateTestProvider(providerPath);
} }
@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
if (spec.repo.languages.includes('python')) { if (spec.repo.languages.includes('python')) {
return 'pytest'; return 'pytest';
} }
if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) {
return 'vitest';
}
return 'vitest'; return 'vitest';
} }

View file

@ -8,6 +8,7 @@ import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/g
import { runCommand } from './commands/run.js'; import { runCommand } from './commands/run.js';
import { viewCommand, tracesCommand } from './commands/view.js'; import { viewCommand, tracesCommand } from './commands/view.js';
import { uiCommand, evalCommand } from './commands/ui.js'; import { uiCommand, evalCommand } from './commands/ui.js';
import { pipelineCommand } from './commands/pipeline.js';
const program = new Command(); const program = new Command();
@ -16,15 +17,25 @@ program
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.') .description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.')
.version('0.1.0'); .version('0.1.0');
// Core pipeline command - the "zero to evals" experience
program.addCommand(pipelineCommand);
// Individual step commands
program.addCommand(introCommand); program.addCommand(introCommand);
program.addCommand(analyzeCommand); program.addCommand(analyzeCommand);
program.addCommand(renderCommand); program.addCommand(renderCommand);
program.addCommand(runCommand);
// Grading commands
program.addCommand(gradeCommand); program.addCommand(gradeCommand);
program.addCommand(listRubricsCommand); program.addCommand(listRubricsCommand);
program.addCommand(calibrateCommand); program.addCommand(calibrateCommand);
program.addCommand(runCommand);
// Observability commands
program.addCommand(viewCommand); program.addCommand(viewCommand);
program.addCommand(tracesCommand); program.addCommand(tracesCommand);
// Promptfoo integration commands
program.addCommand(uiCommand); program.addCommand(uiCommand);
program.addCommand(evalCommand); program.addCommand(evalCommand);

View file

@ -64,7 +64,7 @@ export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}
lines.push('─'.repeat(40)); lines.push('─'.repeat(40));
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`); lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`); lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`); lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped ?? 0}`);
lines.push(''); lines.push('');
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) { if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {

View file

@ -1,5 +1,5 @@
import { writeFile, mkdir } from 'fs/promises'; import { writeFile, mkdir } from 'fs/promises';
import { dirname, join } from 'path'; import { dirname, join, resolve } from 'path';
import * as yaml from 'js-yaml'; import * as yaml from 'js-yaml';
import type { EvalSpec, EvalScenario } from '../analyzer/types.js'; import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js'; import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
@ -9,6 +9,7 @@ export interface ConfigOptions {
outputPath: string; outputPath: string;
framework: 'pytest' | 'vitest' | 'jest'; framework: 'pytest' | 'vitest' | 'jest';
includeTraceLinks: boolean; includeTraceLinks: boolean;
providerPath?: string;
} }
export async function generatePromptfooConfig( export async function generatePromptfooConfig(
@ -30,16 +31,23 @@ export async function generatePromptfooConfig(
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig { function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
const tests = spec.scenarios.map(scenario => buildTest(scenario, options)); const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
// Provider path should be relative to the config file location
// Since config is at .evaluclaude/promptfooconfig.yaml, the provider is at ./providers/test-runner.py
const providerRelativePath = options.providerPath
? options.providerPath.replace('.evaluclaude/', './').replace(/^\.evaluclaude\//, './')
: './providers/test-runner.py';
return { return {
description: `Evaluclaude functional tests for ${spec.repo.name}`, description: `Evaluclaude functional tests for ${spec.repo.name}`,
providers: [ providers: [
{ {
id: `file://providers/test-runner.py`, id: `file://${providerRelativePath}`,
label: 'functional-tests', label: 'functional-tests',
config: { config: {
test_dir: options.testDir, test_dir: resolve(options.testDir),
framework: options.framework, framework: options.framework,
timeout: 300, timeout: 300,
sandbox: true,
}, },
}, },
], ],
@ -48,11 +56,12 @@ function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
defaultTest: options.includeTraceLinks defaultTest: options.includeTraceLinks
? { ? {
metadata: { metadata: {
traceFile: '.evaluclaude/traces/{{evalId}}.json', evaluclaude: true,
tracesDir: './traces',
}, },
} }
: undefined, : undefined,
outputPath: '.evaluclaude/results/promptfoo-results.json', outputPath: './results/promptfoo-results.json',
}; };
} }
@ -147,91 +156,50 @@ function buildAssertion(assertion: any): PromptfooAssertion {
export async function generateTestProvider(outputPath: string): Promise<void> { export async function generateTestProvider(outputPath: string): Promise<void> {
const providerCode = `#!/usr/bin/env python3 const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results.""" """
Promptfoo provider that executes tests and returns structured results.
This provider integrates with evaluclaude-harness test runners to execute
functional tests in a sandboxed environment and return results compatible
with Promptfoo's assertion system.
"""
import subprocess import subprocess
import json import json
import sys import sys
import os import os
import tempfile
import uuid
from pathlib import Path
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
def call_api(prompt: str, options: dict, context: dict) -> dict:
"""Runs tests and returns structured results.""" """Runs tests and returns structured results."""
test_dir = options.get('config', {}).get('test_dir', './tests') config = options.get('config', {})
framework = options.get('config', {}).get('framework', 'pytest') test_dir = config.get('test_dir', './tests/generated')
timeout = options.get('config', {}).get('timeout', 300) framework = config.get('framework', 'pytest')
timeout = config.get('timeout', 300)
sandbox = config.get('sandbox', True)
scenario_id = prompt.strip() scenario_id = prompt.strip()
eval_id = f"eval-{uuid.uuid4().hex[:8]}"
# Ensure traces directory exists
traces_dir = Path('.evaluclaude/traces')
traces_dir.mkdir(parents=True, exist_ok=True)
try: try:
if framework == 'pytest': if framework == 'pytest':
result = subprocess.run( output = run_pytest(test_dir, scenario_id, timeout, eval_id)
[
'python', '-m', 'pytest',
'--json-report',
'--json-report-file=/tmp/pytest_results.json',
'-k', scenario_id,
test_dir
],
capture_output=True,
text=True,
timeout=timeout
)
try:
with open('/tmp/pytest_results.json') as f:
report = json.load(f)
output = {
'passed': report.get('summary', {}).get('passed', 0),
'failed': report.get('summary', {}).get('failed', 0),
'skipped': report.get('summary', {}).get('skipped', 0),
'tests': report.get('tests', []),
'stdout': result.stdout,
'stderr': result.stderr,
'exit_code': result.returncode,
}
except FileNotFoundError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to generate pytest report',
'stdout': result.stdout,
'stderr': result.stderr,
}
elif framework in ('vitest', 'jest'): elif framework in ('vitest', 'jest'):
cmd = ['npx', framework, 'run', '--reporter=json'] output = run_js_tests(test_dir, scenario_id, timeout, framework, eval_id)
if scenario_id:
cmd.extend(['--testNamePattern', scenario_id])
cmd.append(test_dir)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout
)
try:
report = json.loads(result.stdout)
output = {
'passed': report.get('numPassedTests', 0),
'failed': report.get('numFailedTests', 0),
'skipped': report.get('numSkippedTests', 0),
'tests': report.get('testResults', []),
'exit_code': result.returncode,
}
except json.JSONDecodeError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to parse test output',
'stdout': result.stdout,
'stderr': result.stderr,
}
else: else:
output = {'error': f'Unknown framework: {framework}'} output = {'error': f'Unknown framework: {framework}', 'passed': 0, 'failed': 1}
# Add trace reference
output['eval_id'] = eval_id
output['trace_file'] = str(traces_dir / f"{eval_id}.json")
return { return {
'output': json.dumps(output), 'output': json.dumps(output),
@ -240,32 +208,187 @@ def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return { return {
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}), 'output': json.dumps({
'error': 'Test execution timed out',
'passed': 0,
'failed': 1,
'eval_id': eval_id,
}),
'error': None, 'error': None,
} }
except Exception as e: except Exception as e:
return { return {
'output': None, 'output': json.dumps({
'error': str(e),
'passed': 0,
'failed': 1,
'eval_id': eval_id,
}),
'error': str(e), 'error': str(e),
} }
def run_pytest(test_dir: str, scenario_id: str, timeout: int, eval_id: str) -> dict:
"""Run pytest and return structured results."""
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
report_file = f.name
cmd = [
sys.executable, '-m', 'pytest',
'--json-report',
f'--json-report-file={report_file}',
'-v',
'--tb=short',
]
if scenario_id:
cmd.extend(['-k', scenario_id])
cmd.append(test_dir)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout,
cwd=os.getcwd(),
)
try:
with open(report_file) as f:
report = json.load(f)
summary = report.get('summary', {})
tests = report.get('tests', [])
output = {
'passed': summary.get('passed', 0),
'failed': summary.get('failed', 0),
'skipped': summary.get('skipped', 0),
'total': summary.get('total', 0),
'duration': report.get('duration', 0) * 1000, # Convert to ms
'tests': [
{
'id': extract_scenario_id(t.get('nodeid', '')),
'name': t.get('nodeid', ''),
'status': t.get('outcome', 'unknown'),
'duration': (t.get('call', {}).get('duration', 0) or 0) * 1000,
'error': t.get('call', {}).get('crash', {}).get('message') if t.get('call', {}).get('crash') else None,
}
for t in tests
],
'exit_code': result.returncode,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
output = {
'passed': 0,
'failed': 1,
'error': f'Failed to parse pytest report: {e}',
'stdout': result.stdout[-2000:] if result.stdout else '',
'stderr': result.stderr[-2000:] if result.stderr else '',
}
finally:
try:
os.unlink(report_file)
except OSError:
pass
return output
def run_js_tests(test_dir: str, scenario_id: str, timeout: int, framework: str, eval_id: str) -> dict:
"""Run vitest/jest and return structured results."""
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
report_file = f.name
cmd = ['npx', framework, 'run', '--reporter=json', f'--outputFile={report_file}']
if scenario_id:
cmd.extend(['--testNamePattern', scenario_id])
cmd.append(test_dir)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout,
cwd=os.getcwd(),
)
try:
with open(report_file) as f:
report = json.load(f)
output = {
'passed': report.get('numPassedTests', 0),
'failed': report.get('numFailedTests', 0),
'skipped': report.get('numSkippedTests', 0),
'total': report.get('numTotalTests', 0),
'tests': [],
'exit_code': result.returncode,
}
for test_file in report.get('testResults', []):
for assertion in test_file.get('assertionResults', []):
output['tests'].append({
'id': extract_scenario_id(assertion.get('fullName', '')),
'name': assertion.get('fullName', ''),
'status': assertion.get('status', 'unknown'),
'duration': assertion.get('duration', 0),
'error': assertion.get('failureMessages', [None])[0] if assertion.get('failureMessages') else None,
})
except (FileNotFoundError, json.JSONDecodeError) as e:
output = {
'passed': 0,
'failed': 1,
'error': f'Failed to parse {framework} report: {e}',
'stdout': result.stdout[-2000:] if result.stdout else '',
'stderr': result.stderr[-2000:] if result.stderr else '',
}
finally:
try:
os.unlink(report_file)
except OSError:
pass
return output
def extract_scenario_id(nodeid: str) -> str:
"""Extract scenario ID from test name."""
import re
match = re.search(r'test[_\\s]([a-zA-Z0-9_-]+)', nodeid, re.IGNORECASE)
return match.group(1) if match else nodeid.replace(' ', '_')
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
"""Alias for call_api for backwards compatibility."""
return call_api(prompt, options, context)
if __name__ == '__main__': if __name__ == '__main__':
# For testing the provider directly
import argparse import argparse
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser(description='Run tests for Promptfoo')
parser.add_argument('--scenario', default='') parser.add_argument('--scenario', default='', help='Scenario ID to filter')
parser.add_argument('--test-dir', default='./tests') parser.add_argument('--test-dir', default='./tests/generated', help='Test directory')
parser.add_argument('--framework', default='pytest') parser.add_argument('--framework', default='pytest', help='Test framework')
parser.add_argument('--timeout', type=int, default=300, help='Timeout in seconds')
args = parser.parse_args() args = parser.parse_args()
result = get_provider_response( result = call_api(
args.scenario, args.scenario,
{'config': {'test_dir': args.test_dir, 'framework': args.framework}}, {'config': {
'test_dir': args.test_dir,
'framework': args.framework,
'timeout': args.timeout,
}},
{} {}
) )
print(json.dumps(result, indent=2)) print(json.dumps(json.loads(result['output']), indent=2) if result['output'] else result['error'])
`; `;
await mkdir(dirname(outputPath), { recursive: true }); await mkdir(dirname(outputPath), { recursive: true });
await writeFile(outputPath, providerCode); await writeFile(outputPath, providerCode, { mode: 0o755 });
} }

View file

@ -1,2 +1,13 @@
export * from './types.js'; export * from './types.js';
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js'; export { generatePromptfooConfig, generateTestProvider, type ConfigOptions } from './config-generator.js';
export {
runTestsForPromptfoo,
savePromptfooResults,
type RunTestsForPromptfooOptions,
type PromptfooProviderResult,
} from './runner-bridge.js';
export {
exportToPromptfooFormat,
generateViewOnlyConfig,
type ExportOptions,
} from './results-exporter.js';

View file

@ -0,0 +1,127 @@
/**
* Export test execution results to Promptfoo format for viewing in the UI.
*
* Instead of using Promptfoo to run tests (which requires a provider that
* responds quickly), we run tests ourselves and export results to Promptfoo's
* result format. This allows us to use Promptfoo's excellent visualization UI.
*/
import { writeFile, mkdir } from 'fs/promises';
import { join } from 'path';
import type { ExecutionResult } from '../runners/types.js';
import type { EvalSpec } from '../analyzer/types.js';
import type { PromptfooResult, PromptfooTestResult } from './types.js';
/** Options controlling how execution results are exported for Promptfoo. */
export interface ExportOptions {
  /** Directory where the per-run result file and `latest.json` are written. */
  outputDir: string;
  /** Identifier for this evaluation run; defaults to a timestamp-based id. */
  evalId?: string;
  /** NOTE(review): not read anywhere in this module — confirm intent before relying on it. */
  includeSpec?: boolean;
}
/**
 * Export an ExecutionResult to Promptfoo's result-file format.
 *
 * Writes two files under `options.outputDir`:
 *  - `<evalId>.json` — the result for this specific run
 *  - `latest.json`   — a copy of the same payload (a portable stand-in for a
 *    symlink, so the Promptfoo viewer can always load the most recent run)
 *
 * @param result  Test execution results produced by our runners.
 * @param spec    Optional eval spec used to enrich results with scenario metadata.
 * @param options Output directory and optional eval id.
 * @returns Path of the per-run result file.
 */
export async function exportToPromptfooFormat(
  result: ExecutionResult,
  spec: EvalSpec | undefined,
  options: ExportOptions
): Promise<string> {
  const { outputDir, evalId = `eval-${Date.now()}` } = options;

  const promptfooResult = buildPromptfooResult(result, spec, evalId);
  // Serialize once; the identical payload is written to both files.
  const payload = JSON.stringify(promptfooResult, null, 2);

  await mkdir(outputDir, { recursive: true });

  const outputPath = join(outputDir, `${evalId}.json`);
  const latestPath = join(outputDir, 'latest.json');
  // The two writes are independent, so issue them concurrently.
  await Promise.all([
    writeFile(outputPath, payload),
    writeFile(latestPath, payload),
  ]);

  return outputPath;
}
/**
 * Convert an ExecutionResult (plus optional spec metadata) into the JSON
 * structure Promptfoo's result viewer expects.
 */
function buildPromptfooResult(
  result: ExecutionResult,
  spec: EvalSpec | undefined,
  evalId: string
): PromptfooResult {
  const testResults: PromptfooTestResult[] = result.tests.map(test => {
    // Match the executed test back to its declared scenario (by id, or by the
    // scenario id appearing in the test name).
    const scenario = spec?.scenarios.find(
      s => s.id === test.id || test.name.includes(s.id)
    );

    const passed = test.status === 'passed';
    const failureReason = test.error?.message || 'Test failed';
    const scenarioId = scenario?.id || test.id;

    // One component result per recorded assertion detail.
    const componentResults = test.assertions.details.map(detail => ({
      pass: detail.passed,
      score: detail.passed ? 1 : 0,
      reason: detail.description,
      assertion: {
        type: 'custom',
        value: detail.description,
      },
    }));

    return {
      prompt: {
        raw: scenarioId,
        label: scenario?.name || test.name,
      },
      vars: {
        scenario_id: scenarioId,
        target_module: scenario?.target.module || '',
        target_function: scenario?.target.function || '',
        description: scenario?.description || test.name,
      },
      response: {
        output: passed ? 'Test passed successfully' : failureReason,
      },
      gradingResult: {
        pass: passed,
        score: passed ? 1 : 0,
        reason: passed ? 'All assertions passed' : failureReason,
        componentResults,
      },
      success: passed,
      error: test.error?.message,
    };
  });

  return {
    version: 1,
    timestamp: new Date().toISOString(),
    results: testResults,
    stats: {
      successes: result.summary.passed,
      failures: result.summary.failed,
      tokenUsage: {
        total: 0,
        prompt: 0,
        completion: 0,
      },
    },
  };
}
/**
 * Generate a minimal Promptfoo config that just views results (no provider).
 *
 * The returned YAML defines no providers/prompts/tests: tests are executed by
 * `evaluclaude run`, which drops results at the `outputPath` referenced below.
 *
 * @param spec Eval spec; only `spec.repo.name` is used (for the description).
 * @returns YAML config text ready to be written to disk.
 */
export function generateViewOnlyConfig(spec: EvalSpec): string {
  return `# Evaluclaude Results Config
# This config is for viewing results only - tests are run via evaluclaude run
description: "Test results for ${spec.repo.name}"
# No providers needed - we pre-run tests and import results
providers: []
prompts: []
tests: []
# Results are stored here by evaluclaude run --export-promptfoo
outputPath: .evaluclaude/results/latest.json
`;
}

View file

@ -0,0 +1,194 @@
/**
* Bridge between our test runners and Promptfoo's provider interface.
*
* This module provides a unified way to run tests that works both:
* 1. Standalone via our `run` command
* 2. As a Promptfoo provider via the generated test-runner.py
*
* Results are stored in a format compatible with Promptfoo's expectations.
*/
import { writeFile, mkdir } from 'fs/promises';
import { join, dirname } from 'path';
import { runTests, type ExecutionResult, type ExecutionOptions, DEFAULT_SANDBOX_CONFIG } from '../runners/index.js';
import { createTracer, saveTrace, type EvalTrace } from '../observability/index.js';
/** Shape of the value a Promptfoo provider call must resolve with. */
export interface PromptfooProviderResult {
  /** JSON-encoded test results; Promptfoo's assertions run against this string. */
  output: string;
  /** Error message, or null when the provider itself completed. */
  error: string | null;
  /** Optional token accounting; presumably always zeros here since no LLM call is made — TODO confirm. */
  tokenUsage?: {
    total: number;
    prompt: number;
    completion: number;
  };
}
/** Options for running tests on behalf of a Promptfoo provider invocation. */
export interface RunTestsForPromptfooOptions {
  /** Scenario to filter on; an empty string runs the whole suite. */
  scenarioId: string;
  /** Directory containing the generated tests. */
  testDir: string;
  /** Test framework used to execute the suite. */
  framework: 'pytest' | 'vitest' | 'jest';
  /** Execution timeout in milliseconds (defaults to 300000). */
  timeout?: number;
  /** Whether to run inside the sandbox (defaults to true). */
  sandbox?: boolean;
  /** Identifier for this run; defaults to a timestamp-based id. */
  evalId?: string;
  /** Whether to record an observability trace (defaults to true). */
  recordTrace?: boolean;
}
/**
 * Run tests for a specific scenario and format results for Promptfoo.
 *
 * On success the resolved `output` is the JSON built by
 * `buildPromptfooOutput`; on failure it is a JSON error stub and `error`
 * carries the message. A trace is recorded and saved in both paths when
 * `recordTrace` is enabled.
 */
export async function runTestsForPromptfoo(
  options: RunTestsForPromptfooOptions
): Promise<PromptfooProviderResult> {
  const {
    scenarioId,
    testDir,
    framework,
    timeout = 300000,
    sandbox = true,
    evalId = `eval-${Date.now()}`,
    recordTrace = true,
  } = options;

  // Tracing is optional; every tracer interaction below is null-guarded.
  const tracer = recordTrace ? createTracer(evalId) : null;

  // Finalize and persist the trace exactly once per run, if tracing is on.
  const persistTrace = async (): Promise<void> => {
    if (!tracer) return;
    await saveTrace(tracer.finalize());
  };

  try {
    const execOptions: ExecutionOptions = {
      framework,
      sandbox,
      timeout,
      parallel: false,
      // An empty scenario id means "run everything".
      filter: scenarioId ? [scenarioId] : undefined,
      cwd: process.cwd(),
    };

    tracer?.recordIntrospection({
      filesAnalyzed: [testDir],
      duration: 0,
    });

    const result = await runTests(
      testDir,
      execOptions,
      sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
    );

    // Record summary counts plus one failure entry per failed/errored test.
    if (tracer) {
      tracer.recordExecution({
        testsPassed: result.summary.passed,
        testsFailed: result.summary.failed,
        testsSkipped: result.summary.skipped,
      });
      const failures = result.tests.filter(
        t => t.status === 'failed' || t.status === 'error'
      );
      for (const failure of failures) {
        tracer.recordTestFailure({
          scenarioId: failure.id,
          testName: failure.name,
          error: failure.error?.message || 'Unknown error',
          stack: failure.error?.stack,
        });
      }
    }

    const promptfooOutput = buildPromptfooOutput(result, scenarioId);
    await persistTrace();

    return {
      output: JSON.stringify(promptfooOutput),
      error: null,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    tracer?.recordError(error instanceof Error ? error : new Error(message));
    await persistTrace();

    return {
      output: JSON.stringify({
        passed: 0,
        failed: 1,
        error: message,
      }),
      error: message,
    };
  }
}
/**
 * Build Promptfoo-compatible output from ExecutionResult.
 *
 * When `scenarioId` is provided, only tests matching it (by id, or by the id
 * appearing in the test name) are counted and listed.
 */
function buildPromptfooOutput(
  result: ExecutionResult,
  scenarioId?: string
): Record<string, unknown> {
  const selected = scenarioId
    ? result.tests.filter(t => t.id === scenarioId || t.name.includes(scenarioId))
    : result.tests;

  // Count selected tests whose status is any of the given values.
  const countByStatus = (...statuses: string[]): number =>
    selected.filter(t => statuses.includes(t.status)).length;

  return {
    passed: countByStatus('passed'),
    failed: countByStatus('failed', 'error'),
    skipped: countByStatus('skipped'),
    total: selected.length,
    tests: selected.map(t => ({
      id: t.id,
      name: t.name,
      status: t.status,
      duration: t.duration,
      error: t.error?.message,
    })),
    summary: {
      ...result.summary,
      matchedScenario: scenarioId,
    },
    errors: result.errors,
  };
}
/**
 * Generate a Promptfoo-compatible results file from our execution results.
 *
 * @param result    Execution results to convert.
 * @param evalId    Run identifier, embedded in the output filename.
 * @param outputDir Destination directory (created if missing).
 * @returns Path of the written `promptfoo-<evalId>.json` file.
 */
export async function savePromptfooResults(
  result: ExecutionResult,
  evalId: string,
  outputDir: string = '.evaluclaude/results'
): Promise<string> {
  // One Promptfoo result entry per executed test.
  const results = result.tests.map(t => {
    const passed = t.status === 'passed';
    return {
      prompt: { raw: t.id, label: t.name },
      vars: { scenario_id: t.id },
      response: {
        output: passed ? 'PASS' : t.error?.message || 'FAIL',
      },
      gradingResult: {
        pass: passed,
        score: passed ? 1 : 0,
        reason: t.error?.message || (passed ? 'Test passed' : 'Test failed'),
      },
      success: passed,
      error: t.error?.message,
    };
  });

  const promptfooResult = {
    version: 1,
    timestamp: new Date().toISOString(),
    evalId,
    results,
    stats: {
      successes: result.summary.passed,
      failures: result.summary.failed,
    },
  };

  await mkdir(outputDir, { recursive: true });
  const outputPath = join(outputDir, `promptfoo-${evalId}.json`);
  await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));
  return outputPath;
}

View file

@ -73,8 +73,8 @@ export function formatResults(result: ExecutionResult): string {
lines.push(` Total: ${result.summary.total}`); lines.push(` Total: ${result.summary.total}`);
lines.push(` ✅ Passed: ${result.summary.passed}`); lines.push(` ✅ Passed: ${result.summary.passed}`);
lines.push(` ❌ Failed: ${result.summary.failed}`); lines.push(` ❌ Failed: ${result.summary.failed}`);
lines.push(` ⏭️ Skipped: ${result.summary.skipped}`); lines.push(` ⏭️ Skipped: ${result.summary.skipped ?? 0}`);
lines.push(` ⏱️ Duration: ${result.summary.duration}ms`); lines.push(` ⏱️ Duration: ${result.summary.duration || 0}ms`);
if (result.errors.length > 0) { if (result.errors.length > 0) {
lines.push(''); lines.push('');