mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-15 07:04:47 +00:00
improvements and promptfoo
This commit is contained in:
parent
6698c12e5b
commit
ff5300f4e0
13 changed files with 1082 additions and 117 deletions
32
rubrics/code-quality.yaml
Normal file
32
rubrics/code-quality.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
name: code-quality
description: Evaluates generated code for quality and maintainability
passingThreshold: 0.7

criteria:
  - name: readability
    weight: 0.3
    description: Code is easy to read and understand
    examples:
      good: "Clear variable names, logical flow, proper indentation"
      bad: "Single-letter variables, deeply nested logic, inconsistent style"

  - name: correctness
    weight: 0.4
    description: Code correctly implements the intended behavior
    examples:
      good: "Handles edge cases, correct algorithm, proper error handling"
      bad: "Missing edge cases, off-by-one errors, swallowed exceptions"

  - name: efficiency
    weight: 0.2
    description: Code uses appropriate data structures and algorithms
    examples:
      good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
      bad: "O(n²) when O(n) is possible, creates objects in tight loops"

  - name: maintainability
    weight: 0.1
    description: Code is easy to modify and extend
    examples:
      good: "Single responsibility, low coupling, clear interfaces"
      bad: "God functions, tight coupling, magic numbers"
|
||||||
32
rubrics/documentation.yaml
Normal file
32
rubrics/documentation.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
name: documentation
description: Evaluates quality of code documentation and docstrings
passingThreshold: 0.65

criteria:
  - name: completeness
    weight: 0.35
    description: Documentation covers all parameters, return values, and exceptions
    examples:
      good: "Fully documents args, returns, raises, and includes usage example"
      bad: "Missing parameter descriptions or return type"

  - name: accuracy
    weight: 0.35
    description: Documentation accurately describes the function's behavior
    examples:
      good: "Description matches implementation, types are correct"
      bad: "Outdated docs that don't match current behavior"

  - name: examples
    weight: 0.2
    description: Includes helpful usage examples
    examples:
      good: "Shows common use cases with expected outputs"
      bad: "No examples or only trivial ones"

  - name: style
    weight: 0.1
    description: Follows project/language documentation conventions
    examples:
      good: "Uses standard docstring format (Google, NumPy, or reStructuredText)"
      bad: "Inconsistent or non-standard format"
|
||||||
25
rubrics/error-messages.yaml
Normal file
25
rubrics/error-messages.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
name: error-messages
description: Evaluates quality of error messages
passingThreshold: 0.6

criteria:
  - name: clarity
    weight: 0.4
    description: Error message clearly explains what went wrong
    examples:
      good: "Invalid email format: 'not-an-email' is missing '@' symbol"
      bad: "Error: validation failed"

  - name: actionability
    weight: 0.4
    description: Error message suggests how to fix the problem
    examples:
      good: "File not found. Create the file or check the path spelling."
      bad: "ENOENT"

  - name: context
    weight: 0.2
    description: Error message includes relevant context (file, line, values)
    examples:
      good: "TypeError at line 42 in auth.py: expected str, got int (value=123)"
      bad: "type error"
|
||||||
257
src/cli/commands/pipeline.ts
Normal file
257
src/cli/commands/pipeline.ts
Normal file
|
|
@ -0,0 +1,257 @@
|
||||||
|
import { Command } from 'commander';
|
||||||
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||||
|
import { join, resolve } from 'path';
|
||||||
|
import { analyze } from '../../introspector/index.js';
|
||||||
|
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
|
||||||
|
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
|
||||||
|
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
|
||||||
|
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||||
|
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||||
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
|
|
||||||
|
const EVALUCLAUDE_DIR = '.evaluclaude';
|
||||||
|
|
||||||
|
interface PipelineOptions {
|
||||||
|
output?: string;
|
||||||
|
interactive?: boolean;
|
||||||
|
focus?: string;
|
||||||
|
maxScenarios: string;
|
||||||
|
testDir: string;
|
||||||
|
framework?: string;
|
||||||
|
skipAnalyze?: boolean;
|
||||||
|
skipRender?: boolean;
|
||||||
|
skipRun?: boolean;
|
||||||
|
promptfoo?: boolean;
|
||||||
|
quiet?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const pipelineCommand = new Command('pipeline')
|
||||||
|
.description('Run the full eval generation pipeline: introspect → analyze → render → run')
|
||||||
|
.argument('[path]', 'Path to the repository to analyze', '.')
|
||||||
|
.option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
|
||||||
|
.option('-i, --interactive', 'Enable interactive mode with clarifying questions')
|
||||||
|
.option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
|
||||||
|
.option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
|
||||||
|
.option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
|
||||||
|
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||||
|
.option('--skip-analyze', 'Skip analysis, use existing spec')
|
||||||
|
.option('--skip-render', 'Skip rendering, use existing tests')
|
||||||
|
.option('--skip-run', 'Skip test execution')
|
||||||
|
.option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
|
||||||
|
.option('--quiet', 'Suppress progress messages')
|
||||||
|
.action(async (repoPath: string, options: PipelineOptions) => {
|
||||||
|
const absolutePath = resolve(repoPath);
|
||||||
|
const log = options.quiet ? () => {} : console.log;
|
||||||
|
const outputDir = options.output || EVALUCLAUDE_DIR;
|
||||||
|
|
||||||
|
console.log('\n🚀 Evaluclaude Pipeline');
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log(` Repository: ${absolutePath}`);
|
||||||
|
console.log(` Output: ${outputDir}`);
|
||||||
|
console.log('═'.repeat(50) + '\n');
|
||||||
|
|
||||||
|
// Ensure output directories exist
|
||||||
|
mkdirSync(outputDir, { recursive: true });
|
||||||
|
mkdirSync(options.testDir, { recursive: true });
|
||||||
|
|
||||||
|
const specPath = join(outputDir, 'spec.json');
|
||||||
|
const tracesDir = join(outputDir, 'traces');
|
||||||
|
const resultsDir = join(outputDir, 'results');
|
||||||
|
|
||||||
|
mkdirSync(tracesDir, { recursive: true });
|
||||||
|
mkdirSync(resultsDir, { recursive: true });
|
||||||
|
|
||||||
|
let spec: EvalSpec;
|
||||||
|
|
||||||
|
// Step 1: Introspection + Analysis
|
||||||
|
if (options.skipAnalyze && existsSync(specPath)) {
|
||||||
|
log('📋 Using existing EvalSpec...');
|
||||||
|
spec = JSON.parse(readFileSync(specPath, 'utf-8'));
|
||||||
|
log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
|
||||||
|
} else {
|
||||||
|
log('🔬 Step 1: Introspecting codebase...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const repoSummary = await analyze({
|
||||||
|
root: absolutePath,
|
||||||
|
onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
|
||||||
|
});
|
||||||
|
|
||||||
|
log(` Files: ${repoSummary.files.length}`);
|
||||||
|
log(` Languages: ${repoSummary.languages.join(', ')}`);
|
||||||
|
log('');
|
||||||
|
|
||||||
|
log('🤖 Step 2: Generating EvalSpec with Claude...\n');
|
||||||
|
|
||||||
|
const focus = options.focus?.split(',').map(s => s.trim());
|
||||||
|
const maxScenarios = parseInt(options.maxScenarios, 10);
|
||||||
|
|
||||||
|
let result;
|
||||||
|
if (options.interactive) {
|
||||||
|
const { default: inquirer } = await import('inquirer');
|
||||||
|
|
||||||
|
result = await generateEvalSpecInteractive(
|
||||||
|
repoSummary,
|
||||||
|
async (question: string) => {
|
||||||
|
const { answer } = await inquirer.prompt([{
|
||||||
|
type: 'input',
|
||||||
|
name: 'answer',
|
||||||
|
message: `🤖 Claude asks: ${question}`,
|
||||||
|
}]);
|
||||||
|
return answer;
|
||||||
|
},
|
||||||
|
{ focus, maxScenarios }
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
result = await generateEvalSpec(repoSummary, {
|
||||||
|
interactive: false,
|
||||||
|
focus,
|
||||||
|
maxScenarios,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
spec = result.spec;
|
||||||
|
|
||||||
|
// Save the spec
|
||||||
|
writeFileSync(specPath, JSON.stringify(spec, null, 2));
|
||||||
|
|
||||||
|
log(`\n✅ EvalSpec generated!`);
|
||||||
|
log(` Scenarios: ${spec.scenarios.length}`);
|
||||||
|
log(` Tokens: ${result.tokensUsed}`);
|
||||||
|
log(` Saved: ${specPath}\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Render tests
|
||||||
|
if (!options.skipRender) {
|
||||||
|
log('📝 Step 3: Rendering test files...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
|
||||||
|
const renderResult = await renderSpec(spec, {
|
||||||
|
outputDir: options.testDir,
|
||||||
|
framework,
|
||||||
|
includeFixtures: true,
|
||||||
|
generateMocks: true,
|
||||||
|
dryRun: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
log(` Framework: ${framework}`);
|
||||||
|
log(` Files: ${renderResult.stats.fileCount}`);
|
||||||
|
log(` Scenarios: ${renderResult.stats.scenarioCount}`);
|
||||||
|
log(` Assertions: ${renderResult.stats.assertionCount}`);
|
||||||
|
log(` Output: ${options.testDir}\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: Run tests
|
||||||
|
if (!options.skipRun) {
|
||||||
|
log('🧪 Step 4: Running tests...\n');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
const tracer = createTracer(spec.repo.name);
|
||||||
|
|
||||||
|
tracer.recordIntrospection({
|
||||||
|
filesAnalyzed: spec.scenarios.map(s => s.target.module),
|
||||||
|
totalFunctions: spec.scenarios.length,
|
||||||
|
duration: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
tracer.recordGeneration({
|
||||||
|
scenariosGenerated: spec.scenarios.length,
|
||||||
|
filesWritten: [options.testDir],
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runTests(
|
||||||
|
options.testDir,
|
||||||
|
{
|
||||||
|
framework,
|
||||||
|
sandbox: true,
|
||||||
|
timeout: 300000,
|
||||||
|
parallel: false,
|
||||||
|
cwd: process.cwd(),
|
||||||
|
},
|
||||||
|
DEFAULT_SANDBOX_CONFIG
|
||||||
|
);
|
||||||
|
|
||||||
|
tracer.recordExecution({
|
||||||
|
testsPassed: result.summary.passed,
|
||||||
|
testsFailed: result.summary.failed,
|
||||||
|
testsSkipped: result.summary.skipped,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const test of result.tests) {
|
||||||
|
if (test.status === 'failed' || test.status === 'error') {
|
||||||
|
tracer.recordTestFailure({
|
||||||
|
scenarioId: test.id,
|
||||||
|
testName: test.name,
|
||||||
|
error: test.error?.message || 'Unknown error',
|
||||||
|
stack: test.error?.stack,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
const tracePath = await saveTrace(trace);
|
||||||
|
|
||||||
|
log(formatResults(result));
|
||||||
|
log(`📊 Trace saved: ${tracePath}`);
|
||||||
|
log(` View with: evaluclaude view ${trace.id}\n`);
|
||||||
|
|
||||||
|
// Save results
|
||||||
|
const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
|
||||||
|
writeFileSync(resultsPath, JSON.stringify(result, null, 2));
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: Generate Promptfoo config
|
||||||
|
if (options.promptfoo) {
|
||||||
|
log('📦 Step 5: Generating Promptfoo configuration...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const configPath = join(outputDir, 'promptfooconfig.yaml');
|
||||||
|
const providerPath = join(outputDir, 'providers', 'test-runner.py');
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
|
||||||
|
await generatePromptfooConfig(spec, {
|
||||||
|
testDir: options.testDir,
|
||||||
|
outputPath: configPath,
|
||||||
|
framework,
|
||||||
|
includeTraceLinks: true,
|
||||||
|
providerPath,
|
||||||
|
});
|
||||||
|
|
||||||
|
await generateTestProvider(providerPath);
|
||||||
|
|
||||||
|
log(` Config: ${configPath}`);
|
||||||
|
log(` Provider: ${providerPath}`);
|
||||||
|
log(`\n Launch UI with: evaluclaude ui\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log('✅ Pipeline complete!');
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log(`\nNext steps:`);
|
||||||
|
console.log(` View traces: evaluclaude view --last`);
|
||||||
|
console.log(` List all traces: evaluclaude traces`);
|
||||||
|
if (options.promptfoo) {
|
||||||
|
console.log(` Launch UI: evaluclaude ui`);
|
||||||
|
console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
|
||||||
|
}
|
||||||
|
console.log('');
|
||||||
|
});
|
||||||
|
|
@ -10,6 +10,7 @@ import {
|
||||||
DEFAULT_SANDBOX_CONFIG
|
DEFAULT_SANDBOX_CONFIG
|
||||||
} from '../../runners/index.js';
|
} from '../../runners/index.js';
|
||||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||||
|
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
|
||||||
import type { EvalSpec } from '../../analyzer/types.js';
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
|
|
||||||
export const runCommand = new Command('run')
|
export const runCommand = new Command('run')
|
||||||
|
|
@ -25,6 +26,7 @@ export const runCommand = new Command('run')
|
||||||
.option('-o, --output <file>', 'Output results to JSON file')
|
.option('-o, --output <file>', 'Output results to JSON file')
|
||||||
.option('--trace', 'Record execution trace', true)
|
.option('--trace', 'Record execution trace', true)
|
||||||
.option('--no-trace', 'Disable execution tracing')
|
.option('--no-trace', 'Disable execution tracing')
|
||||||
|
.option('--export-promptfoo', 'Export results in Promptfoo format', false)
|
||||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||||
.action(async (testDir: string, options) => {
|
.action(async (testDir: string, options) => {
|
||||||
try {
|
try {
|
||||||
|
|
@ -109,6 +111,16 @@ export const runCommand = new Command('run')
|
||||||
console.log(`\n📁 Results saved to: ${options.output}`);
|
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Export to Promptfoo format for UI viewing
|
||||||
|
if (options.exportPromptfoo) {
|
||||||
|
const exportPath = await exportToPromptfooFormat(result, spec, {
|
||||||
|
outputDir: '.evaluclaude/results',
|
||||||
|
evalId: `eval-${Date.now()}`,
|
||||||
|
});
|
||||||
|
console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
|
||||||
|
console.log(` View with: evaluclaude ui`);
|
||||||
|
}
|
||||||
|
|
||||||
if (tracer) {
|
if (tracer) {
|
||||||
const trace = tracer.finalize();
|
const trace = tracer.finalize();
|
||||||
const tracePath = await saveTrace(trace);
|
const tracePath = await saveTrace(trace);
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import { Command } from 'commander';
|
import { Command } from 'commander';
|
||||||
import { spawn, type ChildProcess } from 'child_process';
|
import { spawn } from 'child_process';
|
||||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||||
import { join, dirname } from 'path';
|
import { join, dirname, resolve as resolvePath } from 'path';
|
||||||
import type { EvalSpec } from '../../analyzer/types.js';
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||||
|
|
||||||
|
|
@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
|
||||||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||||
|
|
||||||
|
// If spec provided with --generate, create/update Promptfoo config
|
||||||
if (options.spec && options.generate) {
|
if (options.spec && options.generate) {
|
||||||
console.log('\n📄 Generating Promptfoo configuration...');
|
console.log('\n📄 Generating Promptfoo configuration...');
|
||||||
|
|
||||||
|
|
@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
|
||||||
outputPath: configPath,
|
outputPath: configPath,
|
||||||
framework: detectFramework(spec),
|
framework: detectFramework(spec),
|
||||||
includeTraceLinks: true,
|
includeTraceLinks: true,
|
||||||
|
providerPath: providerPath,
|
||||||
});
|
});
|
||||||
|
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
|
|
@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
|
||||||
console.log(` Provider: ${providerPath}`);
|
console.log(` Provider: ${providerPath}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for existing config, create default if missing
|
||||||
if (!existsSync(configPath)) {
|
if (!existsSync(configPath)) {
|
||||||
console.log('\n⚠️ No Promptfoo config found.');
|
console.log('\n⚠️ No Promptfoo config found.');
|
||||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
console.log(' Creating default configuration...\n');
|
||||||
console.log(' Or create one manually:');
|
|
||||||
console.log(` ${configPath}\n`);
|
|
||||||
|
|
||||||
await createDefaultConfig(configPath, providerPath);
|
await createDefaultConfig(configPath, providerPath);
|
||||||
console.log(` Created default config at ${configPath}`);
|
console.log(` Created: ${configPath}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for results to display
|
||||||
|
const resultsDir = join(EVALUCLAUDE_DIR, 'results');
|
||||||
|
const latestResults = join(resultsDir, 'latest.json');
|
||||||
|
|
||||||
|
if (!existsSync(latestResults)) {
|
||||||
|
console.log('\n⚠️ No evaluation results found.');
|
||||||
|
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
|
||||||
|
console.log(' Or run the full pipeline:');
|
||||||
|
console.log(' evaluclaude pipeline <path> --promptfoo\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||||
console.log(` Config: ${configPath}\n`);
|
console.log(` Results: ${latestResults}\n`);
|
||||||
|
|
||||||
await launchPromptfooUI(port, configPath, options.open);
|
// Use promptfoo view with the results file
|
||||||
|
await launchPromptfooView(port, latestResults, options.open);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
|
|
@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
|
||||||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||||
.option('--view', 'Launch UI after evaluation', false)
|
.option('--view', 'Launch UI after evaluation', false)
|
||||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||||
|
.option('--no-cache', 'Disable Promptfoo caching', false)
|
||||||
.action(async (options) => {
|
.action(async (options) => {
|
||||||
try {
|
try {
|
||||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||||
|
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||||
|
|
||||||
|
// Generate config from spec if provided
|
||||||
if (options.spec) {
|
if (options.spec) {
|
||||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||||
|
|
||||||
|
if (!existsSync(options.spec)) {
|
||||||
|
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||||
|
|
||||||
await generatePromptfooConfig(spec, {
|
await generatePromptfooConfig(spec, {
|
||||||
|
|
@ -84,30 +106,57 @@ export const evalCommand = new Command('eval')
|
||||||
outputPath: configPath,
|
outputPath: configPath,
|
||||||
framework: detectFramework(spec),
|
framework: detectFramework(spec),
|
||||||
includeTraceLinks: true,
|
includeTraceLinks: true,
|
||||||
|
providerPath: providerPath,
|
||||||
});
|
});
|
||||||
|
|
||||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
|
|
||||||
|
console.log(` Config: ${configPath}`);
|
||||||
|
console.log(` Provider: ${providerPath}`);
|
||||||
|
console.log(` Scenarios: ${spec.scenarios.length}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!existsSync(configPath)) {
|
if (!existsSync(configPath)) {
|
||||||
console.error(`Error: Config not found: ${configPath}`);
|
console.error(`\nError: Config not found: ${configPath}`);
|
||||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
// Ensure output directory exists
|
||||||
|
mkdirSync(options.output, { recursive: true });
|
||||||
|
|
||||||
|
console.log('\n🧪 Running Promptfoo evaluations...');
|
||||||
|
console.log(` Config: ${configPath}`);
|
||||||
|
console.log(` Output: ${options.output}\n`);
|
||||||
|
|
||||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||||
mkdirSync(dirname(outputFile), { recursive: true });
|
|
||||||
|
|
||||||
await runPromptfooEval(configPath, outputFile);
|
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache);
|
||||||
|
|
||||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
if (exitCode === 0) {
|
||||||
|
console.log(`\n✅ Evaluation complete!`);
|
||||||
|
console.log(`📁 Results: ${outputFile}`);
|
||||||
|
} else {
|
||||||
|
console.log(`\n⚠️ Evaluation finished with exit code ${exitCode}`);
|
||||||
|
console.log(`📁 Results: ${outputFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// List traces generated during evaluation
|
||||||
|
const tracesDir = join(EVALUCLAUDE_DIR, 'traces');
|
||||||
|
if (existsSync(tracesDir)) {
|
||||||
|
const { readdirSync } = await import('fs');
|
||||||
|
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
|
||||||
|
if (traces.length > 0) {
|
||||||
|
console.log(`\n📊 Traces generated: ${traces.length}`);
|
||||||
|
console.log(` View with: evaluclaude view --last`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (options.view) {
|
if (options.view) {
|
||||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||||
|
} else {
|
||||||
|
console.log(`\n View results: evaluclaude ui`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||||
|
|
@ -115,6 +164,64 @@ export const evalCommand = new Command('eval')
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Launch Promptfoo view to display pre-computed results.
|
||||||
|
*/
|
||||||
|
async function launchPromptfooView(
|
||||||
|
port: number,
|
||||||
|
resultsFile: string,
|
||||||
|
openBrowser: boolean
|
||||||
|
): Promise<void> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
// Use 'promptfoo view' which opens the web UI showing results from the output directory
|
||||||
|
const resultsDir = dirname(resolvePath(resultsFile));
|
||||||
|
const args = ['promptfoo', 'view', '--port', String(port)];
|
||||||
|
|
||||||
|
if (openBrowser) {
|
||||||
|
args.push('-y');
|
||||||
|
} else {
|
||||||
|
args.push('-n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass the directory containing results
|
||||||
|
args.push(resultsDir);
|
||||||
|
|
||||||
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
||||||
|
const child = spawn('npx', args, {
|
||||||
|
stdio: 'inherit',
|
||||||
|
env: { ...process.env },
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('error', (error) => {
|
||||||
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo');
|
||||||
|
console.error(' Or run: npx promptfoo --version\n');
|
||||||
|
} else {
|
||||||
|
reject(error);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('close', (code) => {
|
||||||
|
if (code === 0) {
|
||||||
|
resolve();
|
||||||
|
} else {
|
||||||
|
reject(new Error(`Promptfoo exited with code ${code}`));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle Ctrl+C gracefully
|
||||||
|
process.on('SIGINT', () => {
|
||||||
|
child.kill('SIGINT');
|
||||||
|
process.exit(0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Launch Promptfoo with a config file (for running evals).
|
||||||
|
*/
|
||||||
async function launchPromptfooUI(
|
async function launchPromptfooUI(
|
||||||
port: number,
|
port: number,
|
||||||
configPath: string,
|
configPath: string,
|
||||||
|
|
@ -129,7 +236,8 @@ async function launchPromptfooUI(
|
||||||
args.push('-n');
|
args.push('-n');
|
||||||
}
|
}
|
||||||
|
|
||||||
const configDir = dirname(configPath);
|
// Pass the directory containing the config
|
||||||
|
const configDir = dirname(resolvePath(configPath));
|
||||||
args.push(configDir);
|
args.push(configDir);
|
||||||
|
|
||||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
@ -141,7 +249,9 @@ async function launchPromptfooUI(
|
||||||
|
|
||||||
child.on('error', (error) => {
|
child.on('error', (error) => {
|
||||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo');
|
||||||
|
console.error(' Or run: npx promptfoo --version\n');
|
||||||
} else {
|
} else {
|
||||||
reject(error);
|
reject(error);
|
||||||
}
|
}
|
||||||
|
|
@ -155,6 +265,7 @@ async function launchPromptfooUI(
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Handle Ctrl+C gracefully
|
||||||
process.on('SIGINT', () => {
|
process.on('SIGINT', () => {
|
||||||
child.kill('SIGINT');
|
child.kill('SIGINT');
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
|
|
@ -162,16 +273,23 @@ async function launchPromptfooUI(
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
async function runPromptfooEval(
|
||||||
|
configPath: string,
|
||||||
|
outputFile: string,
|
||||||
|
noCache: boolean
|
||||||
|
): Promise<number> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const args = [
|
const args = [
|
||||||
'promptfoo',
|
'promptfoo',
|
||||||
'eval',
|
'eval',
|
||||||
'-c', configPath,
|
'-c', configPath,
|
||||||
'-o', outputFile,
|
'-o', outputFile,
|
||||||
'--no-cache',
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
if (noCache) {
|
||||||
|
args.push('--no-cache');
|
||||||
|
}
|
||||||
|
|
||||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
||||||
const child = spawn('npx', args, {
|
const child = spawn('npx', args, {
|
||||||
|
|
@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
||||||
env: { ...process.env },
|
env: { ...process.env },
|
||||||
});
|
});
|
||||||
|
|
||||||
child.on('error', reject);
|
child.on('error', (error) => {
|
||||||
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo\n');
|
||||||
|
reject(error);
|
||||||
|
} else {
|
||||||
|
reject(error);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
child.on('close', (code) => {
|
child.on('close', (code) => {
|
||||||
if (code === 0) {
|
resolve(code ?? 1);
|
||||||
resolve();
|
|
||||||
} else {
|
|
||||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
||||||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||||
# Generated by evaluclaude
|
# Generated by evaluclaude
|
||||||
|
#
|
||||||
|
# To populate this config from an EvalSpec:
|
||||||
|
# evaluclaude eval --spec <evalspec.json>
|
||||||
|
#
|
||||||
|
# Or run the full pipeline:
|
||||||
|
# evaluclaude analyze <path> -o spec.json
|
||||||
|
# evaluclaude render spec.json -o tests/generated
|
||||||
|
# evaluclaude eval --spec spec.json
|
||||||
|
|
||||||
description: "Evaluclaude functional test evaluations"
|
description: "Evaluclaude functional test evaluations"
|
||||||
|
|
||||||
|
|
@ -204,12 +334,13 @@ providers:
|
||||||
test_dir: ./tests/generated
|
test_dir: ./tests/generated
|
||||||
framework: pytest
|
framework: pytest
|
||||||
timeout: 300
|
timeout: 300
|
||||||
|
sandbox: true
|
||||||
|
|
||||||
prompts:
|
prompts:
|
||||||
- "{{scenario_id}}"
|
- "{{scenario_id}}"
|
||||||
|
|
||||||
tests:
|
tests:
|
||||||
- description: "Example test"
|
- description: "Example test - replace with real scenarios"
|
||||||
vars:
|
vars:
|
||||||
scenario_id: "test_example"
|
scenario_id: "test_example"
|
||||||
assert:
|
assert:
|
||||||
|
|
@ -219,12 +350,19 @@ tests:
|
||||||
result = json.loads(output)
|
result = json.loads(output)
|
||||||
result.get('passed', 0) > 0
|
result.get('passed', 0) > 0
|
||||||
|
|
||||||
|
# Default test configuration
|
||||||
|
defaultTest:
|
||||||
|
metadata:
|
||||||
|
evaluclaude: true
|
||||||
|
tracesDir: .evaluclaude/traces
|
||||||
|
|
||||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||||
`;
|
`;
|
||||||
|
|
||||||
mkdirSync(dirname(configPath), { recursive: true });
|
mkdirSync(dirname(configPath), { recursive: true });
|
||||||
writeFileSync(configPath, defaultConfig);
|
writeFileSync(configPath, defaultConfig);
|
||||||
|
|
||||||
|
// Also generate the provider
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
||||||
if (spec.repo.languages.includes('python')) {
|
if (spec.repo.languages.includes('python')) {
|
||||||
return 'pytest';
|
return 'pytest';
|
||||||
}
|
}
|
||||||
|
if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) {
|
||||||
|
return 'vitest';
|
||||||
|
}
|
||||||
return 'vitest';
|
return 'vitest';
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/g
|
||||||
import { runCommand } from './commands/run.js';
|
import { runCommand } from './commands/run.js';
|
||||||
import { viewCommand, tracesCommand } from './commands/view.js';
|
import { viewCommand, tracesCommand } from './commands/view.js';
|
||||||
import { uiCommand, evalCommand } from './commands/ui.js';
|
import { uiCommand, evalCommand } from './commands/ui.js';
|
||||||
|
import { pipelineCommand } from './commands/pipeline.js';
|
||||||
|
|
||||||
const program = new Command();
|
const program = new Command();
|
||||||
|
|
||||||
|
|
@ -16,15 +17,25 @@ program
|
||||||
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.')
|
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.')
|
||||||
.version('0.1.0');
|
.version('0.1.0');
|
||||||
|
|
||||||
|
// Core pipeline command - the "zero to evals" experience
|
||||||
|
program.addCommand(pipelineCommand);
|
||||||
|
|
||||||
|
// Individual step commands
|
||||||
program.addCommand(introCommand);
|
program.addCommand(introCommand);
|
||||||
program.addCommand(analyzeCommand);
|
program.addCommand(analyzeCommand);
|
||||||
program.addCommand(renderCommand);
|
program.addCommand(renderCommand);
|
||||||
|
program.addCommand(runCommand);
|
||||||
|
|
||||||
|
// Grading commands
|
||||||
program.addCommand(gradeCommand);
|
program.addCommand(gradeCommand);
|
||||||
program.addCommand(listRubricsCommand);
|
program.addCommand(listRubricsCommand);
|
||||||
program.addCommand(calibrateCommand);
|
program.addCommand(calibrateCommand);
|
||||||
program.addCommand(runCommand);
|
|
||||||
|
// Observability commands
|
||||||
program.addCommand(viewCommand);
|
program.addCommand(viewCommand);
|
||||||
program.addCommand(tracesCommand);
|
program.addCommand(tracesCommand);
|
||||||
|
|
||||||
|
// Promptfoo integration commands
|
||||||
program.addCommand(uiCommand);
|
program.addCommand(uiCommand);
|
||||||
program.addCommand(evalCommand);
|
program.addCommand(evalCommand);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}
|
||||||
lines.push('─'.repeat(40));
|
lines.push('─'.repeat(40));
|
||||||
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
|
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
|
||||||
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
|
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
|
||||||
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`);
|
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped ?? 0}`);
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
|
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import { writeFile, mkdir } from 'fs/promises';
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
import { dirname, join } from 'path';
|
import { dirname, join, resolve } from 'path';
|
||||||
import * as yaml from 'js-yaml';
|
import * as yaml from 'js-yaml';
|
||||||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
||||||
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
||||||
|
|
@ -9,6 +9,7 @@ export interface ConfigOptions {
|
||||||
outputPath: string;
|
outputPath: string;
|
||||||
framework: 'pytest' | 'vitest' | 'jest';
|
framework: 'pytest' | 'vitest' | 'jest';
|
||||||
includeTraceLinks: boolean;
|
includeTraceLinks: boolean;
|
||||||
|
providerPath?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generatePromptfooConfig(
|
export async function generatePromptfooConfig(
|
||||||
|
|
@ -30,16 +31,23 @@ export async function generatePromptfooConfig(
|
||||||
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||||
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
||||||
|
|
||||||
|
// Provider path should be relative to the config file location
|
||||||
|
// Since config is at .evaluclaude/promptfooconfig.yaml, the provider is at ./providers/test-runner.py
|
||||||
|
const providerRelativePath = options.providerPath
|
||||||
|
? options.providerPath.replace('.evaluclaude/', './').replace(/^\.evaluclaude\//, './')
|
||||||
|
: './providers/test-runner.py';
|
||||||
|
|
||||||
return {
|
return {
|
||||||
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
||||||
providers: [
|
providers: [
|
||||||
{
|
{
|
||||||
id: `file://providers/test-runner.py`,
|
id: `file://${providerRelativePath}`,
|
||||||
label: 'functional-tests',
|
label: 'functional-tests',
|
||||||
config: {
|
config: {
|
||||||
test_dir: options.testDir,
|
test_dir: resolve(options.testDir),
|
||||||
framework: options.framework,
|
framework: options.framework,
|
||||||
timeout: 300,
|
timeout: 300,
|
||||||
|
sandbox: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
@ -48,11 +56,12 @@ function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||||
defaultTest: options.includeTraceLinks
|
defaultTest: options.includeTraceLinks
|
||||||
? {
|
? {
|
||||||
metadata: {
|
metadata: {
|
||||||
traceFile: '.evaluclaude/traces/{{evalId}}.json',
|
evaluclaude: true,
|
||||||
|
tracesDir: './traces',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
outputPath: '.evaluclaude/results/promptfoo-results.json',
|
outputPath: './results/promptfoo-results.json',
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -147,91 +156,50 @@ function buildAssertion(assertion: any): PromptfooAssertion {
|
||||||
|
|
||||||
export async function generateTestProvider(outputPath: string): Promise<void> {
|
export async function generateTestProvider(outputPath: string): Promise<void> {
|
||||||
const providerCode = `#!/usr/bin/env python3
|
const providerCode = `#!/usr/bin/env python3
|
||||||
"""Promptfoo provider that executes tests and returns structured results."""
|
"""
|
||||||
|
Promptfoo provider that executes tests and returns structured results.
|
||||||
|
|
||||||
|
This provider integrates with evaluclaude-harness test runners to execute
|
||||||
|
functional tests in a sandboxed environment and return results compatible
|
||||||
|
with Promptfoo's assertion system.
|
||||||
|
"""
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
|
||||||
|
def call_api(prompt: str, options: dict, context: dict) -> dict:
|
||||||
"""Runs tests and returns structured results."""
|
"""Runs tests and returns structured results."""
|
||||||
|
|
||||||
test_dir = options.get('config', {}).get('test_dir', './tests')
|
config = options.get('config', {})
|
||||||
framework = options.get('config', {}).get('framework', 'pytest')
|
test_dir = config.get('test_dir', './tests/generated')
|
||||||
timeout = options.get('config', {}).get('timeout', 300)
|
framework = config.get('framework', 'pytest')
|
||||||
|
timeout = config.get('timeout', 300)
|
||||||
|
sandbox = config.get('sandbox', True)
|
||||||
|
|
||||||
scenario_id = prompt.strip()
|
scenario_id = prompt.strip()
|
||||||
|
eval_id = f"eval-{uuid.uuid4().hex[:8]}"
|
||||||
|
|
||||||
|
# Ensure traces directory exists
|
||||||
|
traces_dir = Path('.evaluclaude/traces')
|
||||||
|
traces_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if framework == 'pytest':
|
if framework == 'pytest':
|
||||||
result = subprocess.run(
|
output = run_pytest(test_dir, scenario_id, timeout, eval_id)
|
||||||
[
|
|
||||||
'python', '-m', 'pytest',
|
|
||||||
'--json-report',
|
|
||||||
'--json-report-file=/tmp/pytest_results.json',
|
|
||||||
'-k', scenario_id,
|
|
||||||
test_dir
|
|
||||||
],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=timeout
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open('/tmp/pytest_results.json') as f:
|
|
||||||
report = json.load(f)
|
|
||||||
|
|
||||||
output = {
|
|
||||||
'passed': report.get('summary', {}).get('passed', 0),
|
|
||||||
'failed': report.get('summary', {}).get('failed', 0),
|
|
||||||
'skipped': report.get('summary', {}).get('skipped', 0),
|
|
||||||
'tests': report.get('tests', []),
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
'exit_code': result.returncode,
|
|
||||||
}
|
|
||||||
except FileNotFoundError:
|
|
||||||
output = {
|
|
||||||
'passed': 0,
|
|
||||||
'failed': 1,
|
|
||||||
'error': 'Failed to generate pytest report',
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
}
|
|
||||||
|
|
||||||
elif framework in ('vitest', 'jest'):
|
elif framework in ('vitest', 'jest'):
|
||||||
cmd = ['npx', framework, 'run', '--reporter=json']
|
output = run_js_tests(test_dir, scenario_id, timeout, framework, eval_id)
|
||||||
if scenario_id:
|
|
||||||
cmd.extend(['--testNamePattern', scenario_id])
|
|
||||||
cmd.append(test_dir)
|
|
||||||
|
|
||||||
result = subprocess.run(
|
|
||||||
cmd,
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=timeout
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
report = json.loads(result.stdout)
|
|
||||||
output = {
|
|
||||||
'passed': report.get('numPassedTests', 0),
|
|
||||||
'failed': report.get('numFailedTests', 0),
|
|
||||||
'skipped': report.get('numSkippedTests', 0),
|
|
||||||
'tests': report.get('testResults', []),
|
|
||||||
'exit_code': result.returncode,
|
|
||||||
}
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
output = {
|
|
||||||
'passed': 0,
|
|
||||||
'failed': 1,
|
|
||||||
'error': 'Failed to parse test output',
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
output = {'error': f'Unknown framework: {framework}'}
|
output = {'error': f'Unknown framework: {framework}', 'passed': 0, 'failed': 1}
|
||||||
|
|
||||||
|
# Add trace reference
|
||||||
|
output['eval_id'] = eval_id
|
||||||
|
output['trace_file'] = str(traces_dir / f"{eval_id}.json")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'output': json.dumps(output),
|
'output': json.dumps(output),
|
||||||
|
|
@ -240,32 +208,187 @@ def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return {
|
return {
|
||||||
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
|
'output': json.dumps({
|
||||||
|
'error': 'Test execution timed out',
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'eval_id': eval_id,
|
||||||
|
}),
|
||||||
'error': None,
|
'error': None,
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {
|
return {
|
||||||
'output': None,
|
'output': json.dumps({
|
||||||
|
'error': str(e),
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'eval_id': eval_id,
|
||||||
|
}),
|
||||||
'error': str(e),
|
'error': str(e),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_pytest(test_dir: str, scenario_id: str, timeout: int, eval_id: str) -> dict:
|
||||||
|
"""Run pytest and return structured results."""
|
||||||
|
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
|
||||||
|
report_file = f.name
|
||||||
|
|
||||||
|
cmd = [
|
||||||
|
sys.executable, '-m', 'pytest',
|
||||||
|
'--json-report',
|
||||||
|
f'--json-report-file={report_file}',
|
||||||
|
'-v',
|
||||||
|
'--tb=short',
|
||||||
|
]
|
||||||
|
|
||||||
|
if scenario_id:
|
||||||
|
cmd.extend(['-k', scenario_id])
|
||||||
|
|
||||||
|
cmd.append(test_dir)
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=timeout,
|
||||||
|
cwd=os.getcwd(),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(report_file) as f:
|
||||||
|
report = json.load(f)
|
||||||
|
|
||||||
|
summary = report.get('summary', {})
|
||||||
|
tests = report.get('tests', [])
|
||||||
|
|
||||||
|
output = {
|
||||||
|
'passed': summary.get('passed', 0),
|
||||||
|
'failed': summary.get('failed', 0),
|
||||||
|
'skipped': summary.get('skipped', 0),
|
||||||
|
'total': summary.get('total', 0),
|
||||||
|
'duration': report.get('duration', 0) * 1000, # Convert to ms
|
||||||
|
'tests': [
|
||||||
|
{
|
||||||
|
'id': extract_scenario_id(t.get('nodeid', '')),
|
||||||
|
'name': t.get('nodeid', ''),
|
||||||
|
'status': t.get('outcome', 'unknown'),
|
||||||
|
'duration': (t.get('call', {}).get('duration', 0) or 0) * 1000,
|
||||||
|
'error': t.get('call', {}).get('crash', {}).get('message') if t.get('call', {}).get('crash') else None,
|
||||||
|
}
|
||||||
|
for t in tests
|
||||||
|
],
|
||||||
|
'exit_code': result.returncode,
|
||||||
|
}
|
||||||
|
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||||
|
output = {
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'error': f'Failed to parse pytest report: {e}',
|
||||||
|
'stdout': result.stdout[-2000:] if result.stdout else '',
|
||||||
|
'stderr': result.stderr[-2000:] if result.stderr else '',
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(report_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def run_js_tests(test_dir: str, scenario_id: str, timeout: int, framework: str, eval_id: str) -> dict:
|
||||||
|
"""Run vitest/jest and return structured results."""
|
||||||
|
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
|
||||||
|
report_file = f.name
|
||||||
|
|
||||||
|
cmd = ['npx', framework, 'run', '--reporter=json', f'--outputFile={report_file}']
|
||||||
|
|
||||||
|
if scenario_id:
|
||||||
|
cmd.extend(['--testNamePattern', scenario_id])
|
||||||
|
|
||||||
|
cmd.append(test_dir)
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=timeout,
|
||||||
|
cwd=os.getcwd(),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(report_file) as f:
|
||||||
|
report = json.load(f)
|
||||||
|
|
||||||
|
output = {
|
||||||
|
'passed': report.get('numPassedTests', 0),
|
||||||
|
'failed': report.get('numFailedTests', 0),
|
||||||
|
'skipped': report.get('numSkippedTests', 0),
|
||||||
|
'total': report.get('numTotalTests', 0),
|
||||||
|
'tests': [],
|
||||||
|
'exit_code': result.returncode,
|
||||||
|
}
|
||||||
|
|
||||||
|
for test_file in report.get('testResults', []):
|
||||||
|
for assertion in test_file.get('assertionResults', []):
|
||||||
|
output['tests'].append({
|
||||||
|
'id': extract_scenario_id(assertion.get('fullName', '')),
|
||||||
|
'name': assertion.get('fullName', ''),
|
||||||
|
'status': assertion.get('status', 'unknown'),
|
||||||
|
'duration': assertion.get('duration', 0),
|
||||||
|
'error': assertion.get('failureMessages', [None])[0] if assertion.get('failureMessages') else None,
|
||||||
|
})
|
||||||
|
|
||||||
|
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||||
|
output = {
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'error': f'Failed to parse {framework} report: {e}',
|
||||||
|
'stdout': result.stdout[-2000:] if result.stdout else '',
|
||||||
|
'stderr': result.stderr[-2000:] if result.stderr else '',
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(report_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def extract_scenario_id(nodeid: str) -> str:
|
||||||
|
"""Extract scenario ID from test name."""
|
||||||
|
import re
|
||||||
|
match = re.search(r'test[_\\s]([a-zA-Z0-9_-]+)', nodeid, re.IGNORECASE)
|
||||||
|
return match.group(1) if match else nodeid.replace(' ', '_')
|
||||||
|
|
||||||
|
|
||||||
|
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||||
|
"""Alias for call_api for backwards compatibility."""
|
||||||
|
return call_api(prompt, options, context)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# For testing the provider directly
|
|
||||||
import argparse
|
import argparse
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser(description='Run tests for Promptfoo')
|
||||||
parser.add_argument('--scenario', default='')
|
parser.add_argument('--scenario', default='', help='Scenario ID to filter')
|
||||||
parser.add_argument('--test-dir', default='./tests')
|
parser.add_argument('--test-dir', default='./tests/generated', help='Test directory')
|
||||||
parser.add_argument('--framework', default='pytest')
|
parser.add_argument('--framework', default='pytest', help='Test framework')
|
||||||
|
parser.add_argument('--timeout', type=int, default=300, help='Timeout in seconds')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
result = get_provider_response(
|
result = call_api(
|
||||||
args.scenario,
|
args.scenario,
|
||||||
{'config': {'test_dir': args.test_dir, 'framework': args.framework}},
|
{'config': {
|
||||||
|
'test_dir': args.test_dir,
|
||||||
|
'framework': args.framework,
|
||||||
|
'timeout': args.timeout,
|
||||||
|
}},
|
||||||
{}
|
{}
|
||||||
)
|
)
|
||||||
print(json.dumps(result, indent=2))
|
print(json.dumps(json.loads(result['output']), indent=2) if result['output'] else result['error'])
|
||||||
`;
|
`;
|
||||||
|
|
||||||
await mkdir(dirname(outputPath), { recursive: true });
|
await mkdir(dirname(outputPath), { recursive: true });
|
||||||
await writeFile(outputPath, providerCode);
|
await writeFile(outputPath, providerCode, { mode: 0o755 });
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,13 @@
|
||||||
export * from './types.js';
|
export * from './types.js';
|
||||||
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js';
|
export { generatePromptfooConfig, generateTestProvider, type ConfigOptions } from './config-generator.js';
|
||||||
|
export {
|
||||||
|
runTestsForPromptfoo,
|
||||||
|
savePromptfooResults,
|
||||||
|
type RunTestsForPromptfooOptions,
|
||||||
|
type PromptfooProviderResult,
|
||||||
|
} from './runner-bridge.js';
|
||||||
|
export {
|
||||||
|
exportToPromptfooFormat,
|
||||||
|
generateViewOnlyConfig,
|
||||||
|
type ExportOptions,
|
||||||
|
} from './results-exporter.js';
|
||||||
|
|
|
||||||
127
src/promptfoo/results-exporter.ts
Normal file
127
src/promptfoo/results-exporter.ts
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
/**
|
||||||
|
* Export test execution results to Promptfoo format for viewing in the UI.
|
||||||
|
*
|
||||||
|
* Instead of using Promptfoo to run tests (which requires a provider that
|
||||||
|
* responds quickly), we run tests ourselves and export results to Promptfoo's
|
||||||
|
* result format. This allows us to use Promptfoo's excellent visualization UI.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
|
import { join } from 'path';
|
||||||
|
import type { ExecutionResult } from '../runners/types.js';
|
||||||
|
import type { EvalSpec } from '../analyzer/types.js';
|
||||||
|
import type { PromptfooResult, PromptfooTestResult } from './types.js';
|
||||||
|
|
||||||
|
export interface ExportOptions {
|
||||||
|
outputDir: string;
|
||||||
|
evalId?: string;
|
||||||
|
includeSpec?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Export ExecutionResult to Promptfoo result format.
|
||||||
|
*/
|
||||||
|
export async function exportToPromptfooFormat(
|
||||||
|
result: ExecutionResult,
|
||||||
|
spec: EvalSpec | undefined,
|
||||||
|
options: ExportOptions
|
||||||
|
): Promise<string> {
|
||||||
|
const { outputDir, evalId = `eval-${Date.now()}` } = options;
|
||||||
|
|
||||||
|
const promptfooResult = buildPromptfooResult(result, spec, evalId);
|
||||||
|
|
||||||
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
const outputPath = join(outputDir, `${evalId}.json`);
|
||||||
|
await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
// Also write the latest.json symlink equivalent
|
||||||
|
const latestPath = join(outputDir, 'latest.json');
|
||||||
|
await writeFile(latestPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
return outputPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildPromptfooResult(
|
||||||
|
result: ExecutionResult,
|
||||||
|
spec: EvalSpec | undefined,
|
||||||
|
evalId: string
|
||||||
|
): PromptfooResult {
|
||||||
|
const testResults: PromptfooTestResult[] = result.tests.map(test => {
|
||||||
|
// Try to find matching scenario from spec
|
||||||
|
const scenario = spec?.scenarios.find(s =>
|
||||||
|
s.id === test.id || test.name.includes(s.id)
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
prompt: {
|
||||||
|
raw: scenario?.id || test.id,
|
||||||
|
label: scenario?.name || test.name,
|
||||||
|
},
|
||||||
|
vars: {
|
||||||
|
scenario_id: scenario?.id || test.id,
|
||||||
|
target_module: scenario?.target.module || '',
|
||||||
|
target_function: scenario?.target.function || '',
|
||||||
|
description: scenario?.description || test.name,
|
||||||
|
},
|
||||||
|
response: {
|
||||||
|
output: test.status === 'passed'
|
||||||
|
? 'Test passed successfully'
|
||||||
|
: test.error?.message || 'Test failed',
|
||||||
|
},
|
||||||
|
gradingResult: {
|
||||||
|
pass: test.status === 'passed',
|
||||||
|
score: test.status === 'passed' ? 1 : 0,
|
||||||
|
reason: test.status === 'passed'
|
||||||
|
? 'All assertions passed'
|
||||||
|
: test.error?.message || 'Test failed',
|
||||||
|
componentResults: test.assertions.details.map(a => ({
|
||||||
|
pass: a.passed,
|
||||||
|
score: a.passed ? 1 : 0,
|
||||||
|
reason: a.description,
|
||||||
|
assertion: {
|
||||||
|
type: 'custom',
|
||||||
|
value: a.description,
|
||||||
|
},
|
||||||
|
})),
|
||||||
|
},
|
||||||
|
success: test.status === 'passed',
|
||||||
|
error: test.error?.message,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
version: 1,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
results: testResults,
|
||||||
|
stats: {
|
||||||
|
successes: result.summary.passed,
|
||||||
|
failures: result.summary.failed,
|
||||||
|
tokenUsage: {
|
||||||
|
total: 0,
|
||||||
|
prompt: 0,
|
||||||
|
completion: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a minimal Promptfoo config that just views results (no provider).
|
||||||
|
*/
|
||||||
|
export function generateViewOnlyConfig(spec: EvalSpec): string {
|
||||||
|
return `# Evaluclaude Results Config
|
||||||
|
# This config is for viewing results only - tests are run via evaluclaude run
|
||||||
|
|
||||||
|
description: "Test results for ${spec.repo.name}"
|
||||||
|
|
||||||
|
# No providers needed - we pre-run tests and import results
|
||||||
|
providers: []
|
||||||
|
|
||||||
|
prompts: []
|
||||||
|
|
||||||
|
tests: []
|
||||||
|
|
||||||
|
# Results are stored here by evaluclaude run --export-promptfoo
|
||||||
|
outputPath: .evaluclaude/results/latest.json
|
||||||
|
`;
|
||||||
|
}
|
||||||
194
src/promptfoo/runner-bridge.ts
Normal file
194
src/promptfoo/runner-bridge.ts
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
/**
|
||||||
|
* Bridge between our test runners and Promptfoo's provider interface.
|
||||||
|
*
|
||||||
|
* This module provides a unified way to run tests that works both:
|
||||||
|
* 1. Standalone via our `run` command
|
||||||
|
* 2. As a Promptfoo provider via the generated test-runner.py
|
||||||
|
*
|
||||||
|
* Results are stored in a format compatible with Promptfoo's expectations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
|
import { join, dirname } from 'path';
|
||||||
|
import { runTests, type ExecutionResult, type ExecutionOptions, DEFAULT_SANDBOX_CONFIG } from '../runners/index.js';
|
||||||
|
import { createTracer, saveTrace, type EvalTrace } from '../observability/index.js';
|
||||||
|
|
||||||
|
export interface PromptfooProviderResult {
|
||||||
|
output: string;
|
||||||
|
error: string | null;
|
||||||
|
tokenUsage?: {
|
||||||
|
total: number;
|
||||||
|
prompt: number;
|
||||||
|
completion: number;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface RunTestsForPromptfooOptions {
|
||||||
|
scenarioId: string;
|
||||||
|
testDir: string;
|
||||||
|
framework: 'pytest' | 'vitest' | 'jest';
|
||||||
|
timeout?: number;
|
||||||
|
sandbox?: boolean;
|
||||||
|
evalId?: string;
|
||||||
|
recordTrace?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run tests for a specific scenario and format results for Promptfoo.
|
||||||
|
*/
|
||||||
|
export async function runTestsForPromptfoo(
|
||||||
|
options: RunTestsForPromptfooOptions
|
||||||
|
): Promise<PromptfooProviderResult> {
|
||||||
|
const {
|
||||||
|
scenarioId,
|
||||||
|
testDir,
|
||||||
|
framework,
|
||||||
|
timeout = 300000,
|
||||||
|
sandbox = true,
|
||||||
|
evalId = `eval-${Date.now()}`,
|
||||||
|
recordTrace = true,
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
const tracer = recordTrace ? createTracer(evalId) : null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const execOptions: ExecutionOptions = {
|
||||||
|
framework,
|
||||||
|
sandbox,
|
||||||
|
timeout,
|
||||||
|
parallel: false,
|
||||||
|
filter: scenarioId ? [scenarioId] : undefined,
|
||||||
|
cwd: process.cwd(),
|
||||||
|
};
|
||||||
|
|
||||||
|
tracer?.recordIntrospection({
|
||||||
|
filesAnalyzed: [testDir],
|
||||||
|
duration: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runTests(
|
||||||
|
testDir,
|
||||||
|
execOptions,
|
||||||
|
sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
|
||||||
|
);
|
||||||
|
|
||||||
|
// Record execution results in trace
|
||||||
|
if (tracer) {
|
||||||
|
tracer.recordExecution({
|
||||||
|
testsPassed: result.summary.passed,
|
||||||
|
testsFailed: result.summary.failed,
|
||||||
|
testsSkipped: result.summary.skipped,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const test of result.tests) {
|
||||||
|
if (test.status === 'failed' || test.status === 'error') {
|
||||||
|
tracer.recordTestFailure({
|
||||||
|
scenarioId: test.id,
|
||||||
|
testName: test.name,
|
||||||
|
error: test.error?.message || 'Unknown error',
|
||||||
|
stack: test.error?.stack,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build Promptfoo-compatible output
|
||||||
|
const promptfooOutput = buildPromptfooOutput(result, scenarioId);
|
||||||
|
|
||||||
|
// Save trace if enabled
|
||||||
|
if (tracer) {
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
await saveTrace(trace);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
output: JSON.stringify(promptfooOutput),
|
||||||
|
error: null,
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
if (tracer) {
|
||||||
|
tracer.recordError(error instanceof Error ? error : new Error(String(error)));
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
await saveTrace(trace);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
output: JSON.stringify({
|
||||||
|
passed: 0,
|
||||||
|
failed: 1,
|
||||||
|
error: error instanceof Error ? error.message : String(error),
|
||||||
|
}),
|
||||||
|
error: error instanceof Error ? error.message : String(error),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build Promptfoo-compatible output from ExecutionResult.
|
||||||
|
*/
|
||||||
|
function buildPromptfooOutput(
|
||||||
|
result: ExecutionResult,
|
||||||
|
scenarioId?: string
|
||||||
|
): Record<string, unknown> {
|
||||||
|
const matchingTests = scenarioId
|
||||||
|
? result.tests.filter(t => t.id === scenarioId || t.name.includes(scenarioId))
|
||||||
|
: result.tests;
|
||||||
|
|
||||||
|
return {
|
||||||
|
passed: matchingTests.filter(t => t.status === 'passed').length,
|
||||||
|
failed: matchingTests.filter(t => t.status === 'failed' || t.status === 'error').length,
|
||||||
|
skipped: matchingTests.filter(t => t.status === 'skipped').length,
|
||||||
|
total: matchingTests.length,
|
||||||
|
tests: matchingTests.map(t => ({
|
||||||
|
id: t.id,
|
||||||
|
name: t.name,
|
||||||
|
status: t.status,
|
||||||
|
duration: t.duration,
|
||||||
|
error: t.error?.message,
|
||||||
|
})),
|
||||||
|
summary: {
|
||||||
|
...result.summary,
|
||||||
|
matchedScenario: scenarioId,
|
||||||
|
},
|
||||||
|
errors: result.errors,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a Promptfoo-compatible results file from our execution results.
|
||||||
|
*/
|
||||||
|
export async function savePromptfooResults(
|
||||||
|
result: ExecutionResult,
|
||||||
|
evalId: string,
|
||||||
|
outputDir: string = '.evaluclaude/results'
|
||||||
|
): Promise<string> {
|
||||||
|
const promptfooResult = {
|
||||||
|
version: 1,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
evalId,
|
||||||
|
results: result.tests.map(t => ({
|
||||||
|
prompt: { raw: t.id, label: t.name },
|
||||||
|
vars: { scenario_id: t.id },
|
||||||
|
response: {
|
||||||
|
output: t.status === 'passed' ? 'PASS' : t.error?.message || 'FAIL',
|
||||||
|
},
|
||||||
|
gradingResult: {
|
||||||
|
pass: t.status === 'passed',
|
||||||
|
score: t.status === 'passed' ? 1 : 0,
|
||||||
|
reason: t.error?.message || (t.status === 'passed' ? 'Test passed' : 'Test failed'),
|
||||||
|
},
|
||||||
|
success: t.status === 'passed',
|
||||||
|
error: t.error?.message,
|
||||||
|
})),
|
||||||
|
stats: {
|
||||||
|
successes: result.summary.passed,
|
||||||
|
failures: result.summary.failed,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
const outputPath = join(outputDir, `promptfoo-${evalId}.json`);
|
||||||
|
await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
return outputPath;
|
||||||
|
}
|
||||||
|
|
@ -73,8 +73,8 @@ export function formatResults(result: ExecutionResult): string {
|
||||||
lines.push(` Total: ${result.summary.total}`);
|
lines.push(` Total: ${result.summary.total}`);
|
||||||
lines.push(` ✅ Passed: ${result.summary.passed}`);
|
lines.push(` ✅ Passed: ${result.summary.passed}`);
|
||||||
lines.push(` ❌ Failed: ${result.summary.failed}`);
|
lines.push(` ❌ Failed: ${result.summary.failed}`);
|
||||||
lines.push(` ⏭️ Skipped: ${result.summary.skipped}`);
|
lines.push(` ⏭️ Skipped: ${result.summary.skipped ?? 0}`);
|
||||||
lines.push(` ⏱️ Duration: ${result.summary.duration}ms`);
|
lines.push(` ⏱️ Duration: ${result.summary.duration || 0}ms`);
|
||||||
|
|
||||||
if (result.errors.length > 0) {
|
if (result.errors.length > 0) {
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue