mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-17 17:03:26 +00:00
improvements and promptfoo
This commit is contained in:
parent
6698c12e5b
commit
ff5300f4e0
13 changed files with 1082 additions and 117 deletions
257
src/cli/commands/pipeline.ts
Normal file
257
src/cli/commands/pipeline.ts
Normal file
|
|
@ -0,0 +1,257 @@
|
|||
import { Command } from 'commander';
|
||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { join, resolve } from 'path';
|
||||
import { analyze } from '../../introspector/index.js';
|
||||
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
|
||||
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
|
||||
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
|
||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
/** Default directory (relative to the working directory) for pipeline artifacts. */
const EVALUCLAUDE_DIR = '.evaluclaude';

/**
 * Options accepted by the `pipeline` command, as delivered by commander.
 *
 * Values that come from `<value>`-style flags are raw CLI strings; they are
 * parsed or validated where they are consumed, not here.
 */
interface PipelineOptions {
  /** Output directory for all artifacts (`-o, --output <dir>`). */
  output?: string;
  /** Enable interactive mode with clarifying questions (`-i, --interactive`). */
  interactive?: boolean;
  /** Comma-separated list of modules/functions to focus on (`--focus <modules>`). */
  focus?: string;
  /** Maximum number of test scenarios, as the raw string from `--max-scenarios <n>`. */
  maxScenarios: string;
  /** Directory for generated tests (`--test-dir <dir>`). */
  testDir: string;
  /** Test framework name from `-f, --framework <framework>` (pytest, vitest, jest). */
  framework?: string;
  /** Skip analysis and use an existing spec (`--skip-analyze`). */
  skipAnalyze?: boolean;
  /** Skip rendering and use existing tests (`--skip-render`). */
  skipRender?: boolean;
  /** Skip test execution (`--skip-run`). */
  skipRun?: boolean;
  /** Generate Promptfoo configuration for UI viewing (`--promptfoo`). */
  promptfoo?: boolean;
  /** Suppress progress messages (`--quiet`). */
  quiet?: boolean;
}
|
||||
|
||||
/** Test frameworks the pipeline accepts for `--framework`. */
const SUPPORTED_FRAMEWORKS = ['pytest', 'vitest', 'jest'] as const;
type SupportedFramework = (typeof SUPPORTED_FRAMEWORKS)[number];

/** Narrows an arbitrary CLI string to one of the supported framework names. */
function isSupportedFramework(value: string): value is SupportedFramework {
  return (SUPPORTED_FRAMEWORKS as readonly string[]).includes(value);
}

/** Reduces a caught value to what gets printed: the message for `Error`s, the raw value otherwise. */
function describeError(error: unknown): unknown {
  return error instanceof Error ? error.message : error;
}

/**
 * `pipeline` command: introspect → analyze → render → run, with an optional
 * Promptfoo configuration step.
 *
 * Each stage can be skipped with its `--skip-*` flag. A failure in analysis,
 * rendering, or test execution prints an error and exits with code 1; a
 * failure while generating the Promptfoo configuration is only reported.
 * The banner and the closing summary are written with `console.log` directly,
 * so they are printed even when `--quiet` is set.
 */
export const pipelineCommand = new Command('pipeline')
  .description('Run the full eval generation pipeline: introspect → analyze → render → run')
  .argument('[path]', 'Path to the repository to analyze', '.')
  .option('-o, --output <dir>', 'Output directory for all artifacts', EVALUCLAUDE_DIR)
  .option('-i, --interactive', 'Enable interactive mode with clarifying questions')
  .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
  .option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
  .option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('--skip-analyze', 'Skip analysis, use existing spec')
  .option('--skip-render', 'Skip rendering, use existing tests')
  .option('--skip-run', 'Skip test execution')
  .option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
  .option('--quiet', 'Suppress progress messages')
  .action(async (repoPath: string, options: PipelineOptions) => {
    const absolutePath = resolve(repoPath);
    const log: (...args: unknown[]) => void = options.quiet ? () => {} : console.log;
    const outputDir = options.output || EVALUCLAUDE_DIR;

    console.log('\n🚀 Evaluclaude Pipeline');
    console.log('═'.repeat(50));
    console.log(` Repository: ${absolutePath}`);
    console.log(` Output: ${outputDir}`);
    console.log('═'.repeat(50) + '\n');

    // Reject an unknown framework before any directory is created or any
    // expensive work starts, instead of casting the raw CLI string.
    let requestedFramework: SupportedFramework | undefined;
    if (options.framework) {
      if (!isSupportedFramework(options.framework)) {
        console.error(
          `\n❌ Unsupported framework "${options.framework}". Expected one of: ${SUPPORTED_FRAMEWORKS.join(', ')}`
        );
        process.exit(1);
      }
      requestedFramework = options.framework;
    }

    // Ensure output directories exist
    mkdirSync(outputDir, { recursive: true });
    mkdirSync(options.testDir, { recursive: true });

    const specPath = join(outputDir, 'spec.json');
    const tracesDir = join(outputDir, 'traces');
    const resultsDir = join(outputDir, 'results');

    mkdirSync(tracesDir, { recursive: true });
    mkdirSync(resultsDir, { recursive: true });

    let spec: EvalSpec;

    // Steps 1-2: introspection + analysis (or reuse of an existing spec)
    if (options.skipAnalyze && existsSync(specPath)) {
      log('📋 Using existing EvalSpec...');

      try {
        const parsed: unknown = JSON.parse(readFileSync(specPath, 'utf-8'));
        // Minimal shape check: every later stage reads `spec.scenarios`.
        if (
          typeof parsed !== 'object' ||
          parsed === null ||
          !Array.isArray((parsed as { scenarios?: unknown }).scenarios)
        ) {
          throw new Error('file does not contain a "scenarios" array');
        }
        spec = parsed as EvalSpec;
      } catch (error) {
        console.error(`\n❌ Could not load existing spec ${specPath}:`, describeError(error));
        process.exit(1);
      }

      log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
    } else {
      if (options.skipAnalyze) {
        // The flag was given but there is nothing to reuse; say so rather
        // than silently running a full analysis.
        log(`⚠️ --skip-analyze was set but no spec exists at ${specPath}; running analysis.\n`);
      }

      const maxScenarios = Number.parseInt(options.maxScenarios, 10);
      if (!Number.isInteger(maxScenarios) || maxScenarios < 1) {
        console.error(
          `\n❌ Invalid --max-scenarios value "${options.maxScenarios}". Expected a positive integer.`
        );
        process.exit(1);
      }

      // Drop blank entries (e.g. from "a,,b" or a trailing comma); pass
      // `undefined` when nothing usable remains.
      const focusList = options.focus
        ?.split(',')
        .map((s) => s.trim())
        .filter((s) => s.length > 0);
      const focus = focusList && focusList.length > 0 ? focusList : undefined;

      log('🔬 Step 1: Introspecting codebase...');

      try {
        const repoSummary = await analyze({
          root: absolutePath,
          onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
        });

        log(` Files: ${repoSummary.files.length}`);
        log(` Languages: ${repoSummary.languages.join(', ')}`);
        log('');

        log('🤖 Step 2: Generating EvalSpec with Claude...\n');

        let result;
        if (options.interactive) {
          // Loaded lazily so non-interactive runs never import inquirer.
          const { default: inquirer } = await import('inquirer');

          result = await generateEvalSpecInteractive(
            repoSummary,
            async (question: string) => {
              const { answer } = await inquirer.prompt([
                {
                  type: 'input',
                  name: 'answer',
                  message: `🤖 Claude asks: ${question}`,
                },
              ]);
              return answer;
            },
            { focus, maxScenarios }
          );
        } else {
          result = await generateEvalSpec(repoSummary, {
            interactive: false,
            focus,
            maxScenarios,
          });
        }

        spec = result.spec;

        // Save the spec
        writeFileSync(specPath, JSON.stringify(spec, null, 2));

        log(`\n✅ EvalSpec generated!`);
        log(` Scenarios: ${spec.scenarios.length}`);
        log(` Tokens: ${result.tokensUsed}`);
        log(` Saved: ${specPath}\n`);
      } catch (error) {
        console.error('\n❌ Analysis failed:', describeError(error));
        process.exit(1);
      }
    }

    // An explicit (validated) --framework wins; otherwise detect from the
    // spec. Evaluated inside each step so detection only happens for steps
    // that actually run and stays within that step's error handling.
    const pickFramework = () => requestedFramework ?? detectRenderFramework(spec);

    // Step 3: Render tests
    if (!options.skipRender) {
      log('📝 Step 3: Rendering test files...');

      try {
        const framework = pickFramework();

        const renderResult = await renderSpec(spec, {
          outputDir: options.testDir,
          framework,
          includeFixtures: true,
          generateMocks: true,
          dryRun: false,
        });

        log(` Framework: ${framework}`);
        log(` Files: ${renderResult.stats.fileCount}`);
        log(` Scenarios: ${renderResult.stats.scenarioCount}`);
        log(` Assertions: ${renderResult.stats.assertionCount}`);
        log(` Output: ${options.testDir}\n`);
      } catch (error) {
        console.error('\n❌ Rendering failed:', describeError(error));
        process.exit(1);
      }
    }

    // Step 4: Run tests
    if (!options.skipRun) {
      log('🧪 Step 4: Running tests...\n');

      try {
        const framework = pickFramework();
        const tracer = createTracer(spec.repo.name);

        // The trace is populated from the spec itself: scenario target
        // modules stand in for analyzed files, and no duration is measured.
        tracer.recordIntrospection({
          filesAnalyzed: spec.scenarios.map((s) => s.target.module),
          totalFunctions: spec.scenarios.length,
          duration: 0,
        });

        tracer.recordGeneration({
          scenariosGenerated: spec.scenarios.length,
          filesWritten: [options.testDir],
        });

        const result = await runTests(
          options.testDir,
          {
            framework,
            sandbox: true,
            timeout: 300000,
            parallel: false,
            cwd: process.cwd(),
          },
          DEFAULT_SANDBOX_CONFIG
        );

        tracer.recordExecution({
          testsPassed: result.summary.passed,
          testsFailed: result.summary.failed,
          testsSkipped: result.summary.skipped,
        });

        for (const test of result.tests) {
          if (test.status === 'failed' || test.status === 'error') {
            tracer.recordTestFailure({
              scenarioId: test.id,
              testName: test.name,
              error: test.error?.message || 'Unknown error',
              stack: test.error?.stack,
            });
          }
        }

        const trace = tracer.finalize();
        const tracePath = await saveTrace(trace);

        log(formatResults(result));
        log(`📊 Trace saved: ${tracePath}`);
        log(` View with: evaluclaude view ${trace.id}\n`);

        // Save results
        const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
        writeFileSync(resultsPath, JSON.stringify(result, null, 2));
      } catch (error) {
        console.error('\n❌ Test execution failed:', describeError(error));
        process.exit(1);
      }
    }

    // Step 5: Generate Promptfoo config
    if (options.promptfoo) {
      log('📦 Step 5: Generating Promptfoo configuration...');

      try {
        const configPath = join(outputDir, 'promptfooconfig.yaml');
        const providerPath = join(outputDir, 'providers', 'test-runner.py');
        const framework = pickFramework();

        await generatePromptfooConfig(spec, {
          testDir: options.testDir,
          outputPath: configPath,
          framework,
          includeTraceLinks: true,
          providerPath,
        });

        await generateTestProvider(providerPath);

        log(` Config: ${configPath}`);
        log(` Provider: ${providerPath}`);
        log(`\n Launch UI with: evaluclaude ui\n`);
      } catch (error) {
        // Reported but not fatal: unlike the earlier steps, this one does
        // not exit, so the closing summary is still printed.
        console.error('\n❌ Promptfoo config generation failed:', describeError(error));
      }
    }

    console.log('═'.repeat(50));
    console.log('✅ Pipeline complete!');
    console.log('═'.repeat(50));
    console.log(`\nNext steps:`);
    console.log(` View traces: evaluclaude view --last`);
    console.log(` List all traces: evaluclaude traces`);
    if (options.promptfoo) {
      console.log(` Launch UI: evaluclaude ui`);
      console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
    }
    console.log('');
  });
|
||||
|
|
@ -10,6 +10,7 @@ import {
|
|||
DEFAULT_SANDBOX_CONFIG
|
||||
} from '../../runners/index.js';
|
||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
export const runCommand = new Command('run')
|
||||
|
|
@ -25,6 +26,7 @@ export const runCommand = new Command('run')
|
|||
.option('-o, --output <file>', 'Output results to JSON file')
|
||||
.option('--trace', 'Record execution trace', true)
|
||||
.option('--no-trace', 'Disable execution tracing')
|
||||
.option('--export-promptfoo', 'Export results in Promptfoo format', false)
|
||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||
.action(async (testDir: string, options) => {
|
||||
try {
|
||||
|
|
@ -109,6 +111,16 @@ export const runCommand = new Command('run')
|
|||
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||
}
|
||||
|
||||
// Export to Promptfoo format for UI viewing
|
||||
if (options.exportPromptfoo) {
|
||||
const exportPath = await exportToPromptfooFormat(result, spec, {
|
||||
outputDir: '.evaluclaude/results',
|
||||
evalId: `eval-${Date.now()}`,
|
||||
});
|
||||
console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
|
||||
console.log(` View with: evaluclaude ui`);
|
||||
}
|
||||
|
||||
if (tracer) {
|
||||
const trace = tracer.finalize();
|
||||
const tracePath = await saveTrace(trace);
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import { Command } from 'commander';
|
||||
import { spawn, type ChildProcess } from 'child_process';
|
||||
import { spawn } from 'child_process';
|
||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import { join, dirname, resolve as resolvePath } from 'path';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||
|
||||
|
|
@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
|
|||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
|
||||
// If spec provided with --generate, create/update Promptfoo config
|
||||
if (options.spec && options.generate) {
|
||||
console.log('\n📄 Generating Promptfoo configuration...');
|
||||
|
||||
|
|
@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
|
|||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
providerPath: providerPath,
|
||||
});
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
|
|
@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
|
|||
console.log(` Provider: ${providerPath}`);
|
||||
}
|
||||
|
||||
// Check for existing config, create default if missing
|
||||
if (!existsSync(configPath)) {
|
||||
console.log('\n⚠️ No Promptfoo config found.');
|
||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
||||
console.log(' Or create one manually:');
|
||||
console.log(` ${configPath}\n`);
|
||||
console.log(' Creating default configuration...\n');
|
||||
|
||||
await createDefaultConfig(configPath, providerPath);
|
||||
console.log(` Created default config at ${configPath}`);
|
||||
console.log(` Created: ${configPath}`);
|
||||
}
|
||||
|
||||
// Check for results to display
|
||||
const resultsDir = join(EVALUCLAUDE_DIR, 'results');
|
||||
const latestResults = join(resultsDir, 'latest.json');
|
||||
|
||||
if (!existsSync(latestResults)) {
|
||||
console.log('\n⚠️ No evaluation results found.');
|
||||
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
|
||||
console.log(' Or run the full pipeline:');
|
||||
console.log(' evaluclaude pipeline <path> --promptfoo\n');
|
||||
}
|
||||
|
||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||
console.log(` Config: ${configPath}\n`);
|
||||
console.log(` Results: ${latestResults}\n`);
|
||||
|
||||
await launchPromptfooUI(port, configPath, options.open);
|
||||
// Use promptfoo view with the results file
|
||||
await launchPromptfooView(port, latestResults, options.open);
|
||||
} catch (error) {
|
||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
|
|
@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
|
|||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||
.option('--view', 'Launch UI after evaluation', false)
|
||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||
.option('--no-cache', 'Disable Promptfoo caching', false)
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
|
||||
// Generate config from spec if provided
|
||||
if (options.spec) {
|
||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||
|
||||
if (!existsSync(options.spec)) {
|
||||
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
|
|
@ -84,30 +106,57 @@ export const evalCommand = new Command('eval')
|
|||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
providerPath: providerPath,
|
||||
});
|
||||
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
await generateTestProvider(providerPath);
|
||||
|
||||
console.log(` Config: ${configPath}`);
|
||||
console.log(` Provider: ${providerPath}`);
|
||||
console.log(` Scenarios: ${spec.scenarios.length}`);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.error(`Error: Config not found: ${configPath}`);
|
||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
||||
console.error(`\nError: Config not found: ${configPath}`);
|
||||
console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
||||
// Ensure output directory exists
|
||||
mkdirSync(options.output, { recursive: true });
|
||||
|
||||
console.log('\n🧪 Running Promptfoo evaluations...');
|
||||
console.log(` Config: ${configPath}`);
|
||||
console.log(` Output: ${options.output}\n`);
|
||||
|
||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||
mkdirSync(dirname(outputFile), { recursive: true });
|
||||
|
||||
await runPromptfooEval(configPath, outputFile);
|
||||
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache);
|
||||
|
||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
||||
if (exitCode === 0) {
|
||||
console.log(`\n✅ Evaluation complete!`);
|
||||
console.log(`📁 Results: ${outputFile}`);
|
||||
} else {
|
||||
console.log(`\n⚠️ Evaluation finished with exit code ${exitCode}`);
|
||||
console.log(`📁 Results: ${outputFile}`);
|
||||
}
|
||||
|
||||
// List traces generated during evaluation
|
||||
const tracesDir = join(EVALUCLAUDE_DIR, 'traces');
|
||||
if (existsSync(tracesDir)) {
|
||||
const { readdirSync } = await import('fs');
|
||||
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
|
||||
if (traces.length > 0) {
|
||||
console.log(`\n📊 Traces generated: ${traces.length}`);
|
||||
console.log(` View with: evaluclaude view --last`);
|
||||
}
|
||||
}
|
||||
|
||||
if (options.view) {
|
||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||
} else {
|
||||
console.log(`\n View results: evaluclaude ui`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||
|
|
@ -115,6 +164,64 @@ export const evalCommand = new Command('eval')
|
|||
}
|
||||
});
|
||||
|
||||
/**
 * Launch Promptfoo view to display pre-computed results.
 *
 * Spawns `npx promptfoo view --port <port> (-y | -n) <dir>` with inherited
 * stdio, where `<dir>` is the absolute directory containing `resultsFile`.
 *
 * @param port - Port passed to `promptfoo view` via `--port`.
 * @param resultsFile - Path to a results file; only its directory is passed on.
 * @param openBrowser - When true passes `-y`, otherwise `-n`.
 * @returns A promise that resolves when the child closes with exit code 0 and
 *   rejects when the child cannot be spawned, exits with another code, or is
 *   terminated by a signal.
 */
async function launchPromptfooView(
  port: number,
  resultsFile: string,
  openBrowser: boolean
): Promise<void> {
  return new Promise((resolve, reject) => {
    // Use 'promptfoo view' which opens the web UI showing results from the output directory
    const resultsDir = dirname(resolvePath(resultsFile));
    const args = [
      'promptfoo',
      'view',
      '--port',
      String(port),
      openBrowser ? '-y' : '-n',
      // Pass the directory containing results
      resultsDir,
    ];

    console.log(` Running: npx ${args.join(' ')}\n`);

    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });

    // Handle Ctrl+C gracefully: forward the interrupt to the child, then exit.
    const onInterrupt = (): void => {
      child.kill('SIGINT');
      process.exit(0);
    };
    process.once('SIGINT', onInterrupt);

    // Remove the handler once the child is gone so repeated calls do not
    // accumulate SIGINT listeners on the process.
    const detachInterrupt = (): void => {
      process.removeListener('SIGINT', onInterrupt);
    };

    child.on('error', (error) => {
      detachInterrupt();
      // ENOENT here means the spawned launcher command itself was not found.
      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
        console.error('\n❌ Promptfoo not found.');
        console.error(' Install with: npm install -g promptfoo');
        console.error(' Or run: npx promptfoo --version\n');
      }
      // Always settle the promise here; a 'close' event is not guaranteed to
      // follow a spawn error, and a later reject on a settled promise is a no-op.
      reject(error);
    });

    child.on('close', (code, signal) => {
      detachInterrupt();
      if (code === 0) {
        resolve();
        return;
      }
      // `code` is null when the child was terminated by a signal.
      reject(
        new Error(
          code === null
            ? `Promptfoo was terminated by signal ${signal}`
            : `Promptfoo exited with code ${code}`
        )
      );
    });
  });
}
|
||||
|
||||
/**
|
||||
* Launch Promptfoo with a config file (for running evals).
|
||||
*/
|
||||
async function launchPromptfooUI(
|
||||
port: number,
|
||||
configPath: string,
|
||||
|
|
@ -129,7 +236,8 @@ async function launchPromptfooUI(
|
|||
args.push('-n');
|
||||
}
|
||||
|
||||
const configDir = dirname(configPath);
|
||||
// Pass the directory containing the config
|
||||
const configDir = dirname(resolvePath(configPath));
|
||||
args.push(configDir);
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
|
@ -141,7 +249,9 @@ async function launchPromptfooUI(
|
|||
|
||||
child.on('error', (error) => {
|
||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
||||
console.error('\n❌ Promptfoo not found.');
|
||||
console.error(' Install with: npm install -g promptfoo');
|
||||
console.error(' Or run: npx promptfoo --version\n');
|
||||
} else {
|
||||
reject(error);
|
||||
}
|
||||
|
|
@ -155,6 +265,7 @@ async function launchPromptfooUI(
|
|||
}
|
||||
});
|
||||
|
||||
// Handle Ctrl+C gracefully
|
||||
process.on('SIGINT', () => {
|
||||
child.kill('SIGINT');
|
||||
process.exit(0);
|
||||
|
|
@ -162,16 +273,23 @@ async function launchPromptfooUI(
|
|||
});
|
||||
}
|
||||
|
||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
||||
async function runPromptfooEval(
|
||||
configPath: string,
|
||||
outputFile: string,
|
||||
noCache: boolean
|
||||
): Promise<number> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = [
|
||||
'promptfoo',
|
||||
'eval',
|
||||
'-c', configPath,
|
||||
'-o', outputFile,
|
||||
'--no-cache',
|
||||
];
|
||||
|
||||
if (noCache) {
|
||||
args.push('--no-cache');
|
||||
}
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
|
|
@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
|||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', reject);
|
||||
child.on('error', (error) => {
|
||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||
console.error('\n❌ Promptfoo not found.');
|
||||
console.error(' Install with: npm install -g promptfoo\n');
|
||||
reject(error);
|
||||
} else {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
||||
}
|
||||
resolve(code ?? 1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
|||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||
# Generated by evaluclaude
|
||||
#
|
||||
# To populate this config from an EvalSpec:
|
||||
# evaluclaude eval --spec <evalspec.json>
|
||||
#
|
||||
# Or run the full pipeline:
|
||||
# evaluclaude analyze <path> -o spec.json
|
||||
# evaluclaude render spec.json -o tests/generated
|
||||
# evaluclaude eval --spec spec.json
|
||||
|
||||
description: "Evaluclaude functional test evaluations"
|
||||
|
||||
|
|
@ -204,12 +334,13 @@ providers:
|
|||
test_dir: ./tests/generated
|
||||
framework: pytest
|
||||
timeout: 300
|
||||
sandbox: true
|
||||
|
||||
prompts:
|
||||
- "{{scenario_id}}"
|
||||
|
||||
tests:
|
||||
- description: "Example test"
|
||||
- description: "Example test - replace with real scenarios"
|
||||
vars:
|
||||
scenario_id: "test_example"
|
||||
assert:
|
||||
|
|
@ -219,12 +350,19 @@ tests:
|
|||
result = json.loads(output)
|
||||
result.get('passed', 0) > 0
|
||||
|
||||
# Default test configuration
|
||||
defaultTest:
|
||||
metadata:
|
||||
evaluclaude: true
|
||||
tracesDir: .evaluclaude/traces
|
||||
|
||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||
`;
|
||||
|
||||
mkdirSync(dirname(configPath), { recursive: true });
|
||||
writeFileSync(configPath, defaultConfig);
|
||||
|
||||
// Also generate the provider
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
|
|
@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
|||
if (spec.repo.languages.includes('python')) {
|
||||
return 'pytest';
|
||||
}
|
||||
if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) {
|
||||
return 'vitest';
|
||||
}
|
||||
return 'vitest';
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue