diff --git a/rubrics/code-quality.yaml b/rubrics/code-quality.yaml new file mode 100644 index 0000000..5af8b08 --- /dev/null +++ b/rubrics/code-quality.yaml @@ -0,0 +1,32 @@ +name: code-quality +description: Evaluates generated code for quality and maintainability +passingThreshold: 0.7 + +criteria: + - name: readability + weight: 0.3 + description: Code is easy to read and understand + examples: + good: "Clear variable names, logical flow, proper indentation" + bad: "Single-letter variables, deeply nested logic, inconsistent style" + + - name: correctness + weight: 0.4 + description: Code correctly implements the intended behavior + examples: + good: "Handles edge cases, correct algorithm, proper error handling" + bad: "Missing edge cases, off-by-one errors, swallowed exceptions" + + - name: efficiency + weight: 0.2 + description: Code uses appropriate data structures and algorithms + examples: + good: "O(n) where O(n) is optimal, avoids unnecessary allocations" + bad: "O(nยฒ) when O(n) is possible, creates objects in tight loops" + + - name: maintainability + weight: 0.1 + description: Code is easy to modify and extend + examples: + good: "Single responsibility, low coupling, clear interfaces" + bad: "God functions, tight coupling, magic numbers" diff --git a/rubrics/documentation.yaml b/rubrics/documentation.yaml new file mode 100644 index 0000000..7089c96 --- /dev/null +++ b/rubrics/documentation.yaml @@ -0,0 +1,32 @@ +name: documentation +description: Evaluates quality of code documentation and docstrings +passingThreshold: 0.65 + +criteria: + - name: completeness + weight: 0.35 + description: Documentation covers all parameters, return values, and exceptions + examples: + good: "Fully documents args, returns, raises, and includes usage example" + bad: "Missing parameter descriptions or return type" + + - name: accuracy + weight: 0.35 + description: Documentation accurately describes the function's behavior + examples: + good: "Description matches implementation, types are correct" + bad: "Outdated docs that don't match current behavior" + + - name: examples + weight: 0.2 + description: Includes helpful usage examples + examples: + good: "Shows common use cases with expected outputs" + bad: "No examples or only trivial ones" + + - name: style + weight: 0.1 + description: Follows project/language documentation conventions + examples: + good: "Uses standard docstring format (Google, NumPy, or reStructuredText)" + bad: "Inconsistent or non-standard format" diff --git a/rubrics/error-messages.yaml b/rubrics/error-messages.yaml new file mode 100644 index 0000000..3e8dace --- /dev/null +++ b/rubrics/error-messages.yaml @@ -0,0 +1,25 @@ +name: error-messages +description: Evaluates quality of error messages +passingThreshold: 0.6 + +criteria: + - name: clarity + weight: 0.4 + description: Error message clearly explains what went wrong + examples: + good: "Invalid email format: 'not-an-email' is missing '@' symbol" + bad: "Error: validation failed" + + - name: actionability + weight: 0.4 + description: Error message suggests how to fix the problem + examples: + good: "File not found. Create the file or check the path spelling." + bad: "ENOENT" + + - name: context + weight: 0.2 + description: Error message includes relevant context (file, line, values) + examples: + good: "TypeError at line 42 in auth.py: expected str, got int (value=123)" + bad: "type error" diff --git a/src/cli/commands/pipeline.ts b/src/cli/commands/pipeline.ts new file mode 100644 index 0000000..d1e7fa4 --- /dev/null +++ b/src/cli/commands/pipeline.ts @@ -0,0 +1,257 @@ +import { Command } from 'commander'; +import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs'; +import { join, resolve } from 'path'; +import { analyze } from '../../introspector/index.js'; +import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js'; +import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js'; +import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js'; +import { createTracer, saveTrace } from '../../observability/index.js'; +import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; +import type { EvalSpec } from '../../analyzer/types.js'; + +const EVALUCLAUDE_DIR = '.evaluclaude'; + +interface PipelineOptions { + output?: string; + interactive?: boolean; + focus?: string; + maxScenarios: string; + testDir: string; + framework?: string; + skipAnalyze?: boolean; + skipRender?: boolean; + skipRun?: boolean; + promptfoo?: boolean; + quiet?: boolean; +} + +export const pipelineCommand = new Command('pipeline') + .description('Run the full eval generation pipeline: introspect โ†’ analyze โ†’ render โ†’ run') + .argument('[path]', 'Path to the repository to analyze', '.') + .option('-o, --output ', 'Output directory for all artifacts', '.evaluclaude') + .option('-i, --interactive', 'Enable interactive mode with clarifying questions') + .option('--focus ', 'Comma-separated list of modules/functions to focus on') + .option('--max-scenarios ', 'Maximum number of test scenarios to generate', '10') + .option('--test-dir ', 'Directory for generated tests', './tests/generated') + .option('-f, --framework ', 'Test framework (pytest, vitest, jest)') + .option('--skip-analyze', 'Skip analysis, use existing spec') + .option('--skip-render', 'Skip rendering, use existing tests') + .option('--skip-run', 'Skip test execution') + .option('--promptfoo', 'Generate Promptfoo configuration for UI viewing') + .option('--quiet', 'Suppress progress messages') + .action(async (repoPath: string, options: PipelineOptions) => { + const absolutePath = resolve(repoPath); + const log = options.quiet ? () => {} : console.log; + const outputDir = options.output || EVALUCLAUDE_DIR; + + console.log('\n๐Ÿš€ Evaluclaude Pipeline'); + console.log('โ•'.repeat(50)); + console.log(` Repository: ${absolutePath}`); + console.log(` Output: ${outputDir}`); + console.log('โ•'.repeat(50) + '\n'); + + // Ensure output directories exist + mkdirSync(outputDir, { recursive: true }); + mkdirSync(options.testDir, { recursive: true }); + + const specPath = join(outputDir, 'spec.json'); + const tracesDir = join(outputDir, 'traces'); + const resultsDir = join(outputDir, 'results'); + + mkdirSync(tracesDir, { recursive: true }); + mkdirSync(resultsDir, { recursive: true }); + + let spec: EvalSpec; + + // Step 1: Introspection + Analysis + if (options.skipAnalyze && existsSync(specPath)) { + log('๐Ÿ“‹ Using existing EvalSpec...'); + spec = JSON.parse(readFileSync(specPath, 'utf-8')); + log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`); + } else { + log('๐Ÿ”ฌ Step 1: Introspecting codebase...'); + + try { + const repoSummary = await analyze({ + root: absolutePath, + onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`), + }); + + log(` Files: ${repoSummary.files.length}`); + log(` Languages: ${repoSummary.languages.join(', ')}`); + log(''); + + log('๐Ÿค– Step 2: Generating EvalSpec with Claude...\n'); + + const focus = options.focus?.split(',').map(s => s.trim()); + const maxScenarios = parseInt(options.maxScenarios, 10); + + let result; + if (options.interactive) { + const { default: inquirer } = await import('inquirer'); + + result = await generateEvalSpecInteractive( + repoSummary, + async (question: string) => { + const { answer } = await inquirer.prompt([{ + type: 'input', + name: 'answer', + message: `๐Ÿค– Claude asks: ${question}`, + }]); + return answer; + }, + { focus, maxScenarios } + ); + } else { + result = await generateEvalSpec(repoSummary, { + interactive: false, + focus, + maxScenarios, + }); + } + + spec = result.spec; + + // Save the spec + writeFileSync(specPath, JSON.stringify(spec, null, 2)); + + log(`\nโœ… EvalSpec generated!`); + log(` Scenarios: ${spec.scenarios.length}`); + log(` Tokens: ${result.tokensUsed}`); + log(` Saved: ${specPath}\n`); + } catch (error) { + console.error('\nโŒ Analysis failed:', error instanceof Error ? error.message : error); + process.exit(1); + } + } + + // Step 2: Render tests + if (!options.skipRender) { + log('๐Ÿ“ Step 3: Rendering test files...'); + + try { + const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec); + + const renderResult = await renderSpec(spec, { + outputDir: options.testDir, + framework, + includeFixtures: true, + generateMocks: true, + dryRun: false, + }); + + log(` Framework: ${framework}`); + log(` Files: ${renderResult.stats.fileCount}`); + log(` Scenarios: ${renderResult.stats.scenarioCount}`); + log(` Assertions: ${renderResult.stats.assertionCount}`); + log(` Output: ${options.testDir}\n`); + } catch (error) { + console.error('\nโŒ Rendering failed:', error instanceof Error ? error.message : error); + process.exit(1); + } + } + + // Step 3: Run tests + if (!options.skipRun) { + log('๐Ÿงช Step 4: Running tests...\n'); + + try { + const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec); + const tracer = createTracer(spec.repo.name); + + tracer.recordIntrospection({ + filesAnalyzed: spec.scenarios.map(s => s.target.module), + totalFunctions: spec.scenarios.length, + duration: 0, + }); + + tracer.recordGeneration({ + scenariosGenerated: spec.scenarios.length, + filesWritten: [options.testDir], + }); + + const result = await runTests( + options.testDir, + { + framework, + sandbox: true, + timeout: 300000, + parallel: false, + cwd: process.cwd(), + }, + DEFAULT_SANDBOX_CONFIG + ); + + tracer.recordExecution({ + testsPassed: result.summary.passed, + testsFailed: result.summary.failed, + testsSkipped: result.summary.skipped, + }); + + for (const test of result.tests) { + if (test.status === 'failed' || test.status === 'error') { + tracer.recordTestFailure({ + scenarioId: test.id, + testName: test.name, + error: test.error?.message || 'Unknown error', + stack: test.error?.stack, + }); + } + } + + const trace = tracer.finalize(); + const tracePath = await saveTrace(trace); + + log(formatResults(result)); + log(`๐Ÿ“Š Trace saved: ${tracePath}`); + log(` View with: evaluclaude view ${trace.id}\n`); + + // Save results + const resultsPath = join(resultsDir, `run-${Date.now()}.json`); + writeFileSync(resultsPath, JSON.stringify(result, null, 2)); + + } catch (error) { + console.error('\nโŒ Test execution failed:', error instanceof Error ? error.message : error); + process.exit(1); + } + } + + // Step 4: Generate Promptfoo config + if (options.promptfoo) { + log('๐Ÿ“ฆ Step 5: Generating Promptfoo configuration...'); + + try { + const configPath = join(outputDir, 'promptfooconfig.yaml'); + const providerPath = join(outputDir, 'providers', 'test-runner.py'); + const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec); + + await generatePromptfooConfig(spec, { + testDir: options.testDir, + outputPath: configPath, + framework, + includeTraceLinks: true, + providerPath, + }); + + await generateTestProvider(providerPath); + + log(` Config: ${configPath}`); + log(` Provider: ${providerPath}`); + log(`\n Launch UI with: evaluclaude ui\n`); + } catch (error) { + console.error('\nโŒ Promptfoo config generation failed:', error instanceof Error ? error.message : error); + } + } + + console.log('โ•'.repeat(50)); + console.log('โœ… Pipeline complete!'); + console.log('โ•'.repeat(50)); + console.log(`\nNext steps:`); + console.log(` View traces: evaluclaude view --last`); + console.log(` List all traces: evaluclaude traces`); + if (options.promptfoo) { + console.log(` Launch UI: evaluclaude ui`); + console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`); + } + console.log(''); + }); diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 28f372f..a20c01e 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -10,6 +10,7 @@ import { DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js'; import { createTracer, saveTrace } from '../../observability/index.js'; +import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js'; import type { EvalSpec } from '../../analyzer/types.js'; export const runCommand = new Command('run') @@ -25,6 +26,7 @@ export const runCommand = new Command('run') .option('-o, --output ', 'Output results to JSON file') .option('--trace', 'Record execution trace', true) .option('--no-trace', 'Disable execution tracing') + .option('--export-promptfoo', 'Export results in Promptfoo format', false) .option('-w, --watch', 'Watch mode (rerun on changes)', false) .action(async (testDir: string, options) => { try { @@ -109,6 +111,16 @@ export const runCommand = new Command('run') console.log(`\n๐Ÿ“ Results saved to: ${options.output}`); } + // Export to Promptfoo format for UI viewing + if (options.exportPromptfoo) { + const exportPath = await exportToPromptfooFormat(result, spec, { + outputDir: '.evaluclaude/results', + evalId: `eval-${Date.now()}`, + }); + console.log(`\n๐Ÿ“ฆ Promptfoo results exported: ${exportPath}`); + console.log(` View with: evaluclaude ui`); + } + if (tracer) { const trace = tracer.finalize(); const tracePath = await saveTrace(trace); diff --git a/src/cli/commands/ui.ts b/src/cli/commands/ui.ts index 9be91eb..a775225 100644 --- a/src/cli/commands/ui.ts +++ b/src/cli/commands/ui.ts @@ -1,7 +1,7 @@ import { Command } from 'commander'; -import { spawn, type ChildProcess } from 'child_process'; +import { spawn } from 'child_process'; import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs'; -import { join, dirname } from 'path'; +import { join, dirname, resolve as resolvePath } from 'path'; import type { EvalSpec } from '../../analyzer/types.js'; import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; @@ -21,6 +21,7 @@ export const uiCommand = new Command('ui') const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE); const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); + // If spec provided with --generate, create/update Promptfoo config if (options.spec && options.generate) { console.log('\n๐Ÿ“„ Generating Promptfoo configuration...'); @@ -36,6 +37,7 @@ export const uiCommand = new Command('ui') outputPath: configPath, framework: detectFramework(spec), includeTraceLinks: true, + providerPath: providerPath, }); await generateTestProvider(providerPath); @@ -44,20 +46,31 @@ export const uiCommand = new Command('ui') console.log(` Provider: ${providerPath}`); } + // Check for existing config, create default if missing if (!existsSync(configPath)) { console.log('\nโš ๏ธ No Promptfoo config found.'); - console.log(' Run with --spec --generate to create one.\n'); - console.log(' Or create one manually:'); - console.log(` ${configPath}\n`); + console.log(' Creating default configuration...\n'); await createDefaultConfig(configPath, providerPath); - console.log(` Created default config at ${configPath}`); + console.log(` Created: ${configPath}`); + } + + // Check for results to display + const resultsDir = join(EVALUCLAUDE_DIR, 'results'); + const latestResults = join(resultsDir, 'latest.json'); + + if (!existsSync(latestResults)) { + console.log('\nโš ๏ธ No evaluation results found.'); + console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n'); + console.log(' Or run the full pipeline:'); + console.log(' evaluclaude pipeline --promptfoo\n'); } console.log(`\n๐Ÿš€ Starting Promptfoo UI on port ${port}...`); - console.log(` Config: ${configPath}\n`); + console.log(` Results: ${latestResults}\n`); - await launchPromptfooUI(port, configPath, options.open); + // Use promptfoo view with the results file + await launchPromptfooView(port, latestResults, options.open); } catch (error) { console.error('Error launching UI:', error instanceof Error ? error.message : error); process.exit(1); @@ -71,12 +84,21 @@ export const evalCommand = new Command('eval') .option('-o, --output ', 'Output path for results', '.evaluclaude/results') .option('--view', 'Launch UI after evaluation', false) .option('-p, --port ', 'Port for UI', '3000') + .option('--no-cache', 'Disable Promptfoo caching', false) .action(async (options) => { try { const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE); + const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); + // Generate config from spec if provided if (options.spec) { console.log('\n๐Ÿ“„ Generating Promptfoo configuration from spec...'); + + if (!existsSync(options.spec)) { + console.error(`Error: Spec file not found: ${options.spec}`); + process.exit(1); + } + const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8')); await generatePromptfooConfig(spec, { @@ -84,30 +106,57 @@ export const evalCommand = new Command('eval') outputPath: configPath, framework: detectFramework(spec), includeTraceLinks: true, + providerPath: providerPath, }); - const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); await generateTestProvider(providerPath); + + console.log(` Config: ${configPath}`); + console.log(` Provider: ${providerPath}`); + console.log(` Scenarios: ${spec.scenarios.length}`); } if (!existsSync(configPath)) { - console.error(`Error: Config not found: ${configPath}`); - console.log('Run with --spec to generate from EvalSpec.'); + console.error(`\nError: Config not found: ${configPath}`); + console.log('Run with --spec to generate from EvalSpec, or create config manually.'); process.exit(1); } - console.log('\n๐Ÿงช Running Promptfoo evaluations...\n'); + // Ensure output directory exists + mkdirSync(options.output, { recursive: true }); + + console.log('\n๐Ÿงช Running Promptfoo evaluations...'); + console.log(` Config: ${configPath}`); + console.log(` Output: ${options.output}\n`); const outputFile = join(options.output, `eval-${Date.now()}.json`); - mkdirSync(dirname(outputFile), { recursive: true }); - await runPromptfooEval(configPath, outputFile); + const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache); - console.log(`\n๐Ÿ“ Results saved: ${outputFile}`); + if (exitCode === 0) { + console.log(`\nโœ… Evaluation complete!`); + console.log(`๐Ÿ“ Results: ${outputFile}`); + } else { + console.log(`\nโš ๏ธ Evaluation finished with exit code ${exitCode}`); + console.log(`๐Ÿ“ Results: ${outputFile}`); + } + + // List traces generated during evaluation + const tracesDir = join(EVALUCLAUDE_DIR, 'traces'); + if (existsSync(tracesDir)) { + const { readdirSync } = await import('fs'); + const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json')); + if (traces.length > 0) { + console.log(`\n๐Ÿ“Š Traces generated: ${traces.length}`); + console.log(` View with: evaluclaude view --last`); + } + } if (options.view) { console.log(`\n๐Ÿš€ Launching UI on port ${options.port}...`); await launchPromptfooUI(parseInt(options.port, 10), configPath, true); + } else { + console.log(`\n View results: evaluclaude ui`); } } catch (error) { console.error('Error running eval:', error instanceof Error ? error.message : error); @@ -115,6 +164,64 @@ export const evalCommand = new Command('eval') } }); +/** + * Launch Promptfoo view to display pre-computed results. + */ +async function launchPromptfooView( + port: number, + resultsFile: string, + openBrowser: boolean +): Promise { + return new Promise((resolve, reject) => { + // Use 'promptfoo view' which opens the web UI showing results from the output directory + const resultsDir = dirname(resolvePath(resultsFile)); + const args = ['promptfoo', 'view', '--port', String(port)]; + + if (openBrowser) { + args.push('-y'); + } else { + args.push('-n'); + } + + // Pass the directory containing results + args.push(resultsDir); + + console.log(` Running: npx ${args.join(' ')}\n`); + + const child = spawn('npx', args, { + stdio: 'inherit', + env: { ...process.env }, + }); + + child.on('error', (error) => { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + console.error('\nโŒ Promptfoo not found.'); + console.error(' Install with: npm install -g promptfoo'); + console.error(' Or run: npx promptfoo --version\n'); + } else { + reject(error); + } + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`Promptfoo exited with code ${code}`)); + } + }); + + // Handle Ctrl+C gracefully + process.on('SIGINT', () => { + child.kill('SIGINT'); + process.exit(0); + }); + }); +} + +/** + * Launch Promptfoo with a config file (for running evals). + */ async function launchPromptfooUI( port: number, configPath: string, @@ -129,7 +236,8 @@ async function launchPromptfooUI( args.push('-n'); } - const configDir = dirname(configPath); + // Pass the directory containing the config + const configDir = dirname(resolvePath(configPath)); args.push(configDir); console.log(` Running: npx ${args.join(' ')}\n`); @@ -141,7 +249,9 @@ async function launchPromptfooUI( child.on('error', (error) => { if ((error as NodeJS.ErrnoException).code === 'ENOENT') { - console.error('\nโŒ Promptfoo not found. Install with: npm install -g promptfoo'); + console.error('\nโŒ Promptfoo not found.'); + console.error(' Install with: npm install -g promptfoo'); + console.error(' Or run: npx promptfoo --version\n'); } else { reject(error); } @@ -155,6 +265,7 @@ async function launchPromptfooUI( } }); + // Handle Ctrl+C gracefully process.on('SIGINT', () => { child.kill('SIGINT'); process.exit(0); @@ -162,16 +273,23 @@ async function launchPromptfooUI( }); } -async function runPromptfooEval(configPath: string, outputFile: string): Promise { +async function runPromptfooEval( + configPath: string, + outputFile: string, + noCache: boolean +): Promise { return new Promise((resolve, reject) => { const args = [ 'promptfoo', 'eval', '-c', configPath, '-o', outputFile, - '--no-cache', ]; + if (noCache) { + args.push('--no-cache'); + } + console.log(` Running: npx ${args.join(' ')}\n`); const child = spawn('npx', args, { @@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise env: { ...process.env }, }); - child.on('error', reject); + child.on('error', (error) => { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + console.error('\nโŒ Promptfoo not found.'); + console.error(' Install with: npm install -g promptfoo\n'); + reject(error); + } else { + reject(error); + } + }); child.on('close', (code) => { - if (code === 0) { - resolve(); - } else { - reject(new Error(`Promptfoo eval exited with code ${code}`)); - } + resolve(code ?? 1); }); }); } @@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise async function createDefaultConfig(configPath: string, providerPath: string): Promise { const defaultConfig = `# Evaluclaude Promptfoo Configuration # Generated by evaluclaude +# +# To populate this config from an EvalSpec: +# evaluclaude eval --spec +# +# Or run the full pipeline: +# evaluclaude analyze -o spec.json +# evaluclaude render spec.json -o tests/generated +# evaluclaude eval --spec spec.json description: "Evaluclaude functional test evaluations" @@ -204,12 +334,13 @@ providers: test_dir: ./tests/generated framework: pytest timeout: 300 + sandbox: true prompts: - "{{scenario_id}}" tests: - - description: "Example test" + - description: "Example test - replace with real scenarios" vars: scenario_id: "test_example" assert: @@ -219,12 +350,19 @@ tests: result = json.loads(output) result.get('passed', 0) > 0 +# Default test configuration +defaultTest: + metadata: + evaluclaude: true + tracesDir: .evaluclaude/traces + outputPath: .evaluclaude/results/promptfoo-results.json `; mkdirSync(dirname(configPath), { recursive: true }); writeFileSync(configPath, defaultConfig); + // Also generate the provider await generateTestProvider(providerPath); } @@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' { if (spec.repo.languages.includes('python')) { return 'pytest'; } + if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) { + return 'vitest'; + } return 'vitest'; } diff --git a/src/cli/index.ts b/src/cli/index.ts index bdc914f..ffa056b 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -8,6 +8,7 @@ import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/g import { runCommand } from './commands/run.js'; import { viewCommand, tracesCommand } from './commands/view.js'; import { uiCommand, evalCommand } from './commands/ui.js'; +import { pipelineCommand } from './commands/pipeline.js'; const program = new Command(); @@ -16,15 +17,25 @@ program .description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.') .version('0.1.0'); +// Core pipeline command - the "zero to evals" experience +program.addCommand(pipelineCommand); + +// Individual step commands program.addCommand(introCommand); program.addCommand(analyzeCommand); program.addCommand(renderCommand); +program.addCommand(runCommand); + +// Grading commands program.addCommand(gradeCommand); program.addCommand(listRubricsCommand); program.addCommand(calibrateCommand); -program.addCommand(runCommand); + +// Observability commands program.addCommand(viewCommand); program.addCommand(tracesCommand); + +// Promptfoo integration commands program.addCommand(uiCommand); program.addCommand(evalCommand); diff --git a/src/observability/trace-viewer.ts b/src/observability/trace-viewer.ts index e1c3a52..553081d 100644 --- a/src/observability/trace-viewer.ts +++ b/src/observability/trace-viewer.ts @@ -64,7 +64,7 @@ export function formatTrace(trace: EvalTrace, options: Partial = {} lines.push('โ”€'.repeat(40)); lines.push(` โœ… Passed: ${trace.execution.testsPassed}`); lines.push(` โŒ Failed: ${trace.execution.testsFailed}`); - lines.push(` โญ๏ธ Skipped: ${trace.execution.testsSkipped}`); + lines.push(` โญ๏ธ Skipped: ${trace.execution.testsSkipped ?? 0}`); lines.push(''); if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) { diff --git a/src/promptfoo/config-generator.ts b/src/promptfoo/config-generator.ts index 96536ab..737b2f0 100644 --- a/src/promptfoo/config-generator.ts +++ b/src/promptfoo/config-generator.ts @@ -1,5 +1,5 @@ import { writeFile, mkdir } from 'fs/promises'; -import { dirname, join } from 'path'; +import { dirname, join, resolve } from 'path'; import * as yaml from 'js-yaml'; import type { EvalSpec, EvalScenario } from '../analyzer/types.js'; import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js'; @@ -9,6 +9,7 @@ export interface ConfigOptions { outputPath: string; framework: 'pytest' | 'vitest' | 'jest'; includeTraceLinks: boolean; + providerPath?: string; } export async function generatePromptfooConfig( @@ -29,17 +30,24 @@ export async function generatePromptfooConfig( function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig { const tests = spec.scenarios.map(scenario => buildTest(scenario, options)); + + // Provider path should be relative to the config file location + // Since config is at .evaluclaude/promptfooconfig.yaml, the provider is at ./providers/test-runner.py + const providerRelativePath = options.providerPath + ? options.providerPath.replace('.evaluclaude/', './').replace(/^\.evaluclaude\//, './') + : './providers/test-runner.py'; return { description: `Evaluclaude functional tests for ${spec.repo.name}`, providers: [ { - id: `file://providers/test-runner.py`, + id: `file://${providerRelativePath}`, label: 'functional-tests', config: { - test_dir: options.testDir, + test_dir: resolve(options.testDir), framework: options.framework, timeout: 300, + sandbox: true, }, }, ], @@ -48,11 +56,12 @@ function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig { defaultTest: options.includeTraceLinks ? { metadata: { - traceFile: '.evaluclaude/traces/{{evalId}}.json', + evaluclaude: true, + tracesDir: './traces', }, } : undefined, - outputPath: '.evaluclaude/results/promptfoo-results.json', + outputPath: './results/promptfoo-results.json', }; } @@ -147,92 +156,51 @@ function buildAssertion(assertion: any): PromptfooAssertion { export async function generateTestProvider(outputPath: string): Promise { const providerCode = `#!/usr/bin/env python3 -"""Promptfoo provider that executes tests and returns structured results.""" +""" +Promptfoo provider that executes tests and returns structured results. + +This provider integrates with evaluclaude-harness test runners to execute +functional tests in a sandboxed environment and return results compatible +with Promptfoo's assertion system. +""" import subprocess import json import sys import os +import tempfile +import uuid +from pathlib import Path -def get_provider_response(prompt: str, options: dict, context: dict) -> dict: + +def call_api(prompt: str, options: dict, context: dict) -> dict: """Runs tests and returns structured results.""" - test_dir = options.get('config', {}).get('test_dir', './tests') - framework = options.get('config', {}).get('framework', 'pytest') - timeout = options.get('config', {}).get('timeout', 300) + config = options.get('config', {}) + test_dir = config.get('test_dir', './tests/generated') + framework = config.get('framework', 'pytest') + timeout = config.get('timeout', 300) + sandbox = config.get('sandbox', True) scenario_id = prompt.strip() + eval_id = f"eval-{uuid.uuid4().hex[:8]}" + + # Ensure traces directory exists + traces_dir = Path('.evaluclaude/traces') + traces_dir.mkdir(parents=True, exist_ok=True) try: if framework == 'pytest': - result = subprocess.run( - [ - 'python', '-m', 'pytest', - '--json-report', - '--json-report-file=/tmp/pytest_results.json', - '-k', scenario_id, - test_dir - ], - capture_output=True, - text=True, - timeout=timeout - ) - - try: - with open('/tmp/pytest_results.json') as f: - report = json.load(f) - - output = { - 'passed': report.get('summary', {}).get('passed', 0), - 'failed': report.get('summary', {}).get('failed', 0), - 'skipped': report.get('summary', {}).get('skipped', 0), - 'tests': report.get('tests', []), - 'stdout': result.stdout, - 'stderr': result.stderr, - 'exit_code': result.returncode, - } - except FileNotFoundError: - output = { - 'passed': 0, - 'failed': 1, - 'error': 'Failed to generate pytest report', - 'stdout': result.stdout, - 'stderr': result.stderr, - } - + output = run_pytest(test_dir, scenario_id, timeout, eval_id) elif framework in ('vitest', 'jest'): - cmd = ['npx', framework, 'run', '--reporter=json'] - if scenario_id: - cmd.extend(['--testNamePattern', scenario_id]) - cmd.append(test_dir) - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout - ) - - try: - report = json.loads(result.stdout) - output = { - 'passed': report.get('numPassedTests', 0), - 'failed': report.get('numFailedTests', 0), - 'skipped': report.get('numSkippedTests', 0), - 'tests': report.get('testResults', []), - 'exit_code': result.returncode, - } - except json.JSONDecodeError: - output = { - 'passed': 0, - 'failed': 1, - 'error': 'Failed to parse test output', - 'stdout': result.stdout, - 'stderr': result.stderr, - } + output = run_js_tests(test_dir, scenario_id, timeout, framework, eval_id) else: - output = {'error': f'Unknown framework: {framework}'} + output = {'error': f'Unknown framework: {framework}', 'passed': 0, 'failed': 1} + # Add trace reference + output['eval_id'] = eval_id + output['trace_file'] = str(traces_dir / f"{eval_id}.json") + return { 'output': json.dumps(output), 'error': None, @@ -240,32 +208,187 @@ def get_provider_response(prompt: str, options: dict, context: dict) -> dict: except subprocess.TimeoutExpired: return { - 'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}), + 'output': json.dumps({ + 'error': 'Test execution timed out', + 'passed': 0, + 'failed': 1, + 'eval_id': eval_id, + }), 'error': None, } except Exception as e: return { - 'output': None, + 'output': json.dumps({ + 'error': str(e), + 'passed': 0, + 'failed': 1, + 'eval_id': eval_id, + }), 'error': str(e), } + +def run_pytest(test_dir: str, scenario_id: str, timeout: int, eval_id: str) -> dict: + """Run pytest and return structured results.""" + with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f: + report_file = f.name + + cmd = [ + sys.executable, '-m', 'pytest', + '--json-report', + f'--json-report-file={report_file}', + '-v', + '--tb=short', + ] + + if scenario_id: + cmd.extend(['-k', scenario_id]) + + cmd.append(test_dir) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + cwd=os.getcwd(), + ) + + try: + with open(report_file) as f: + report = json.load(f) + + summary = report.get('summary', {}) + tests = report.get('tests', []) + + output = { + 'passed': summary.get('passed', 0), + 'failed': summary.get('failed', 0), + 'skipped': summary.get('skipped', 0), + 'total': summary.get('total', 0), + 'duration': report.get('duration', 0) * 1000, # Convert to ms + 'tests': [ + { + 'id': extract_scenario_id(t.get('nodeid', '')), + 'name': t.get('nodeid', ''), + 'status': t.get('outcome', 'unknown'), + 'duration': (t.get('call', {}).get('duration', 0) or 0) * 1000, + 'error': t.get('call', {}).get('crash', {}).get('message') if t.get('call', {}).get('crash') else None, + } + for t in tests + ], + 'exit_code': result.returncode, + } + except (FileNotFoundError, json.JSONDecodeError) as e: + output = { + 'passed': 0, + 'failed': 1, + 'error': f'Failed to parse pytest report: {e}', + 'stdout': result.stdout[-2000:] if result.stdout else '', + 'stderr': result.stderr[-2000:] if result.stderr else '', + } + finally: + try: + os.unlink(report_file) + except OSError: + pass + + return output + + +def run_js_tests(test_dir: str, scenario_id: str, timeout: int, framework: str, eval_id: str) -> dict: + """Run vitest/jest and return structured results.""" + with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f: + report_file = f.name + + cmd = ['npx', framework, 'run', '--reporter=json', f'--outputFile={report_file}'] + + if scenario_id: + cmd.extend(['--testNamePattern', scenario_id]) + + cmd.append(test_dir) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + cwd=os.getcwd(), + ) + + try: + with open(report_file) as f: + report = json.load(f) + + output = { + 'passed': report.get('numPassedTests', 0), + 'failed': report.get('numFailedTests', 0), + 'skipped': report.get('numSkippedTests', 0), + 'total': report.get('numTotalTests', 0), + 'tests': [], + 'exit_code': result.returncode, + } + + for test_file in report.get('testResults', []): + for assertion in test_file.get('assertionResults', []): + output['tests'].append({ + 'id': extract_scenario_id(assertion.get('fullName', '')), + 'name': assertion.get('fullName', ''), + 'status': assertion.get('status', 'unknown'), + 'duration': assertion.get('duration', 0), + 'error': assertion.get('failureMessages', [None])[0] if assertion.get('failureMessages') else None, + }) + + except (FileNotFoundError, json.JSONDecodeError) as e: + output = { + 'passed': 0, + 'failed': 1, + 'error': f'Failed to parse {framework} report: {e}', + 'stdout': result.stdout[-2000:] if result.stdout else '', + 'stderr': result.stderr[-2000:] if result.stderr else '', + } + finally: + try: + os.unlink(report_file) + except OSError: + pass + + return output + + +def extract_scenario_id(nodeid: str) -> str: + """Extract scenario ID from test name.""" + import re + match = re.search(r'test[_\\s]([a-zA-Z0-9_-]+)', nodeid, re.IGNORECASE) + return match.group(1) if match else nodeid.replace(' ', '_') + + +def get_provider_response(prompt: str, options: dict, context: dict) -> dict: + """Alias for call_api for backwards compatibility.""" + return call_api(prompt, options, context) + + if __name__ == '__main__': - # For testing the provider directly import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--scenario', default='') - parser.add_argument('--test-dir', default='./tests') - parser.add_argument('--framework', default='pytest') + parser = argparse.ArgumentParser(description='Run tests for Promptfoo') + parser.add_argument('--scenario', default='', help='Scenario ID to filter') + parser.add_argument('--test-dir', default='./tests/generated', help='Test directory') + parser.add_argument('--framework', default='pytest', help='Test framework') + parser.add_argument('--timeout', type=int, default=300, help='Timeout in seconds') args = parser.parse_args() - result = get_provider_response( + result = call_api( args.scenario, - {'config': {'test_dir': args.test_dir, 'framework': args.framework}}, + {'config': { + 'test_dir': args.test_dir, + 'framework': args.framework, + 'timeout': args.timeout, + }}, {} ) - print(json.dumps(result, indent=2)) + print(json.dumps(json.loads(result['output']), indent=2) if result['output'] else result['error']) `; await mkdir(dirname(outputPath), { recursive: true }); - await writeFile(outputPath, providerCode); + await writeFile(outputPath, providerCode, { mode: 0o755 }); } diff --git a/src/promptfoo/index.ts b/src/promptfoo/index.ts index 42deb5c..7c3ba17 100644 --- a/src/promptfoo/index.ts +++ b/src/promptfoo/index.ts @@ -1,2 +1,13 @@ export * from './types.js'; -export { generatePromptfooConfig, generateTestProvider } from './config-generator.js'; +export { generatePromptfooConfig, generateTestProvider, type ConfigOptions } from './config-generator.js'; +export { + runTestsForPromptfoo, + savePromptfooResults, + type RunTestsForPromptfooOptions, + type PromptfooProviderResult, +} from './runner-bridge.js'; +export { + exportToPromptfooFormat, + generateViewOnlyConfig, + type ExportOptions, +} from './results-exporter.js'; diff --git a/src/promptfoo/results-exporter.ts b/src/promptfoo/results-exporter.ts new file mode 100644 index 0000000..f78f338 --- /dev/null +++ b/src/promptfoo/results-exporter.ts @@ -0,0 +1,127 @@ +/** + * Export test execution results to Promptfoo format for viewing in the UI. + * + * Instead of using Promptfoo to run tests (which requires a provider that + * responds quickly), we run tests ourselves and export results to Promptfoo's + * result format. This allows us to use Promptfoo's excellent visualization UI. + */ + +import { writeFile, mkdir } from 'fs/promises'; +import { join } from 'path'; +import type { ExecutionResult } from '../runners/types.js'; +import type { EvalSpec } from '../analyzer/types.js'; +import type { PromptfooResult, PromptfooTestResult } from './types.js'; + +export interface ExportOptions { + outputDir: string; + evalId?: string; + includeSpec?: boolean; +} + +/** + * Export ExecutionResult to Promptfoo result format. + */ +export async function exportToPromptfooFormat( + result: ExecutionResult, + spec: EvalSpec | undefined, + options: ExportOptions +): Promise { + const { outputDir, evalId = `eval-${Date.now()}` } = options; + + const promptfooResult = buildPromptfooResult(result, spec, evalId); + + await mkdir(outputDir, { recursive: true }); + const outputPath = join(outputDir, `${evalId}.json`); + await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2)); + + // Also write the latest.json symlink equivalent + const latestPath = join(outputDir, 'latest.json'); + await writeFile(latestPath, JSON.stringify(promptfooResult, null, 2)); + + return outputPath; +} + +function buildPromptfooResult( + result: ExecutionResult, + spec: EvalSpec | undefined, + evalId: string +): PromptfooResult { + const testResults: PromptfooTestResult[] = result.tests.map(test => { + // Try to find matching scenario from spec + const scenario = spec?.scenarios.find(s => + s.id === test.id || test.name.includes(s.id) + ); + + return { + prompt: { + raw: scenario?.id || test.id, + label: scenario?.name || test.name, + }, + vars: { + scenario_id: scenario?.id || test.id, + target_module: scenario?.target.module || '', + target_function: scenario?.target.function || '', + description: scenario?.description || test.name, + }, + response: { + output: test.status === 'passed' + ? 'Test passed successfully' + : test.error?.message || 'Test failed', + }, + gradingResult: { + pass: test.status === 'passed', + score: test.status === 'passed' ? 1 : 0, + reason: test.status === 'passed' + ? 'All assertions passed' + : test.error?.message || 'Test failed', + componentResults: test.assertions.details.map(a => ({ + pass: a.passed, + score: a.passed ? 1 : 0, + reason: a.description, + assertion: { + type: 'custom', + value: a.description, + }, + })), + }, + success: test.status === 'passed', + error: test.error?.message, + }; + }); + + return { + version: 1, + timestamp: new Date().toISOString(), + results: testResults, + stats: { + successes: result.summary.passed, + failures: result.summary.failed, + tokenUsage: { + total: 0, + prompt: 0, + completion: 0, + }, + }, + }; +} + +/** + * Generate a minimal Promptfoo config that just views results (no provider). + */ +export function generateViewOnlyConfig(spec: EvalSpec): string { + return `# Evaluclaude Results Config +# This config is for viewing results only - tests are run via evaluclaude run + +description: "Test results for ${spec.repo.name}" + +# No providers needed - we pre-run tests and import results +providers: [] + +prompts: [] + +tests: [] + +# Results are stored here by evaluclaude run --export-promptfoo +outputPath: .evaluclaude/results/latest.json +`; +} diff --git a/src/promptfoo/runner-bridge.ts b/src/promptfoo/runner-bridge.ts new file mode 100644 index 0000000..7cff619 --- /dev/null +++ b/src/promptfoo/runner-bridge.ts @@ -0,0 +1,194 @@ +/** + * Bridge between our test runners and Promptfoo's provider interface. + * + * This module provides a unified way to run tests that works both: + * 1. Standalone via our `run` command + * 2. As a Promptfoo provider via the generated test-runner.py + * + * Results are stored in a format compatible with Promptfoo's expectations. + */ + +import { writeFile, mkdir } from 'fs/promises'; +import { join, dirname } from 'path'; +import { runTests, type ExecutionResult, type ExecutionOptions, DEFAULT_SANDBOX_CONFIG } from '../runners/index.js'; +import { createTracer, saveTrace, type EvalTrace } from '../observability/index.js'; + +export interface PromptfooProviderResult { + output: string; + error: string | null; + tokenUsage?: { + total: number; + prompt: number; + completion: number; + }; +} + +export interface RunTestsForPromptfooOptions { + scenarioId: string; + testDir: string; + framework: 'pytest' | 'vitest' | 'jest'; + timeout?: number; + sandbox?: boolean; + evalId?: string; + recordTrace?: boolean; +} + +/** + * Run tests for a specific scenario and format results for Promptfoo. + */ +export async function runTestsForPromptfoo( + options: RunTestsForPromptfooOptions +): Promise { + const { + scenarioId, + testDir, + framework, + timeout = 300000, + sandbox = true, + evalId = `eval-${Date.now()}`, + recordTrace = true, + } = options; + + const tracer = recordTrace ? createTracer(evalId) : null; + + try { + const execOptions: ExecutionOptions = { + framework, + sandbox, + timeout, + parallel: false, + filter: scenarioId ? [scenarioId] : undefined, + cwd: process.cwd(), + }; + + tracer?.recordIntrospection({ + filesAnalyzed: [testDir], + duration: 0, + }); + + const result = await runTests( + testDir, + execOptions, + sandbox ? DEFAULT_SANDBOX_CONFIG : undefined + ); + + // Record execution results in trace + if (tracer) { + tracer.recordExecution({ + testsPassed: result.summary.passed, + testsFailed: result.summary.failed, + testsSkipped: result.summary.skipped, + }); + + for (const test of result.tests) { + if (test.status === 'failed' || test.status === 'error') { + tracer.recordTestFailure({ + scenarioId: test.id, + testName: test.name, + error: test.error?.message || 'Unknown error', + stack: test.error?.stack, + }); + } + } + } + + // Build Promptfoo-compatible output + const promptfooOutput = buildPromptfooOutput(result, scenarioId); + + // Save trace if enabled + if (tracer) { + const trace = tracer.finalize(); + await saveTrace(trace); + } + + return { + output: JSON.stringify(promptfooOutput), + error: null, + }; + } catch (error) { + if (tracer) { + tracer.recordError(error instanceof Error ? error : new Error(String(error))); + const trace = tracer.finalize(); + await saveTrace(trace); + } + + return { + output: JSON.stringify({ + passed: 0, + failed: 1, + error: error instanceof Error ? error.message : String(error), + }), + error: error instanceof Error ? error.message : String(error), + }; + } +} + +/** + * Build Promptfoo-compatible output from ExecutionResult. + */ +function buildPromptfooOutput( + result: ExecutionResult, + scenarioId?: string +): Record { + const matchingTests = scenarioId + ? result.tests.filter(t => t.id === scenarioId || t.name.includes(scenarioId)) + : result.tests; + + return { + passed: matchingTests.filter(t => t.status === 'passed').length, + failed: matchingTests.filter(t => t.status === 'failed' || t.status === 'error').length, + skipped: matchingTests.filter(t => t.status === 'skipped').length, + total: matchingTests.length, + tests: matchingTests.map(t => ({ + id: t.id, + name: t.name, + status: t.status, + duration: t.duration, + error: t.error?.message, + })), + summary: { + ...result.summary, + matchedScenario: scenarioId, + }, + errors: result.errors, + }; +} + +/** + * Generate a Promptfoo-compatible results file from our execution results. + */ +export async function savePromptfooResults( + result: ExecutionResult, + evalId: string, + outputDir: string = '.evaluclaude/results' +): Promise { + const promptfooResult = { + version: 1, + timestamp: new Date().toISOString(), + evalId, + results: result.tests.map(t => ({ + prompt: { raw: t.id, label: t.name }, + vars: { scenario_id: t.id }, + response: { + output: t.status === 'passed' ? 'PASS' : t.error?.message || 'FAIL', + }, + gradingResult: { + pass: t.status === 'passed', + score: t.status === 'passed' ? 1 : 0, + reason: t.error?.message || (t.status === 'passed' ? 'Test passed' : 'Test failed'), + }, + success: t.status === 'passed', + error: t.error?.message, + })), + stats: { + successes: result.summary.passed, + failures: result.summary.failed, + }, + }; + + await mkdir(outputDir, { recursive: true }); + const outputPath = join(outputDir, `promptfoo-${evalId}.json`); + await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2)); + + return outputPath; +} diff --git a/src/runners/index.ts b/src/runners/index.ts index ae861b9..050986e 100644 --- a/src/runners/index.ts +++ b/src/runners/index.ts @@ -73,8 +73,8 @@ export function formatResults(result: ExecutionResult): string { lines.push(` Total: ${result.summary.total}`); lines.push(` โœ… Passed: ${result.summary.passed}`); lines.push(` โŒ Failed: ${result.summary.failed}`); - lines.push(` โญ๏ธ Skipped: ${result.summary.skipped}`); - lines.push(` โฑ๏ธ Duration: ${result.summary.duration}ms`); + lines.push(` โญ๏ธ Skipped: ${result.summary.skipped ?? 0}`); + lines.push(` โฑ๏ธ Duration: ${result.summary.duration || 0}ms`); if (result.errors.length > 0) { lines.push('');