improvements and promptfoo

This commit is contained in:
Harivansh Rathi 2026-01-11 20:02:30 -05:00
parent 6698c12e5b
commit ff5300f4e0
13 changed files with 1082 additions and 117 deletions

View file

@ -0,0 +1,257 @@
import { Command } from 'commander';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, resolve } from 'path';
import { analyze } from '../../introspector/index.js';
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
import type { EvalSpec } from '../../analyzer/types.js';
// Default directory where all pipeline artifacts (spec, traces, results) are stored.
const EVALUCLAUDE_DIR = '.evaluclaude';
// Options for the `pipeline` command, populated by commander from the
// .option() declarations below. Numeric-looking options (maxScenarios)
// arrive as raw strings and are parsed in the action handler.
interface PipelineOptions {
// Output directory for all artifacts; falls back to EVALUCLAUDE_DIR when unset.
output?: string;
// Enable interactive mode (clarifying questions are asked via inquirer).
interactive?: boolean;
// Comma-separated list of modules/functions to focus the analysis on.
focus?: string;
// Maximum number of scenarios to generate; raw CLI string, parseInt'd later.
maxScenarios: string;
// Directory where rendered test files are written.
testDir: string;
// Test framework override (pytest, vitest, jest); auto-detected when absent.
framework?: string;
// Skip analysis and reuse an existing spec.json when one is present.
skipAnalyze?: boolean;
// Skip rendering test files from the spec.
skipRender?: boolean;
// Skip executing the rendered tests.
skipRun?: boolean;
// Also generate a Promptfoo configuration for UI viewing.
promptfoo?: boolean;
// Suppress progress messages emitted through the `log` helper.
quiet?: boolean;
}
// `pipeline` command: chains the full eval-generation flow —
// introspect → analyze (Claude) → render tests → run tests → optional
// Promptfoo config. Each stage can be skipped via a --skip-* flag, and
// intermediate artifacts are persisted under the output directory so a
// later run can resume with --skip-analyze / --skip-render.
export const pipelineCommand = new Command('pipeline')
.description('Run the full eval generation pipeline: introspect → analyze → render → run')
.argument('[path]', 'Path to the repository to analyze', '.')
.option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
.option('-i, --interactive', 'Enable interactive mode with clarifying questions')
.option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
.option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
.option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
.option('--skip-analyze', 'Skip analysis, use existing spec')
.option('--skip-render', 'Skip rendering, use existing tests')
.option('--skip-run', 'Skip test execution')
.option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
.option('--quiet', 'Suppress progress messages')
.action(async (repoPath: string, options: PipelineOptions) => {
const absolutePath = resolve(repoPath);
// --quiet swaps the progress logger for a no-op. NOTE(review): the banner
// and final summary below use console.log directly, so they still print
// under --quiet — confirm that is intentional.
const log = options.quiet ? () => {} : console.log;
const outputDir = options.output || EVALUCLAUDE_DIR;
console.log('\n🚀 Evaluclaude Pipeline');
console.log('═'.repeat(50));
console.log(` Repository: ${absolutePath}`);
console.log(` Output: ${outputDir}`);
console.log('═'.repeat(50) + '\n');
// Ensure output directories exist
mkdirSync(outputDir, { recursive: true });
mkdirSync(options.testDir, { recursive: true });
const specPath = join(outputDir, 'spec.json');
const tracesDir = join(outputDir, 'traces');
const resultsDir = join(outputDir, 'results');
mkdirSync(tracesDir, { recursive: true });
mkdirSync(resultsDir, { recursive: true });
let spec: EvalSpec;
// Step 1: Introspection + Analysis
// --skip-analyze only takes effect when a previously saved spec exists;
// otherwise the analysis runs regardless of the flag.
if (options.skipAnalyze && existsSync(specPath)) {
log('📋 Using existing EvalSpec...');
// NOTE(review): parsed JSON is assigned to EvalSpec without validation —
// a corrupt spec.json would surface later as an opaque error.
spec = JSON.parse(readFileSync(specPath, 'utf-8'));
log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
} else {
log('🔬 Step 1: Introspecting codebase...');
try {
const repoSummary = await analyze({
root: absolutePath,
// Forward per-file progress messages unless --quiet.
onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
});
log(` Files: ${repoSummary.files.length}`);
log(` Languages: ${repoSummary.languages.join(', ')}`);
log('');
log('🤖 Step 2: Generating EvalSpec with Claude...\n');
const focus = options.focus?.split(',').map(s => s.trim());
// NOTE(review): parseInt result is not checked for NaN — a bad
// --max-scenarios value flows into the generator unvalidated.
const maxScenarios = parseInt(options.maxScenarios, 10);
let result;
if (options.interactive) {
// Lazy-load inquirer so non-interactive runs never pay for it.
const { default: inquirer } = await import('inquirer');
result = await generateEvalSpecInteractive(
repoSummary,
// Bridge Claude's clarifying questions to an interactive prompt.
async (question: string) => {
const { answer } = await inquirer.prompt([{
type: 'input',
name: 'answer',
message: `🤖 Claude asks: ${question}`,
}]);
return answer;
},
{ focus, maxScenarios }
);
} else {
result = await generateEvalSpec(repoSummary, {
interactive: false,
focus,
maxScenarios,
});
}
spec = result.spec;
// Save the spec so later runs can reuse it via --skip-analyze.
writeFileSync(specPath, JSON.stringify(spec, null, 2));
log(`\n✅ EvalSpec generated!`);
log(` Scenarios: ${spec.scenarios.length}`);
log(` Tokens: ${result.tokensUsed}`);
log(` Saved: ${specPath}\n`);
} catch (error) {
console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Step 2: Render tests
if (!options.skipRender) {
log('📝 Step 3: Rendering test files...');
try {
// Explicit --framework wins; otherwise infer it from the spec.
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
const renderResult = await renderSpec(spec, {
outputDir: options.testDir,
framework,
includeFixtures: true,
generateMocks: true,
dryRun: false,
});
log(` Framework: ${framework}`);
log(` Files: ${renderResult.stats.fileCount}`);
log(` Scenarios: ${renderResult.stats.scenarioCount}`);
log(` Assertions: ${renderResult.stats.assertionCount}`);
log(` Output: ${options.testDir}\n`);
} catch (error) {
console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Step 3: Run tests
if (!options.skipRun) {
log('🧪 Step 4: Running tests...\n');
try {
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
// Record the whole run as a trace: introspection and generation stats
// are reconstructed from the spec (duration unknown here, hence 0).
const tracer = createTracer(spec.repo.name);
tracer.recordIntrospection({
filesAnalyzed: spec.scenarios.map(s => s.target.module),
totalFunctions: spec.scenarios.length,
duration: 0,
});
tracer.recordGeneration({
scenariosGenerated: spec.scenarios.length,
filesWritten: [options.testDir],
});
const result = await runTests(
options.testDir,
{
framework,
sandbox: true,
timeout: 300000,
parallel: false,
cwd: process.cwd(),
},
DEFAULT_SANDBOX_CONFIG
);
tracer.recordExecution({
testsPassed: result.summary.passed,
testsFailed: result.summary.failed,
testsSkipped: result.summary.skipped,
});
// Attach a failure record to the trace for every failed/errored test.
for (const test of result.tests) {
if (test.status === 'failed' || test.status === 'error') {
tracer.recordTestFailure({
scenarioId: test.id,
testName: test.name,
error: test.error?.message || 'Unknown error',
stack: test.error?.stack,
});
}
}
const trace = tracer.finalize();
const tracePath = await saveTrace(trace);
log(formatResults(result));
log(`📊 Trace saved: ${tracePath}`);
log(` View with: evaluclaude view ${trace.id}\n`);
// Save results
const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
writeFileSync(resultsPath, JSON.stringify(result, null, 2));
} catch (error) {
console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Step 4: Generate Promptfoo config
// Unlike earlier stages, a failure here is non-fatal (no process.exit):
// the pipeline's core artifacts already exist at this point.
if (options.promptfoo) {
log('📦 Step 5: Generating Promptfoo configuration...');
try {
const configPath = join(outputDir, 'promptfooconfig.yaml');
const providerPath = join(outputDir, 'providers', 'test-runner.py');
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
await generatePromptfooConfig(spec, {
testDir: options.testDir,
outputPath: configPath,
framework,
includeTraceLinks: true,
providerPath,
});
await generateTestProvider(providerPath);
log(` Config: ${configPath}`);
log(` Provider: ${providerPath}`);
log(`\n Launch UI with: evaluclaude ui\n`);
} catch (error) {
console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
}
}
console.log('═'.repeat(50));
console.log('✅ Pipeline complete!');
console.log('═'.repeat(50));
console.log(`\nNext steps:`);
console.log(` View traces: evaluclaude view --last`);
console.log(` List all traces: evaluclaude traces`);
if (options.promptfoo) {
console.log(` Launch UI: evaluclaude ui`);
console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
}
console.log('');
});

View file

@ -10,6 +10,7 @@ import {
DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
import type { EvalSpec } from '../../analyzer/types.js';
export const runCommand = new Command('run')
@ -25,6 +26,7 @@ export const runCommand = new Command('run')
.option('-o, --output <file>', 'Output results to JSON file')
.option('--trace', 'Record execution trace', true)
.option('--no-trace', 'Disable execution tracing')
.option('--export-promptfoo', 'Export results in Promptfoo format', false)
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
.action(async (testDir: string, options) => {
try {
@ -109,6 +111,16 @@ export const runCommand = new Command('run')
console.log(`\n📁 Results saved to: ${options.output}`);
}
// Export to Promptfoo format for UI viewing
if (options.exportPromptfoo) {
const exportPath = await exportToPromptfooFormat(result, spec, {
outputDir: '.evaluclaude/results',
evalId: `eval-${Date.now()}`,
});
console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
console.log(` View with: evaluclaude ui`);
}
if (tracer) {
const trace = tracer.finalize();
const tracePath = await saveTrace(trace);

View file

@ -1,7 +1,7 @@
import { Command } from 'commander';
import { spawn, type ChildProcess } from 'child_process';
import { spawn } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import { join, dirname, resolve as resolvePath } from 'path';
import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
// If spec provided with --generate, create/update Promptfoo config
if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...');
@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
providerPath: providerPath,
});
await generateTestProvider(providerPath);
@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
console.log(` Provider: ${providerPath}`);
}
// Check for existing config, create default if missing
if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.');
console.log(' Run with --spec <file> --generate to create one.\n');
console.log(' Or create one manually:');
console.log(` ${configPath}\n`);
console.log(' Creating default configuration...\n');
await createDefaultConfig(configPath, providerPath);
console.log(` Created default config at ${configPath}`);
console.log(` Created: ${configPath}`);
}
// Check for results to display
const resultsDir = join(EVALUCLAUDE_DIR, 'results');
const latestResults = join(resultsDir, 'latest.json');
if (!existsSync(latestResults)) {
console.log('\n⚠ No evaluation results found.');
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
console.log(' Or run the full pipeline:');
console.log(' evaluclaude pipeline <path> --promptfoo\n');
}
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
console.log(` Config: ${configPath}\n`);
console.log(` Results: ${latestResults}\n`);
await launchPromptfooUI(port, configPath, options.open);
// Use promptfoo view with the results file
await launchPromptfooView(port, latestResults, options.open);
} catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1);
@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
.option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000')
.option('--no-cache', 'Disable Promptfoo caching', false)
.action(async (options) => {
try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
// Generate config from spec if provided
if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...');
if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`);
process.exit(1);
}
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
@ -84,30 +106,57 @@ export const evalCommand = new Command('eval')
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
providerPath: providerPath,
});
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`);
console.log(` Provider: ${providerPath}`);
console.log(` Scenarios: ${spec.scenarios.length}`);
}
if (!existsSync(configPath)) {
console.error(`Error: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec.');
console.error(`\nError: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.');
process.exit(1);
}
console.log('\n🧪 Running Promptfoo evaluations...\n');
// Ensure output directory exists
mkdirSync(options.output, { recursive: true });
console.log('\n🧪 Running Promptfoo evaluations...');
console.log(` Config: ${configPath}`);
console.log(` Output: ${options.output}\n`);
const outputFile = join(options.output, `eval-${Date.now()}.json`);
mkdirSync(dirname(outputFile), { recursive: true });
await runPromptfooEval(configPath, outputFile);
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache);
console.log(`\n📁 Results saved: ${outputFile}`);
if (exitCode === 0) {
console.log(`\n✅ Evaluation complete!`);
console.log(`📁 Results: ${outputFile}`);
} else {
console.log(`\n⚠ Evaluation finished with exit code ${exitCode}`);
console.log(`📁 Results: ${outputFile}`);
}
// List traces generated during evaluation
const tracesDir = join(EVALUCLAUDE_DIR, 'traces');
if (existsSync(tracesDir)) {
const { readdirSync } = await import('fs');
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
if (traces.length > 0) {
console.log(`\n📊 Traces generated: ${traces.length}`);
console.log(` View with: evaluclaude view --last`);
}
}
if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`);
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
} else {
console.log(`\n View results: evaluclaude ui`);
}
} catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error);
@ -115,6 +164,64 @@ export const evalCommand = new Command('eval')
}
});
/**
 * Launch `promptfoo view` to display pre-computed results in the web UI.
 *
 * Spawns `npx promptfoo view <resultsDir>` and resolves when the child
 * exits with code 0; rejects on any spawn error or non-zero exit so the
 * awaiting caller always settles. (The previous version printed an
 * install hint on ENOENT but never rejected, leaving the promise — and
 * the CLI — hanging forever.)
 *
 * @param port        Port for the Promptfoo web server.
 * @param resultsFile Path to a results JSON file; promptfoo is given its
 *                    parent directory, since `view` scans a directory.
 * @param openBrowser When true pass `-y` (auto-open browser), else `-n`.
 */
async function launchPromptfooView(
  port: number,
  resultsFile: string,
  openBrowser: boolean
): Promise<void> {
  return new Promise((resolve, reject) => {
    // Use 'promptfoo view' which opens the web UI showing results from the output directory
    const resultsDir = dirname(resolvePath(resultsFile));
    const args = ['promptfoo', 'view', '--port', String(port)];
    // -y auto-opens the browser; -n suppresses it.
    args.push(openBrowser ? '-y' : '-n');
    // Pass the directory containing results
    args.push(resultsDir);

    console.log(` Running: npx ${args.join(' ')}\n`);

    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });

    child.on('error', (error) => {
      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
        console.error('\n❌ Promptfoo not found.');
        console.error(' Install with: npm install -g promptfoo');
        console.error(' Or run: npx promptfoo --version\n');
      }
      // Always settle: rejecting here prevents the caller from hanging
      // on a promise that would otherwise never resolve.
      reject(error);
    });

    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`Promptfoo exited with code ${code}`));
      }
    });

    // Handle Ctrl+C gracefully. `once` avoids stacking a new SIGINT
    // listener on every invocation (the `on` version leaks listeners).
    process.once('SIGINT', () => {
      child.kill('SIGINT');
      process.exit(0);
    });
  });
}
/**
* Launch Promptfoo with a config file (for running evals).
*/
async function launchPromptfooUI(
port: number,
configPath: string,
@ -129,7 +236,8 @@ async function launchPromptfooUI(
args.push('-n');
}
const configDir = dirname(configPath);
// Pass the directory containing the config
const configDir = dirname(resolvePath(configPath));
args.push(configDir);
console.log(` Running: npx ${args.join(' ')}\n`);
@ -141,7 +249,9 @@ async function launchPromptfooUI(
child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
console.error('\n❌ Promptfoo not found.');
console.error(' Install with: npm install -g promptfoo');
console.error(' Or run: npx promptfoo --version\n');
} else {
reject(error);
}
@ -155,6 +265,7 @@ async function launchPromptfooUI(
}
});
// Handle Ctrl+C gracefully
process.on('SIGINT', () => {
child.kill('SIGINT');
process.exit(0);
@ -162,16 +273,23 @@ async function launchPromptfooUI(
});
}
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
async function runPromptfooEval(
configPath: string,
outputFile: string,
noCache: boolean
): Promise<number> {
return new Promise((resolve, reject) => {
const args = [
'promptfoo',
'eval',
'-c', configPath,
'-o', outputFile,
'--no-cache',
];
if (noCache) {
args.push('--no-cache');
}
console.log(` Running: npx ${args.join(' ')}\n`);
const child = spawn('npx', args, {
@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
env: { ...process.env },
});
child.on('error', reject);
child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found.');
console.error(' Install with: npm install -g promptfoo\n');
reject(error);
} else {
reject(error);
}
});
child.on('close', (code) => {
if (code === 0) {
resolve();
} else {
reject(new Error(`Promptfoo eval exited with code ${code}`));
}
resolve(code ?? 1);
});
});
}
@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
const defaultConfig = `# Evaluclaude Promptfoo Configuration
# Generated by evaluclaude
#
# To populate this config from an EvalSpec:
# evaluclaude eval --spec <evalspec.json>
#
# Or run the full pipeline:
# evaluclaude analyze <path> -o spec.json
# evaluclaude render spec.json -o tests/generated
# evaluclaude eval --spec spec.json
description: "Evaluclaude functional test evaluations"
@ -204,12 +334,13 @@ providers:
test_dir: ./tests/generated
framework: pytest
timeout: 300
sandbox: true
prompts:
- "{{scenario_id}}"
tests:
- description: "Example test"
- description: "Example test - replace with real scenarios"
vars:
scenario_id: "test_example"
assert:
@ -219,12 +350,19 @@ tests:
result = json.loads(output)
result.get('passed', 0) > 0
# Default test configuration
defaultTest:
metadata:
evaluclaude: true
tracesDir: .evaluclaude/traces
outputPath: .evaluclaude/results/promptfoo-results.json
`;
mkdirSync(dirname(configPath), { recursive: true });
writeFileSync(configPath, defaultConfig);
// Also generate the provider
await generateTestProvider(providerPath);
}
@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
if (spec.repo.languages.includes('python')) {
return 'pytest';
}
if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) {
return 'vitest';
}
return 'vitest';
}