mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-18 00:02:46 +00:00
promptfoo ui and testcon
This commit is contained in:
parent
e0c36241b0
commit
6698c12e5b
18 changed files with 2169 additions and 0 deletions
145
src/cli/commands/run.ts
Normal file
145
src/cli/commands/run.ts
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import { Command } from 'commander';
|
||||
import { existsSync, readFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import {
|
||||
runTests,
|
||||
formatResults,
|
||||
detectTestFramework,
|
||||
type TestFramework,
|
||||
type ExecutionOptions,
|
||||
DEFAULT_SANDBOX_CONFIG
|
||||
} from '../../runners/index.js';
|
||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
export const runCommand = new Command('run')
|
||||
.description('Run generated tests and collect results')
|
||||
.argument('[test-dir]', 'Directory containing test files', './tests/generated')
|
||||
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
|
||||
.option('--sandbox', 'Run tests in sandbox mode', true)
|
||||
.option('--no-sandbox', 'Disable sandbox mode')
|
||||
.option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
|
||||
.option('-p, --parallel', 'Run tests in parallel', false)
|
||||
.option('--filter <patterns...>', 'Run only tests matching patterns')
|
||||
.option('-o, --output <file>', 'Output results to JSON file')
|
||||
.option('--trace', 'Record execution trace', true)
|
||||
.option('--no-trace', 'Disable execution tracing')
|
||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||
.action(async (testDir: string, options) => {
|
||||
try {
|
||||
console.log(`\n🧪 Running tests from ${testDir}...\n`);
|
||||
|
||||
if (!existsSync(testDir)) {
|
||||
console.error(`Error: Test directory not found: ${testDir}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const framework: TestFramework = options.framework || detectTestFramework(testDir);
|
||||
console.log(` Framework: ${framework}`);
|
||||
console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
|
||||
console.log(` Timeout: ${options.timeout}ms`);
|
||||
|
||||
let spec: EvalSpec | undefined;
|
||||
if (options.spec && existsSync(options.spec)) {
|
||||
spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
|
||||
console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
|
||||
}
|
||||
|
||||
const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
|
||||
|
||||
const execOptions: ExecutionOptions = {
|
||||
framework,
|
||||
sandbox: options.sandbox,
|
||||
timeout: parseInt(options.timeout, 10),
|
||||
parallel: options.parallel,
|
||||
filter: options.filter,
|
||||
cwd: process.cwd(),
|
||||
};
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordIntrospection({
|
||||
filesAnalyzed: [testDir],
|
||||
duration: 0,
|
||||
});
|
||||
}
|
||||
|
||||
console.log('\n Running tests...\n');
|
||||
const startTime = Date.now();
|
||||
|
||||
const result = await runTests(
|
||||
testDir,
|
||||
execOptions,
|
||||
options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
|
||||
);
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordExecution({
|
||||
testsPassed: result.summary.passed,
|
||||
testsFailed: result.summary.failed,
|
||||
testsSkipped: result.summary.skipped,
|
||||
});
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (test.status === 'failed' || test.status === 'error') {
|
||||
tracer.recordTestFailure({
|
||||
scenarioId: test.id,
|
||||
testName: test.name,
|
||||
error: test.error?.message || 'Unknown error',
|
||||
stack: test.error?.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(formatResults(result));
|
||||
|
||||
if (spec) {
|
||||
const mappedResults = mapResultsToScenarios(result, spec);
|
||||
console.log(`\n📊 Scenario Coverage:`);
|
||||
console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
|
||||
console.log(` Unmapped: ${mappedResults.unmapped}`);
|
||||
}
|
||||
|
||||
if (options.output) {
|
||||
const { writeFileSync, mkdirSync } = await import('fs');
|
||||
const { dirname } = await import('path');
|
||||
mkdirSync(dirname(options.output), { recursive: true });
|
||||
writeFileSync(options.output, JSON.stringify(result, null, 2));
|
||||
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||
}
|
||||
|
||||
if (tracer) {
|
||||
const trace = tracer.finalize();
|
||||
const tracePath = await saveTrace(trace);
|
||||
console.log(`\n📊 Trace saved: ${tracePath}`);
|
||||
console.log(` View with: evaluclaude view ${trace.id}`);
|
||||
}
|
||||
|
||||
if (result.summary.failed > 0) {
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running tests:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
function mapResultsToScenarios(
|
||||
result: Awaited<ReturnType<typeof runTests>>,
|
||||
spec: EvalSpec
|
||||
): { covered: number; unmapped: number } {
|
||||
const scenarioIds = new Set(spec.scenarios.map(s => s.id));
|
||||
let covered = 0;
|
||||
let unmapped = 0;
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (scenarioIds.has(test.id)) {
|
||||
covered++;
|
||||
} else {
|
||||
unmapped++;
|
||||
}
|
||||
}
|
||||
|
||||
return { covered, unmapped };
|
||||
}
|
||||
236
src/cli/commands/ui.ts
Normal file
236
src/cli/commands/ui.ts
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
import { Command } from 'commander';
|
||||
import { spawn, type ChildProcess } from 'child_process';
|
||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||
|
||||
// Directory where evaluclaude stores its generated artifacts
// (Promptfoo config, provider scripts, results).
const EVALUCLAUDE_DIR = '.evaluclaude';
// Promptfoo configuration file name, created inside EVALUCLAUDE_DIR.
const CONFIG_FILE = 'promptfooconfig.yaml';
// Subdirectory of EVALUCLAUDE_DIR holding generated provider scripts.
const PROVIDERS_DIR = 'providers';
|
||||
|
||||
export const uiCommand = new Command('ui')
|
||||
.description('Launch the evaluation dashboard UI')
|
||||
.option('-p, --port <port>', 'Port to run the UI on', '3000')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('--generate', 'Regenerate Promptfoo config from spec')
|
||||
.option('--no-open', 'Do not auto-open browser')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const port = parseInt(options.port, 10);
|
||||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
|
||||
if (options.spec && options.generate) {
|
||||
console.log('\n📄 Generating Promptfoo configuration...');
|
||||
|
||||
if (!existsSync(options.spec)) {
|
||||
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
|
||||
console.log(` Config: ${configPath}`);
|
||||
console.log(` Provider: ${providerPath}`);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.log('\n⚠️ No Promptfoo config found.');
|
||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
||||
console.log(' Or create one manually:');
|
||||
console.log(` ${configPath}\n`);
|
||||
|
||||
await createDefaultConfig(configPath, providerPath);
|
||||
console.log(` Created default config at ${configPath}`);
|
||||
}
|
||||
|
||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||
console.log(` Config: ${configPath}\n`);
|
||||
|
||||
await launchPromptfooUI(port, configPath, options.open);
|
||||
} catch (error) {
|
||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const evalCommand = new Command('eval')
|
||||
.description('Run evaluations with Promptfoo and optionally launch UI')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
|
||||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||
.option('--view', 'Launch UI after evaluation', false)
|
||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
|
||||
if (options.spec) {
|
||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.error(`Error: Config not found: ${configPath}`);
|
||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
||||
|
||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||
mkdirSync(dirname(outputFile), { recursive: true });
|
||||
|
||||
await runPromptfooEval(configPath, outputFile);
|
||||
|
||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
||||
|
||||
if (options.view) {
|
||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
async function launchPromptfooUI(
|
||||
port: number,
|
||||
configPath: string,
|
||||
openBrowser: boolean
|
||||
): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = ['promptfoo', 'view', '--port', String(port)];
|
||||
|
||||
if (openBrowser) {
|
||||
args.push('-y');
|
||||
} else {
|
||||
args.push('-n');
|
||||
}
|
||||
|
||||
const configDir = dirname(configPath);
|
||||
args.push(configDir);
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', (error) => {
|
||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
||||
} else {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
process.on('SIGINT', () => {
|
||||
child.kill('SIGINT');
|
||||
process.exit(0);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = [
|
||||
'promptfoo',
|
||||
'eval',
|
||||
'-c', configPath,
|
||||
'-o', outputFile,
|
||||
'--no-cache',
|
||||
];
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', reject);
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||
# Generated by evaluclaude
|
||||
|
||||
description: "Evaluclaude functional test evaluations"
|
||||
|
||||
providers:
|
||||
- id: file://${providerPath}
|
||||
label: functional-tests
|
||||
config:
|
||||
test_dir: ./tests/generated
|
||||
framework: pytest
|
||||
timeout: 300
|
||||
|
||||
prompts:
|
||||
- "{{scenario_id}}"
|
||||
|
||||
tests:
|
||||
- description: "Example test"
|
||||
vars:
|
||||
scenario_id: "test_example"
|
||||
assert:
|
||||
- type: python
|
||||
value: |
|
||||
import json
|
||||
result = json.loads(output)
|
||||
result.get('passed', 0) > 0
|
||||
|
||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||
`;
|
||||
|
||||
mkdirSync(dirname(configPath), { recursive: true });
|
||||
writeFileSync(configPath, defaultConfig);
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
||||
if (spec.repo.languages.includes('python')) {
|
||||
return 'pytest';
|
||||
}
|
||||
return 'vitest';
|
||||
}
|
||||
90
src/cli/commands/view.ts
Normal file
90
src/cli/commands/view.ts
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import { Command } from 'commander';
|
||||
import {
|
||||
loadTrace,
|
||||
listTraces,
|
||||
getLatestTrace,
|
||||
formatTrace,
|
||||
formatTraceList
|
||||
} from '../../observability/index.js';
|
||||
|
||||
export const viewCommand = new Command('view')
|
||||
.description('View evaluation traces')
|
||||
.argument('[trace-id]', 'Specific trace ID to view')
|
||||
.option('--last', 'View the most recent trace')
|
||||
.option('--list', 'List all traces')
|
||||
.option('--json', 'Output as raw JSON')
|
||||
.option('-v, --verbose', 'Show verbose output including tool calls')
|
||||
.option('--tools', 'Show tool call details')
|
||||
.option('--questions', 'Show questions asked', true)
|
||||
.option('--decisions', 'Show decisions made', true)
|
||||
.option('-n, --limit <count>', 'Limit number of traces listed', '20')
|
||||
.option('--eval <eval-id>', 'Filter traces by eval ID')
|
||||
.action(async (traceId: string | undefined, options) => {
|
||||
try {
|
||||
if (options.list) {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
|
||||
if (traces.length > limited.length) {
|
||||
console.log(`Showing ${limited.length} of ${traces.length} traces.`);
|
||||
console.log(`Use --limit to see more.\n`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let trace;
|
||||
|
||||
if (options.last || !traceId) {
|
||||
trace = await getLatestTrace();
|
||||
if (!trace) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
trace = await loadTrace(traceId);
|
||||
if (!trace) {
|
||||
console.error(`\nTrace not found: ${traceId}`);
|
||||
console.log('Use `evaluclaude view --list` to see available traces.\n');
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
const output = formatTrace(trace, {
|
||||
json: options.json,
|
||||
verbose: options.verbose,
|
||||
showToolCalls: options.tools || options.verbose,
|
||||
showQuestions: options.questions,
|
||||
showDecisions: options.decisions,
|
||||
});
|
||||
|
||||
console.log(output);
|
||||
} catch (error) {
|
||||
console.error('Error viewing trace:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const tracesCommand = new Command('traces')
|
||||
.description('List all evaluation traces (alias for view --list)')
|
||||
.option('-n, --limit <count>', 'Limit number of traces', '20')
|
||||
.option('--eval <eval-id>', 'Filter by eval ID')
|
||||
.action(async (options) => {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue