promptfoo ui and testcon

This commit is contained in:
Harivansh Rathi 2026-01-11 18:28:03 -05:00
parent e0c36241b0
commit 6698c12e5b
18 changed files with 2169 additions and 0 deletions

145
src/cli/commands/run.ts Normal file
View file

@ -0,0 +1,145 @@
import { Command } from 'commander';
import { existsSync, readFileSync } from 'fs';
import { join } from 'path';
import {
runTests,
formatResults,
detectTestFramework,
type TestFramework,
type ExecutionOptions,
DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import type { EvalSpec } from '../../analyzer/types.js';
/**
 * `evaluclaude run` — execute generated tests and collect results.
 *
 * Pipeline: validate test dir → detect framework → optionally load an
 * EvalSpec for scenario mapping → run the tests (sandboxed by default) →
 * print results → write optional JSON output → persist an execution trace.
 * Exits with code 1 on any error or when any test fails.
 */
export const runCommand = new Command('run')
  .description('Run generated tests and collect results')
  .argument('[test-dir]', 'Directory containing test files', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
  .option('--sandbox', 'Run tests in sandbox mode', true)
  .option('--no-sandbox', 'Disable sandbox mode')
  .option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
  .option('-p, --parallel', 'Run tests in parallel', false)
  .option('--filter <patterns...>', 'Run only tests matching patterns')
  .option('-o, --output <file>', 'Output results to JSON file')
  .option('--trace', 'Record execution trace', true)
  .option('--no-trace', 'Disable execution tracing')
  .option('-w, --watch', 'Watch mode (rerun on changes)', false)
  .action(async (testDir: string, options) => {
    try {
      console.log(`\n🧪 Running tests from ${testDir}...\n`);
      if (!existsSync(testDir)) {
        console.error(`Error: Test directory not found: ${testDir}`);
        process.exit(1);
      }
      const framework: TestFramework = options.framework || detectTestFramework(testDir);
      console.log(` Framework: ${framework}`);
      console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
      console.log(` Timeout: ${options.timeout}ms`);
      let spec: EvalSpec | undefined;
      if (options.spec) {
        if (existsSync(options.spec)) {
          spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
          console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
        } else {
          // FIX: a missing --spec file was previously ignored silently.
          console.warn(` Warning: spec file not found, skipping result mapping: ${options.spec}`);
        }
      }
      const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
      const execOptions: ExecutionOptions = {
        framework,
        sandbox: options.sandbox,
        timeout: parseInt(options.timeout, 10),
        parallel: options.parallel,
        filter: options.filter,
        cwd: process.cwd(),
      };
      if (tracer) {
        tracer.recordIntrospection({
          filesAnalyzed: [testDir],
          duration: 0,
        });
      }
      console.log('\n Running tests...\n');
      const result = await runTests(
        testDir,
        execOptions,
        options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
      );
      if (tracer) {
        // FIX: record per-test failures BEFORE the summary counters.
        // Tracer.recordTestFailure() increments testsFailed as a side
        // effect, so the old order (summary first, then failures)
        // double-counted failures in the trace. recordExecution()'s
        // shallow merge now overwrites the counters with the runner's
        // authoritative summary afterwards.
        for (const test of result.tests) {
          if (test.status === 'failed' || test.status === 'error') {
            tracer.recordTestFailure({
              scenarioId: test.id,
              testName: test.name,
              error: test.error?.message || 'Unknown error',
              stack: test.error?.stack,
            });
          }
        }
        tracer.recordExecution({
          testsPassed: result.summary.passed,
          testsFailed: result.summary.failed,
          testsSkipped: result.summary.skipped,
        });
      }
      console.log(formatResults(result));
      if (spec) {
        const mappedResults = mapResultsToScenarios(result, spec);
        console.log(`\n📊 Scenario Coverage:`);
        console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
        console.log(` Unmapped: ${mappedResults.unmapped}`);
      }
      if (options.output) {
        const { writeFileSync, mkdirSync } = await import('fs');
        const { dirname } = await import('path');
        mkdirSync(dirname(options.output), { recursive: true });
        writeFileSync(options.output, JSON.stringify(result, null, 2));
        console.log(`\n📁 Results saved to: ${options.output}`);
      }
      if (tracer) {
        const trace = tracer.finalize();
        const tracePath = await saveTrace(trace);
        console.log(`\n📊 Trace saved: ${tracePath}`);
        console.log(` View with: evaluclaude view ${trace.id}`);
      }
      // Propagate test failure as a non-zero exit code for CI.
      if (result.summary.failed > 0) {
        process.exit(1);
      }
    } catch (error) {
      console.error('Error running tests:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
/**
 * Tally how many executed tests correspond to scenarios in the spec.
 * A test "covers" a scenario when its id equals a scenario id; every
 * other test is counted as unmapped.
 */
function mapResultsToScenarios(
  result: Awaited<ReturnType<typeof runTests>>,
  spec: EvalSpec
): { covered: number; unmapped: number } {
  const knownIds = new Set(spec.scenarios.map(s => s.id));
  const covered = result.tests.filter(t => knownIds.has(t.id)).length;
  return { covered, unmapped: result.tests.length - covered };
}

236
src/cli/commands/ui.ts Normal file
View file

@ -0,0 +1,236 @@
import { Command } from 'commander';
import { spawn, type ChildProcess } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
// Layout of generated Promptfoo artifacts: everything lives under
// .evaluclaude/ — the YAML config at its root, provider scripts in a subdir.
const EVALUCLAUDE_DIR = '.evaluclaude';
const CONFIG_FILE = 'promptfooconfig.yaml';
const PROVIDERS_DIR = 'providers';
export const uiCommand = new Command('ui')
.description('Launch the evaluation dashboard UI')
.option('-p, --port <port>', 'Port to run the UI on', '3000')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('--generate', 'Regenerate Promptfoo config from spec')
.option('--no-open', 'Do not auto-open browser')
.action(async (options) => {
try {
const port = parseInt(options.port, 10);
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...');
if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`);
process.exit(1);
}
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`);
console.log(` Provider: ${providerPath}`);
}
if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.');
console.log(' Run with --spec <file> --generate to create one.\n');
console.log(' Or create one manually:');
console.log(` ${configPath}\n`);
await createDefaultConfig(configPath, providerPath);
console.log(` Created default config at ${configPath}`);
}
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
console.log(` Config: ${configPath}\n`);
await launchPromptfooUI(port, configPath, options.open);
} catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
export const evalCommand = new Command('eval')
.description('Run evaluations with Promptfoo and optionally launch UI')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
.option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000')
.action(async (options) => {
try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...');
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
await generateTestProvider(providerPath);
}
if (!existsSync(configPath)) {
console.error(`Error: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec.');
process.exit(1);
}
console.log('\n🧪 Running Promptfoo evaluations...\n');
const outputFile = join(options.output, `eval-${Date.now()}.json`);
mkdirSync(dirname(outputFile), { recursive: true });
await runPromptfooEval(configPath, outputFile);
console.log(`\n📁 Results saved: ${outputFile}`);
if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`);
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
}
} catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
/**
 * Spawn `npx promptfoo view` and settle when the process ends.
 *
 * @param port        Port for the Promptfoo web UI.
 * @param configPath  Path to promptfooconfig.yaml; its directory is passed
 *                    to promptfoo as the working-directory argument.
 * @param openBrowser Pass `-y` (auto-open browser) when true, `-n` otherwise.
 * @throws When the child exits non-zero, fails to spawn, or promptfoo/npx
 *         is not installed.
 */
async function launchPromptfooUI(
  port: number,
  configPath: string,
  openBrowser: boolean
): Promise<void> {
  return new Promise((resolve, reject) => {
    const args = ['promptfoo', 'view', '--port', String(port)];
    if (openBrowser) {
      args.push('-y');
    } else {
      args.push('-n');
    }
    const configDir = dirname(configPath);
    args.push(configDir);
    console.log(` Running: npx ${args.join(' ')}\n`);
    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });
    child.on('error', (error) => {
      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
        console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
        // FIX: a failed spawn emits 'error' but never 'close', so the
        // promise previously never settled on ENOENT and the CLI hung.
        reject(new Error('npx not found on PATH'));
      } else {
        reject(error);
      }
    });
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`Promptfoo exited with code ${code}`));
      }
    });
    // Forward a single Ctrl-C to the child. FIX: `once` instead of `on` —
    // the old code stacked a new SIGINT listener on every invocation.
    process.once('SIGINT', () => {
      child.kill('SIGINT');
      process.exit(0);
    });
  });
}
/**
 * Run `npx promptfoo eval` against the given config, writing JSON results
 * to outputFile. Caching is disabled so every run is fresh. Resolves on
 * exit code 0; rejects on non-zero exit or spawn failure.
 */
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
  const args = ['promptfoo', 'eval', '-c', configPath, '-o', outputFile, '--no-cache'];
  return new Promise((resolve, reject) => {
    console.log(` Running: npx ${args.join(' ')}\n`);
    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });
    child.on('error', reject);
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`Promptfoo eval exited with code ${code}`));
      }
    });
  });
}
/**
 * Write a minimal, valid promptfooconfig.yaml plus the Python test-runner
 * provider so the UI has something to load on first run.
 *
 * FIX: the YAML template is now explicitly indented — the nested keys
 * under `providers[0]` (label/config) and `tests[0]` (vars/assert) must
 * be indented or the file does not parse as the intended structure.
 */
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
  const defaultConfig = `# Evaluclaude Promptfoo Configuration
# Generated by evaluclaude
description: "Evaluclaude functional test evaluations"

providers:
  - id: file://${providerPath}
    label: functional-tests
    config:
      test_dir: ./tests/generated
      framework: pytest
      timeout: 300

prompts:
  - "{{scenario_id}}"

tests:
  - description: "Example test"
    vars:
      scenario_id: "test_example"
    assert:
      - type: python
        value: |
          import json
          result = json.loads(output)
          result.get('passed', 0) > 0

outputPath: .evaluclaude/results/promptfoo-results.json
`;
  mkdirSync(dirname(configPath), { recursive: true });
  writeFileSync(configPath, defaultConfig);
  await generateTestProvider(providerPath);
}
/**
 * Pick a test framework from the spec's repo languages: Python repos get
 * pytest; everything else defaults to vitest.
 */
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
  return spec.repo.languages.includes('python') ? 'pytest' : 'vitest';
}

90
src/cli/commands/view.ts Normal file
View file

@ -0,0 +1,90 @@
import { Command } from 'commander';
import {
loadTrace,
listTraces,
getLatestTrace,
formatTrace,
formatTraceList
} from '../../observability/index.js';
/**
 * `evaluclaude view` — inspect recorded traces.
 * With --list, prints a summary table (capped by --limit); otherwise
 * shows a single trace: the given id, or the latest when no id is
 * supplied or --last is set.
 */
export const viewCommand = new Command('view')
  .description('View evaluation traces')
  .argument('[trace-id]', 'Specific trace ID to view')
  .option('--last', 'View the most recent trace')
  .option('--list', 'List all traces')
  .option('--json', 'Output as raw JSON')
  .option('-v, --verbose', 'Show verbose output including tool calls')
  .option('--tools', 'Show tool call details')
  .option('--questions', 'Show questions asked', true)
  .option('--decisions', 'Show decisions made', true)
  .option('-n, --limit <count>', 'Limit number of traces listed', '20')
  .option('--eval <eval-id>', 'Filter traces by eval ID')
  .action(async (traceId: string | undefined, options) => {
    try {
      // Listing mode: table of recent traces, truncated to --limit.
      if (options.list) {
        const all = await listTraces(options.eval);
        if (all.length === 0) {
          console.log('\nNo traces found.');
          console.log('Run `evaluclaude run` to generate traces.\n');
          return;
        }
        const shown = all.slice(0, parseInt(options.limit, 10));
        console.log(formatTraceList(shown));
        if (all.length > shown.length) {
          console.log(`Showing ${shown.length} of ${all.length} traces.`);
          console.log(`Use --limit to see more.\n`);
        }
        return;
      }
      // Single-trace mode: explicit id, or fall back to the latest.
      const useLatest = options.last || !traceId;
      const trace = useLatest ? await getLatestTrace() : await loadTrace(traceId!);
      if (!trace) {
        if (useLatest) {
          console.log('\nNo traces found.');
          console.log('Run `evaluclaude run` to generate traces.\n');
          return;
        }
        console.error(`\nTrace not found: ${traceId}`);
        console.log('Use `evaluclaude view --list` to see available traces.\n');
        process.exit(1);
      }
      console.log(formatTrace(trace, {
        json: options.json,
        verbose: options.verbose,
        showToolCalls: options.tools || options.verbose,
        showQuestions: options.questions,
        showDecisions: options.decisions,
      }));
    } catch (error) {
      console.error('Error viewing trace:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
/**
 * `evaluclaude traces` — convenience alias for `view --list`.
 *
 * FIX: wrapped in try/catch like every other command in this CLI, so a
 * failing trace store prints a clear message and exits 1 instead of
 * surfacing as an unhandled promise rejection.
 */
export const tracesCommand = new Command('traces')
  .description('List all evaluation traces (alias for view --list)')
  .option('-n, --limit <count>', 'Limit number of traces', '20')
  .option('--eval <eval-id>', 'Filter by eval ID')
  .action(async (options) => {
    try {
      const traces = await listTraces(options.eval);
      const limited = traces.slice(0, parseInt(options.limit, 10));
      if (traces.length === 0) {
        console.log('\nNo traces found.');
        return;
      }
      console.log(formatTraceList(limited));
    } catch (error) {
      console.error('Error listing traces:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });

View file

@ -5,6 +5,9 @@ import { introCommand } from './commands/intro.js';
import { analyzeCommand } from './commands/analyze.js';
import { renderCommand } from './commands/render.js';
import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/grade.js';
import { runCommand } from './commands/run.js';
import { viewCommand, tracesCommand } from './commands/view.js';
import { uiCommand, evalCommand } from './commands/ui.js';
const program = new Command();
@ -19,5 +22,10 @@ program.addCommand(renderCommand);
program.addCommand(gradeCommand);
program.addCommand(listRubricsCommand);
program.addCommand(calibrateCommand);
program.addCommand(runCommand);
program.addCommand(viewCommand);
program.addCommand(tracesCommand);
program.addCommand(uiCommand);
program.addCommand(evalCommand);
program.parse(process.argv);

View file

@ -2,3 +2,6 @@ export * from './introspector/index.js';
export * from './analyzer/index.js';
export * from './renderers/index.js';
export * from './graders/index.js';
export * from './runners/index.js';
export * from './observability/index.js';
export * from './promptfoo/index.js';

View file

@ -0,0 +1,15 @@
// Public surface of the observability module: trace data types, the
// Tracer recorder, the filesystem-backed trace store (plus its
// module-level convenience helpers), and text formatting used by the
// CLI `view`/`traces` commands.
export * from './types.js';
export { Tracer, createTracer } from './tracer.js';
export {
  TraceStore,
  traceStore,
  saveTrace,
  loadTrace,
  listTraces,
  getLatestTrace
} from './trace-store.js';
export {
  formatTrace,
  formatTraceList,
  type ViewOptions
} from './trace-viewer.js';

View file

@ -0,0 +1,117 @@
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
import { existsSync } from 'fs';
import { join } from 'path';
import type { EvalTrace, TraceListItem } from './types.js';
const DEFAULT_TRACES_DIR = '.evaluclaude/traces';
/**
 * Filesystem-backed store for evaluation traces.
 * Each trace is persisted as `<tracesDir>/<trace.id>.json`.
 */
export class TraceStore {
  private tracesDir: string;

  constructor(tracesDir: string = DEFAULT_TRACES_DIR) {
    this.tracesDir = tracesDir;
  }

  /** Persist a trace, creating the directory if needed; returns the file path. */
  async save(trace: EvalTrace): Promise<string> {
    await mkdir(this.tracesDir, { recursive: true });
    const filePath = join(this.tracesDir, `${trace.id}.json`);
    await writeFile(filePath, JSON.stringify(trace, null, 2));
    return filePath;
  }

  /** Load a trace by id, or null when no such file exists. */
  async load(traceId: string): Promise<EvalTrace | null> {
    const filePath = join(this.tracesDir, `${traceId}.json`);
    if (!existsSync(filePath)) {
      return null;
    }
    const content = await readFile(filePath, 'utf-8');
    return JSON.parse(content) as EvalTrace;
  }

  /**
   * List trace summaries, newest first, optionally filtered by evalId.
   *
   * IMPROVEMENT: files are now read in parallel (the original awaited
   * each read sequentially inside the loop). Corrupt or unreadable
   * trace files are skipped silently, preserving the original
   * best-effort behavior.
   */
  async list(evalId?: string): Promise<TraceListItem[]> {
    if (!existsSync(this.tracesDir)) {
      return [];
    }
    const files = await readdir(this.tracesDir);
    const parsed = await Promise.all(
      files
        .filter(f => f.endsWith('.json'))
        .map(async (file): Promise<TraceListItem | null> => {
          try {
            const content = await readFile(join(this.tracesDir, file), 'utf-8');
            const trace = JSON.parse(content) as EvalTrace;
            if (evalId && trace.evalId !== evalId) {
              return null;
            }
            return {
              id: trace.id,
              evalId: trace.evalId,
              startedAt: trace.startedAt,
              status: trace.status,
              duration: trace.duration,
              testsPassed: trace.execution.testsPassed,
              testsFailed: trace.execution.testsFailed,
            };
          } catch {
            // Corrupt/partial trace file: skip rather than fail the listing.
            return null;
          }
        })
    );
    return parsed
      .filter((t): t is TraceListItem => t !== null)
      .sort((a, b) => new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime());
  }

  /** Most recent trace by startedAt, or null when the store is empty. */
  async getLatest(): Promise<EvalTrace | null> {
    const traces = await this.list();
    if (traces.length === 0) {
      return null;
    }
    return this.load(traces[0].id);
  }

  /** Delete one trace file; returns false when it did not exist. */
  async delete(traceId: string): Promise<boolean> {
    const filePath = join(this.tracesDir, `${traceId}.json`);
    if (!existsSync(filePath)) {
      return false;
    }
    const { unlink } = await import('fs/promises');
    await unlink(filePath);
    return true;
  }

  /** Keep only the newest `keepCount` traces; returns how many were deleted. */
  async cleanup(keepCount: number = 50): Promise<number> {
    const traces = await this.list();
    let deleted = 0;
    for (const trace of traces.slice(keepCount)) {
      if (await this.delete(trace.id)) {
        deleted++;
      }
    }
    return deleted;
  }
}
/** Shared default store instance used by the module-level helpers below. */
export const traceStore = new TraceStore();

/** Persist a trace via the shared store; resolves to the written file path. */
export async function saveTrace(trace: EvalTrace): Promise<string> {
  return traceStore.save(trace);
}

/** Load a trace by id via the shared store (null when absent). */
export async function loadTrace(traceId: string): Promise<EvalTrace | null> {
  return traceStore.load(traceId);
}

/** List trace summaries via the shared store, optionally filtered by eval id. */
export async function listTraces(evalId?: string): Promise<TraceListItem[]> {
  return traceStore.list(evalId);
}

/** Fetch the most recent trace via the shared store. */
export async function getLatestTrace(): Promise<EvalTrace | null> {
  return traceStore.getLatest();
}

View file

@ -0,0 +1,226 @@
import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js';
/** Rendering flags for formatTrace(). */
export interface ViewOptions {
  /** Emit the raw trace as pretty-printed JSON instead of the text report. */
  json: boolean;
  /** Verbose mode; formatTrace passes this to tool-call rendering (adds inputs). */
  verbose: boolean;
  /** Include the tool-call section. */
  showToolCalls: boolean;
  /** Include the questions-asked section (on by default). */
  showQuestions: boolean;
  /** Include the decisions section (on by default). */
  showDecisions: boolean;
}

// Defaults merged under caller-provided partial options in formatTrace().
const DEFAULT_VIEW_OPTIONS: ViewOptions = {
  json: false,
  verbose: false,
  showToolCalls: false,
  showQuestions: true,
  showDecisions: true,
};
/**
 * Render an EvalTrace either as pretty-printed JSON (opts.json) or as a
 * sectioned, human-readable text report for the terminal.
 *
 * Fixed sections: header, introspection, analysis, generation, execution.
 * Optional sections (see ViewOptions): questions asked, key decisions
 * (first 10), tool calls (first 20). Test failures and recorded errors
 * are appended whenever present.
 *
 * @param trace   The trace to render.
 * @param options Partial view flags; merged over DEFAULT_VIEW_OPTIONS.
 * @returns The report as a single newline-joined string.
 */
export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}): string {
  const opts = { ...DEFAULT_VIEW_OPTIONS, ...options };
  // JSON mode short-circuits all text formatting.
  if (opts.json) {
    return JSON.stringify(trace, null, 2);
  }
  const lines: string[] = [];
  // Header: trace id between double-rule separators, then run metadata.
  lines.push('');
  lines.push('═'.repeat(60));
  lines.push(`📊 Trace: ${trace.id}`);
  lines.push('═'.repeat(60));
  lines.push('');
  lines.push(` Status: ${formatStatus(trace.status)}`);
  lines.push(` Started: ${formatDate(trace.startedAt)}`);
  lines.push(` Duration: ${formatDuration(trace.duration)}`);
  lines.push(` Eval ID: ${trace.evalId}`);
  lines.push('');
  // Introspection summary.
  lines.push('📂 Introspection');
  lines.push('─'.repeat(40));
  lines.push(` Files analyzed: ${trace.introspection.filesAnalyzed.length}`);
  lines.push(` Functions found: ${trace.introspection.totalFunctions}`);
  lines.push(` Classes found: ${trace.introspection.totalClasses}`);
  lines.push(` Duration: ${formatDuration(trace.introspection.duration)}`);
  lines.push('');
  // Analysis summary: counts plus token usage.
  lines.push('🤖 Analysis');
  lines.push('─'.repeat(40));
  lines.push(` Tool calls: ${trace.analysis.toolCalls.length}`);
  lines.push(` Questions asked: ${trace.analysis.questionsAsked.length}`);
  lines.push(` Decisions made: ${trace.analysis.decisions.length}`);
  lines.push(` Prompt tokens: ${trace.analysis.promptTokens.toLocaleString()}`);
  lines.push(` Completion tokens: ${trace.analysis.completionTokens.toLocaleString()}`);
  lines.push('');
  // Generation summary.
  lines.push('📝 Generation');
  lines.push('─'.repeat(40));
  lines.push(` Scenarios: ${trace.generation.scenariosGenerated}`);
  lines.push(` Files written: ${trace.generation.filesWritten.length}`);
  lines.push('');
  // Execution summary.
  lines.push('🧪 Execution');
  lines.push('─'.repeat(40));
  lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
  lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
  lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`);
  lines.push('');
  // Optional: every question asked during analysis.
  if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
    lines.push('❓ Questions Asked');
    lines.push('─'.repeat(40));
    for (const q of trace.analysis.questionsAsked) {
      lines.push(formatQuestion(q));
    }
    lines.push('');
  }
  // Optional: decisions, capped at the first 10.
  if (opts.showDecisions && trace.analysis.decisions.length > 0) {
    lines.push('🎯 Key Decisions');
    lines.push('─'.repeat(40));
    for (const d of trace.analysis.decisions.slice(0, 10)) {
      lines.push(formatDecision(d));
    }
    if (trace.analysis.decisions.length > 10) {
      lines.push(` ... and ${trace.analysis.decisions.length - 10} more`);
    }
    lines.push('');
  }
  // Optional: tool calls, capped at the first 20; verbose adds inputs.
  if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) {
    lines.push('🔧 Tool Calls');
    lines.push('─'.repeat(40));
    for (const tc of trace.analysis.toolCalls.slice(0, 20)) {
      lines.push(formatToolCall(tc, opts.verbose));
    }
    if (trace.analysis.toolCalls.length > 20) {
      lines.push(` ... and ${trace.analysis.toolCalls.length - 20} more`);
    }
    lines.push('');
  }
  // Always shown when present: per-test failure details.
  if (trace.execution.failures.length > 0) {
    lines.push('❌ Test Failures');
    lines.push('─'.repeat(40));
    for (const f of trace.execution.failures) {
      lines.push(formatFailure(f));
    }
    lines.push('');
  }
  // Always shown when present: errors recorded during the run.
  if (trace.errors.length > 0) {
    lines.push('⚠️ Errors');
    lines.push('─'.repeat(40));
    for (const e of trace.errors) {
      lines.push(` [${formatDate(e.timestamp)}]`);
      lines.push(` ${e.message}`);
      if (e.context) {
        lines.push(` Context: ${e.context}`);
      }
      lines.push('');
    }
  }
  lines.push('═'.repeat(60));
  lines.push('');
  return lines.join('\n');
}
/** Human-readable status label with an icon; unknown values pass through. */
function formatStatus(status: EvalTrace['status']): string {
  const labels: Record<string, string> = {
    success: '✅ Success',
    partial: '⚠️ Partial',
    failed: '❌ Failed',
  };
  return labels[status] ?? status;
}
/** Render an ISO-8601 timestamp in the environment's locale format. */
const formatDate = (iso: string): string => new Date(iso).toLocaleString();
/**
 * Render a millisecond duration as `500ms`, `12.3s`, or `2m 5s`.
 *
 * FIX: the seconds component is carried into the minutes when it rounds
 * up to 60 — the original printed e.g. `1m 60s` for 119999 ms because it
 * rounded the remainder independently of the minute count.
 */
function formatDuration(ms: number): string {
  if (ms < 1000) {
    return `${ms}ms`;
  }
  if (ms < 60000) {
    return `${(ms / 1000).toFixed(1)}s`;
  }
  let minutes = Math.floor(ms / 60000);
  let seconds = Math.round((ms % 60000) / 1000);
  if (seconds === 60) {
    minutes += 1;
    seconds = 0;
  }
  return `${minutes}m ${seconds}s`;
}
/** Two-line Q/A rendering of a recorded question, ending with a blank line. */
function formatQuestion(q: Question): string {
  const answerLine = q.answer ? ` A: ${q.answer}` : ` A: (no answer)`;
  return [` Q: ${q.question}`, answerLine, ''].join('\n');
}
/** One decision entry: icon + type + subject, then reason and confidence. */
function formatDecision(d: Decision): string {
  let icon: string;
  if (d.type === 'include') {
    icon = '✓';
  } else if (d.type === 'exclude') {
    icon = '✗';
  } else {
    icon = '→';
  }
  const pct = (d.confidence * 100).toFixed(0);
  return [
    ` ${icon} [${d.type}] ${d.subject}`,
    ` Reason: ${d.reasoning}`,
    ` Confidence: ${pct}%`,
    '',
  ].join('\n');
}
/** One tool-call line; verbose adds a truncated JSON dump of the input. */
function formatToolCall(tc: ToolCall, verbose: boolean): string {
  const elapsed = formatDuration(tc.duration);
  return verbose
    ? ` [${tc.tool}] (${elapsed})\n Input: ${JSON.stringify(tc.input).slice(0, 100)}...\n`
    : ` ${tc.tool} (${elapsed})`;
}
/**
 * Multi-line rendering of one test failure: name, scenario, error, and —
 * only when BOTH are present — the expected/actual pair.
 */
function formatFailure(f: TestFailure): string {
  const parts = [
    `${f.testName}`,
    ` Scenario: ${f.scenarioId}`,
    ` Error: ${f.error}`,
  ];
  if (f.expected !== undefined && f.actual !== undefined) {
    parts.push(` Expected: ${JSON.stringify(f.expected)}`);
    parts.push(` Actual: ${JSON.stringify(f.actual)}`);
  }
  parts.push('');
  return parts.join('\n');
}
/**
 * Tabular listing of trace summaries: a fixed header, then one row per
 * trace with a status icon, padded pass/fail counts, and duration.
 */
export function formatTraceList(traces: Array<{
  id: string;
  startedAt: string;
  status: string;
  duration: number;
  testsPassed: number;
  testsFailed: number;
}>): string {
  const header = [
    '',
    '📋 Recent Traces',
    '═'.repeat(80),
    '',
    'ID Status Passed Failed Duration',
    '─'.repeat(80),
  ];
  const rows = traces.map(t => {
    const statusIcon = t.status === 'success' ? '✅' : t.status === 'partial' ? '⚠️ ' : '❌';
    const cells = [
      t.id.slice(0, 36),
      statusIcon,
      String(t.testsPassed).padStart(6),
      String(t.testsFailed).padStart(6),
      formatDuration(t.duration).padStart(8),
    ];
    return cells.join(' ');
  });
  return [...header, ...rows, ''].join('\n');
}

168
src/observability/tracer.ts Normal file
View file

@ -0,0 +1,168 @@
import { randomUUID } from 'crypto';
import type {
EvalTrace,
ToolCall,
Question,
Decision,
TraceError,
TestFailure,
IntrospectionTrace,
GenerationTrace,
ExecutionTrace,
} from './types.js';
/**
 * Mutable recorder for a single evaluation run. Construct one per eval,
 * call the record* methods as phases progress, then finalize() to stamp
 * completion time/duration and obtain the finished EvalTrace.
 */
export class Tracer {
  // The trace under construction; mutated in place by every record* method.
  private trace: EvalTrace;
  // The tool call opened by recordToolStart() and not yet closed.
  private currentToolCall?: { name: string; input: unknown; startTime: number };
  // Wall-clock start, used by finalize() to compute total duration.
  private startTime: number;

  constructor(evalId: string) {
    this.startTime = Date.now();
    // Start from an all-empty trace with an optimistic 'success' status;
    // recordError()/finalize() downgrade it as problems appear.
    this.trace = {
      id: randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '',
      duration: 0,
      status: 'success',
      introspection: {
        filesAnalyzed: [],
        totalFunctions: 0,
        totalClasses: 0,
        duration: 0,
      },
      analysis: {
        promptTokens: 0,
        completionTokens: 0,
        toolCalls: [],
        questionsAsked: [],
        decisions: [],
      },
      generation: {
        scenariosGenerated: 0,
        filesWritten: [],
      },
      execution: {
        testsPassed: 0,
        testsFailed: 0,
        testsSkipped: 0,
        failures: [],
      },
      errors: [],
    };
  }

  /** Unique id (UUID) of the trace being recorded. */
  get traceId(): string {
    return this.trace.id;
  }

  /** Mark the start of a tool call; pairs with recordToolEnd(). */
  recordToolStart(name: string, input: unknown): void {
    this.currentToolCall = { name, input, startTime: Date.now() };
  }

  /**
   * Close the pending tool call and append it to the trace.
   * NOTE: only one call can be pending at a time — if `name` does not
   * match the pending call (or none is pending) the end is dropped
   * silently, so nested/overlapping tool calls are not recorded.
   */
  recordToolEnd(name: string, output: unknown): void {
    if (this.currentToolCall?.name === name) {
      const toolCall: ToolCall = {
        timestamp: new Date().toISOString(),
        tool: name,
        input: this.currentToolCall.input,
        output,
        duration: Date.now() - this.currentToolCall.startTime,
      };
      this.trace.analysis.toolCalls.push(toolCall);
      this.currentToolCall = undefined;
    }
  }

  /** Append a question, overwriting its timestamp with the current time. */
  recordQuestion(question: Question): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      timestamp: new Date().toISOString(),
    });
  }

  /** Attach an answer to a previously recorded question (no-op for unknown ids). */
  recordAnswer(questionId: string, answer: string): void {
    const question = this.trace.analysis.questionsAsked.find(q => q.id === questionId);
    if (question) {
      question.answer = answer;
    }
  }

  /** Append a decision; confidence is clamped into [0, 1]. */
  recordDecision(
    type: Decision['type'],
    subject: string,
    reasoning: string,
    confidence: number
  ): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence: Math.max(0, Math.min(1, confidence)),
    });
  }

  /** Shallow-merge fields into the introspection section. */
  recordIntrospection(data: Partial<IntrospectionTrace>): void {
    Object.assign(this.trace.introspection, data);
  }

  /** Shallow-merge fields into the generation section. */
  recordGeneration(data: Partial<GenerationTrace>): void {
    Object.assign(this.trace.generation, data);
  }

  /**
   * Shallow-merge fields into the execution section. NOTE: this
   * overwrites any counters previously bumped by recordTestFailure()
   * or recordTestPass() — callers must mind the ordering.
   */
  recordExecution(data: Partial<ExecutionTrace>): void {
    Object.assign(this.trace.execution, data);
  }

  /** Append a failure detail AND increment the failed-test counter. */
  recordTestFailure(failure: TestFailure): void {
    this.trace.execution.failures.push(failure);
    this.trace.execution.testsFailed++;
  }

  /** Increment the passed-test counter. */
  recordTestPass(): void {
    this.trace.execution.testsPassed++;
  }

  /** Accumulate token usage across multiple model calls. */
  recordTokenUsage(promptTokens: number, completionTokens: number): void {
    this.trace.analysis.promptTokens += promptTokens;
    this.trace.analysis.completionTokens += completionTokens;
  }

  /** Append an error; downgrades status from 'success' to 'partial'. */
  recordError(error: Error, context?: string): void {
    const traceError: TraceError = {
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    };
    this.trace.errors.push(traceError);
    if (this.trace.status === 'success') {
      this.trace.status = 'partial';
    }
  }

  /** Force the overall status, overriding the automatic downgrades. */
  setStatus(status: EvalTrace['status']): void {
    this.trace.status = status;
  }

  /**
   * Stamp completion time and total duration and return the trace.
   * Status becomes 'failed' when errors were recorded and no test
   * passed. Returns the live object, not a copy.
   */
  finalize(): EvalTrace {
    this.trace.completedAt = new Date().toISOString();
    this.trace.duration = Date.now() - this.startTime;
    if (this.trace.errors.length > 0 && this.trace.execution.testsPassed === 0) {
      this.trace.status = 'failed';
    }
    return this.trace;
  }

  /** Snapshot of the trace. NOTE: shallow copy — nested sections are shared. */
  getTrace(): EvalTrace {
    return { ...this.trace };
  }
}

/** Convenience factory for a new Tracer bound to `evalId`. */
export function createTracer(evalId: string): Tracer {
  return new Tracer(evalId);
}

100
src/observability/types.ts Normal file
View file

@ -0,0 +1,100 @@
/**
 * Complete record of one evaluation run, built incrementally by Tracer
 * and persisted as JSON by TraceStore.
 */
export interface EvalTrace {
  /** Unique id of this trace (UUID). */
  id: string;
  /** Id of the evaluation this trace belongs to. */
  evalId: string;
  /** ISO-8601 start timestamp. */
  startedAt: string;
  /** ISO-8601 completion timestamp; empty string until finalized. */
  completedAt: string;
  /** Total wall-clock duration in milliseconds. */
  duration: number;
  /** 'success' with no errors; 'partial' after recorded errors; 'failed' when errors exist and nothing passed. */
  status: 'success' | 'partial' | 'failed';
  introspection: IntrospectionTrace;
  analysis: AnalysisTrace;
  generation: GenerationTrace;
  execution: ExecutionTrace;
  /** Errors recorded during the run. */
  errors: TraceError[];
}

/** Summary of the code-introspection phase. */
export interface IntrospectionTrace {
  filesAnalyzed: string[];
  totalFunctions: number;
  totalClasses: number;
  /** Phase duration in milliseconds. */
  duration: number;
}

/** Summary of the analysis phase: token usage plus recorded events. */
export interface AnalysisTrace {
  promptTokens: number;
  completionTokens: number;
  toolCalls: ToolCall[];
  questionsAsked: Question[];
  decisions: Decision[];
}

/** Summary of the test-generation phase. */
export interface GenerationTrace {
  scenariosGenerated: number;
  filesWritten: string[];
}

/** Summary of the test-execution phase. */
export interface ExecutionTrace {
  testsPassed: number;
  testsFailed: number;
  testsSkipped: number;
  /** Per-test failure details. */
  failures: TestFailure[];
}

/** One completed tool invocation recorded during analysis. */
export interface ToolCall {
  /** ISO-8601 time the call was recorded (at tool end). */
  timestamp: string;
  tool: string;
  input: unknown;
  output: unknown;
  /** Milliseconds between tool start and end. */
  duration: number;
}

/** A question raised during analysis, optionally answered later. */
export interface Question {
  id: string;
  timestamp: string;
  question: string;
  options?: string[];
  answer?: string;
  defaultAnswer?: string;
}

/** A recorded analysis decision with its rationale. */
export interface Decision {
  timestamp: string;
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  subject: string;
  reasoning: string;
  /** Clamped to [0, 1] by Tracer.recordDecision(). */
  confidence: number;
}

/** Details of one failed or errored test. */
export interface TestFailure {
  scenarioId: string;
  testName: string;
  error: string;
  stack?: string;
  expected?: unknown;
  actual?: unknown;
}

/** A non-fatal error captured during the run. */
export interface TraceError {
  timestamp: string;
  message: string;
  stack?: string;
  context?: string;
}

/** Generic timeline event. NOTE(review): no producer is visible in this module — confirm usage. */
export interface TraceEvent {
  timestamp: string;
  type: 'tool_start' | 'tool_end' | 'question' | 'decision' | 'error' | 'info';
  data: unknown;
}

/** Lightweight row for trace listings (produced by TraceStore.list()). */
export interface TraceListItem {
  id: string;
  evalId: string;
  startedAt: string;
  status: EvalTrace['status'];
  duration: number;
  testsPassed: number;
  testsFailed: number;
}

View file

@ -0,0 +1,271 @@
import { writeFile, mkdir } from 'fs/promises';
import { dirname, join } from 'path';
import * as yaml from 'js-yaml';
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
/** Options controlling how an EvalSpec is rendered to promptfooconfig.yaml. */
export interface ConfigOptions {
  /** Directory containing the generated test files (passed to the provider). */
  testDir: string;
  /** Where to write the YAML config. */
  outputPath: string;
  /** Test framework the provider should invoke. */
  framework: 'pytest' | 'vitest' | 'jest';
  /** When true, adds defaultTest metadata linking results to trace files. */
  includeTraceLinks: boolean;
}
/**
 * Render an EvalSpec into a promptfooconfig YAML file.
 * Creates the output directory if needed and returns the YAML text.
 */
export async function generatePromptfooConfig(
  spec: EvalSpec,
  options: ConfigOptions
): Promise<string> {
  const rendered = yaml.dump(buildConfig(spec, options), {
    lineWidth: 120,
    quotingType: '"',
  });
  await mkdir(dirname(options.outputPath), { recursive: true });
  await writeFile(options.outputPath, rendered);
  return rendered;
}
/**
 * Assemble the full Promptfoo config object: one python test-runner
 * provider, one test per scenario, and optional trace-link metadata.
 * The provider id is relative to the config file's directory.
 */
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
  const provider = {
    id: `file://providers/test-runner.py`,
    label: 'functional-tests',
    config: {
      test_dir: options.testDir,
      framework: options.framework,
      timeout: 300,
    },
  };
  const defaultTest = options.includeTraceLinks
    ? { metadata: { traceFile: '.evaluclaude/traces/{{evalId}}.json' } }
    : undefined;
  return {
    description: `Evaluclaude functional tests for ${spec.repo.name}`,
    providers: [provider],
    prompts: ['{{scenario_id}}'],
    tests: spec.scenarios.map(scenario => buildTest(scenario, options)),
    defaultTest,
    outputPath: '.evaluclaude/results/promptfoo-results.json',
  };
}
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
const assertions = scenario.assertions
.filter(a => a.type !== 'llm-rubric')
.map(a => buildAssertion(a));
const llmRubrics = scenario.assertions
.filter(a => a.type === 'llm-rubric')
.map(a => ({
type: 'llm-rubric' as const,
value: (a as any).rubric,
threshold: (a as any).passingThreshold ?? 0.7,
}));
return {
description: scenario.description,
vars: {
scenario_id: scenario.id,
target_module: scenario.target.module,
target_function: scenario.target.function,
input_args: scenario.input.args,
input_kwargs: scenario.input.kwargs,
},
assert: [...assertions, ...llmRubrics],
metadata: {
category: scenario.category,
priority: scenario.priority,
tags: scenario.tags,
},
};
}
/**
 * Translate one EvalSpec assertion into its Promptfoo equivalent.
 * Value-comparison types map directly; behavioral types (typeof, throws,
 * truthy, falsy, custom) become inline python expressions evaluated
 * against the provider output. Unknown types degrade to an always-true
 * python check rather than failing the run.
 */
function buildAssertion(assertion: any): PromptfooAssertion {
  const kind: string = assertion.type;
  if (kind === 'equals') {
    return { type: 'equals', value: assertion.expected };
  }
  if (kind === 'contains') {
    return { type: 'contains', value: assertion.value };
  }
  if (kind === 'matches') {
    return { type: 'regex', value: assertion.pattern };
  }
  if (kind === 'typeof') {
    return { type: 'python', value: `type(output).__name__ == '${assertion.expected}'` };
  }
  if (kind === 'throws') {
    return { type: 'python', value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))` };
  }
  if (kind === 'truthy') {
    return { type: 'python', value: 'bool(output)' };
  }
  if (kind === 'falsy') {
    return { type: 'python', value: 'not bool(output)' };
  }
  if (kind === 'custom') {
    return { type: 'python', value: assertion.check };
  }
  // Fallback: a no-op assertion that always passes.
  return { type: 'python', value: 'True' };
}
/**
 * Writes a standalone Python promptfoo provider script to `outputPath`,
 * creating parent directories as needed.
 *
 * The generated script exposes `get_provider_response(prompt, options, context)`:
 * it treats the prompt as a scenario id, runs pytest (via pytest-json-report)
 * or vitest/jest filtered to that id, and returns the counts/raw output as a
 * JSON string in the provider's `output` field. Timeouts and unexpected
 * exceptions are reported inside the payload rather than thrown.
 *
 * NOTE(review): the embedded Python below appears to have lost its leading
 * indentation (significant whitespace) — verify the literal against the
 * original source before shipping, or the generated script will not parse.
 */
export async function generateTestProvider(outputPath: string): Promise<void> {
// The entire provider is emitted verbatim from this template literal.
const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results."""
import subprocess
import json
import sys
import os
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
"""Runs tests and returns structured results."""
test_dir = options.get('config', {}).get('test_dir', './tests')
framework = options.get('config', {}).get('framework', 'pytest')
timeout = options.get('config', {}).get('timeout', 300)
scenario_id = prompt.strip()
try:
if framework == 'pytest':
result = subprocess.run(
[
'python', '-m', 'pytest',
'--json-report',
'--json-report-file=/tmp/pytest_results.json',
'-k', scenario_id,
test_dir
],
capture_output=True,
text=True,
timeout=timeout
)
try:
with open('/tmp/pytest_results.json') as f:
report = json.load(f)
output = {
'passed': report.get('summary', {}).get('passed', 0),
'failed': report.get('summary', {}).get('failed', 0),
'skipped': report.get('summary', {}).get('skipped', 0),
'tests': report.get('tests', []),
'stdout': result.stdout,
'stderr': result.stderr,
'exit_code': result.returncode,
}
except FileNotFoundError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to generate pytest report',
'stdout': result.stdout,
'stderr': result.stderr,
}
elif framework in ('vitest', 'jest'):
cmd = ['npx', framework, 'run', '--reporter=json']
if scenario_id:
cmd.extend(['--testNamePattern', scenario_id])
cmd.append(test_dir)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout
)
try:
report = json.loads(result.stdout)
output = {
'passed': report.get('numPassedTests', 0),
'failed': report.get('numFailedTests', 0),
'skipped': report.get('numSkippedTests', 0),
'tests': report.get('testResults', []),
'exit_code': result.returncode,
}
except json.JSONDecodeError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to parse test output',
'stdout': result.stdout,
'stderr': result.stderr,
}
else:
output = {'error': f'Unknown framework: {framework}'}
return {
'output': json.dumps(output),
'error': None,
}
except subprocess.TimeoutExpired:
return {
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
'error': None,
}
except Exception as e:
return {
'output': None,
'error': str(e),
}
if __name__ == '__main__':
# For testing the provider directly
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--scenario', default='')
parser.add_argument('--test-dir', default='./tests')
parser.add_argument('--framework', default='pytest')
args = parser.parse_args()
result = get_provider_response(
args.scenario,
{'config': {'test_dir': args.test_dir, 'framework': args.framework}},
{}
)
print(json.dumps(result, indent=2))
`;
// Ensure the destination directory exists before writing the script.
await mkdir(dirname(outputPath), { recursive: true });
await writeFile(outputPath, providerCode);
}

2
src/promptfoo/index.ts Normal file
View file

@ -0,0 +1,2 @@
export * from './types.js';
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js';

89
src/promptfoo/types.ts Normal file
View file

@ -0,0 +1,89 @@
/** Root promptfoo configuration document generated for an eval run. */
export interface PromptfooConfig {
description?: string;
providers: PromptfooProvider[];
prompts: string[];
tests: PromptfooTest[];
// Defaults merged into every test (e.g. shared metadata such as trace links).
defaultTest?: PromptfooDefaultTest;
// Where promptfoo writes its results JSON.
outputPath?: string;
}
/** One provider entry; `id` may be a file:// path to a custom script. */
export interface PromptfooProvider {
id: string;
label?: string;
// Provider-specific settings (e.g. test_dir/framework/timeout for the test runner).
config?: Record<string, unknown>;
}
/** One test case: template variables plus the assertions run on the output. */
export interface PromptfooTest {
description?: string;
vars?: Record<string, unknown>;
assert?: PromptfooAssertion[];
options?: Record<string, unknown>;
metadata?: Record<string, unknown>;
}
/** A single assertion (equals/contains/regex/python/llm-rubric/...). */
export interface PromptfooAssertion {
type: string;
value?: unknown;
// Minimum score for graded assertions such as llm-rubric.
threshold?: number;
weight?: number;
provider?: string;
}
/** Test fields applied to every generated test unless overridden. */
export interface PromptfooDefaultTest {
assert?: PromptfooAssertion[];
options?: Record<string, unknown>;
metadata?: Record<string, unknown>;
}
/** Top-level results document produced by a promptfoo run. */
export interface PromptfooResult {
version: number;
timestamp: string;
results: PromptfooTestResult[];
stats: {
successes: number;
failures: number;
tokenUsage: {
total: number;
prompt: number;
completion: number;
};
};
}
/** Outcome of a single test case within a promptfoo run. */
export interface PromptfooTestResult {
prompt: {
raw: string;
label: string;
};
vars: Record<string, unknown>;
response: {
output: string;
tokenUsage?: {
total: number;
prompt: number;
completion: number;
};
};
// Aggregated grading outcome; componentResults breaks it down per assertion.
gradingResult: {
pass: boolean;
score: number;
reason?: string;
componentResults?: Array<{
pass: boolean;
score: number;
reason: string;
assertion: PromptfooAssertion;
}>;
};
success: boolean;
error?: string;
}
/** High-level settings for driving an eval (paths, framework, UI, watch). */
export interface EvalConfig {
specPath: string;
testDir: string;
outputDir: string;
framework: 'pytest' | 'vitest' | 'jest';
uiPort: number;
watch: boolean;
}

101
src/runners/index.ts Normal file
View file

@ -0,0 +1,101 @@
import { existsSync, readdirSync, readFileSync } from 'fs';
import { join } from 'path';
import type { Runner, TestFramework, RunnerConfig, ExecutionResult, ExecutionOptions, SandboxConfig } from './types.js';
import { DEFAULT_SANDBOX_CONFIG } from './types.js';
import { PytestRunner } from './pytest-runner.js';
import { VitestRunner, JestRunner } from './vitest-runner.js';
export * from './types.js';
export { PytestRunner } from './pytest-runner.js';
export { VitestRunner, JestRunner } from './vitest-runner.js';
export { sandboxedExec } from './sandbox.js';
// Framework name -> Runner implementation. createRunner() resolves through
// this table; add new frameworks here.
const runnerRegistry: Record<TestFramework, new () => Runner> = {
pytest: PytestRunner,
vitest: VitestRunner,
jest: JestRunner,
};
/**
 * Instantiates the Runner registered for `framework`.
 * @throws Error when no runner is registered under that name.
 */
export function createRunner(framework: TestFramework): Runner {
  const ctor = runnerRegistry[framework];
  if (ctor) {
    return new ctor();
  }
  throw new Error(`Unknown test framework: ${framework}`);
}
/**
 * Convenience wrapper: builds a RunnerConfig for `testDir` and executes it
 * with the runner matching `options.framework`. Results are also written to
 * a timestamped JSON file under `.evaluclaude/results/`.
 */
export async function runTests(
  testDir: string,
  options: ExecutionOptions,
  sandboxConfig: SandboxConfig = DEFAULT_SANDBOX_CONFIG
): Promise<ExecutionResult> {
  // Sandbox settings are only forwarded when the caller asked for sandboxing.
  const effectiveSandbox = options.sandbox ? sandboxConfig : undefined;
  const outputFile = `.evaluclaude/results/${options.framework}-${Date.now()}.json`;
  return createRunner(options.framework).run({
    testDir,
    outputFile,
    options,
    sandboxConfig: effectiveSandbox,
  });
}
/**
 * Heuristically picks a test framework for the files in `testDir`.
 *
 * Rules: more `.py` files than `.ts`/`.js` files => pytest; otherwise jest
 * when the parent directory's package.json declares a jest dependency;
 * vitest as the final fallback.
 *
 * Fix: the original called `require('fs')`/`require('path')` inside an ES
 * module (this file uses `import`/`export` with `.js` specifiers), which
 * throws `ReferenceError: require is not defined` under Node ESM. It also
 * scanned the directory twice. Uses the named fs/path imports instead and
 * reads the directory once.
 */
export function detectTestFramework(testDir: string): TestFramework {
  const entries = readdirSync(testDir);
  const pythonFiles = entries.filter((f) => f.endsWith('.py'));
  const tsFiles = entries.filter((f) => f.endsWith('.ts') || f.endsWith('.js'));
  if (pythonFiles.length > tsFiles.length) {
    return 'pytest';
  }
  const packageJsonPath = join(testDir, '..', 'package.json');
  if (existsSync(packageJsonPath)) {
    try {
      const pkg = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
      if (pkg.devDependencies?.jest || pkg.dependencies?.jest) {
        return 'jest';
      }
    } catch {
      // Unreadable or malformed package.json: fall through to the default.
    }
  }
  return 'vitest';
}
/**
 * Renders an ExecutionResult as a human-readable, multi-line summary:
 * header with counts/duration, an optional errors section, and an optional
 * list of failed/errored tests with their messages.
 */
export function formatResults(result: ExecutionResult): string {
  const header = [
    '',
    '📊 Test Execution Results',
    '═'.repeat(40),
    ` Total: ${result.summary.total}`,
    ` ✅ Passed: ${result.summary.passed}`,
    ` ❌ Failed: ${result.summary.failed}`,
    ` ⏭️ Skipped: ${result.summary.skipped}`,
    ` ⏱️ Duration: ${result.summary.duration}ms`,
  ];
  const errorSection =
    result.errors.length > 0
      ? ['', '⚠️ Errors:', ...result.errors.map((error) => `${error}`)]
      : [];
  const failures = result.tests.filter((t) => t.status === 'failed' || t.status === 'error');
  const failureSection: string[] = [];
  if (failures.length > 0) {
    failureSection.push('', '❌ Failed Tests:');
    for (const test of failures) {
      failureSection.push(`${test.name}`);
      if (test.error) {
        failureSection.push(`  ${test.error.message}`);
      }
    }
  }
  // Trailing '' yields the final newline the original emitted.
  return [...header, ...errorSection, ...failureSection, ''].join('\n');
}

View file

@ -0,0 +1,164 @@
import { readFile, writeFile, mkdir } from 'fs/promises';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
import { sandboxedExec } from './sandbox.js';
/**
 * Shape of the JSON file written by the pytest-json-report plugin
 * (`--json-report-file`). Only the fields parseResults() reads are modeled;
 * NOTE(review): confirm field coverage against the plugin's documented schema.
 */
interface PytestJsonReport {
created: number;
// Total run duration in seconds (converted to ms downstream).
duration: number;
exitcode: number;
root: string;
environment: Record<string, string>;
summary: {
passed: number;
failed: number;
error: number;
skipped: number;
total: number;
collected: number;
};
tests: PytestTestResult[];
}
/** One test entry from the pytest-json-report `tests` array. */
interface PytestTestResult {
// pytest node id, e.g. "tests/test_mod.py::test_name".
nodeid: string;
outcome: 'passed' | 'failed' | 'skipped' | 'error';
keywords: string[];
setup?: { duration: number; outcome: string };
// Phase data for the test body; crash/traceback present on failure.
call?: {
duration: number;
outcome: string;
crash?: { message: string; path: string; lineno: number };
traceback?: Array<{ path: string; lineno: number; message: string }>;
longrepr?: string;
};
teardown?: { duration: number; outcome: string };
}
/**
 * Runs pytest over a directory and normalizes its results.
 *
 * Relies on the pytest-json-report plugin for a machine-readable report;
 * when the report is missing or unreadable it falls back to scraping the
 * summary line from stdout.
 *
 * Fix: parseFromStdout's regex matched the "N error" count but the loop
 * silently dropped it, so errored tests vanished from the summary. Errors
 * are now counted into `total` and surfaced via `errors`, consistent with
 * how parseResults() reports report-level errors. Also removed an unused
 * local (`lines`).
 */
export class PytestRunner implements Runner {
  name = 'pytest' as const;

  /**
   * Executes pytest (optionally sandboxed), parses the JSON report if one
   * was produced, appends a timeout note when the run was killed, and
   * optionally persists the normalized result to `config.outputFile`.
   */
  async run(config: RunnerConfig): Promise<ExecutionResult> {
    const { testDir, outputFile, options, sandboxConfig } = config;
    // pytest-json-report writes its report here.
    const reportFile = join(testDir, '.pytest_report.json');
    const args = [
      '-v',
      '--tb=short',
      '--json-report',
      `--json-report-file=${reportFile}`,
    ];
    if (options.parallel) {
      // Requires pytest-xdist for -n auto.
      args.push('-n', 'auto');
    }
    if (options.filter && options.filter.length > 0) {
      args.push('-k', options.filter.join(' or '));
    }
    args.push(testDir);
    const result = await sandboxedExec('python', ['-m', 'pytest', ...args], {
      cwd: options.cwd || process.cwd(),
      timeout: options.timeout,
      env: options.env,
      sandboxConfig: sandboxConfig,
    });
    let report: PytestJsonReport | undefined;
    if (existsSync(reportFile)) {
      try {
        const content = await readFile(reportFile, 'utf-8');
        report = JSON.parse(content);
      } catch {
        // Unreadable report: fall back to stdout parsing below.
      }
    }
    const executionResult = this.parseResults(result.stdout + result.stderr, report);
    if (result.timedOut) {
      executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
    }
    if (outputFile) {
      await mkdir(dirname(outputFile), { recursive: true });
      await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
    }
    return executionResult;
  }

  /**
   * Normalizes pytest output. Prefers the structured JSON report; falls back
   * to scraping the stdout summary line when no report is available.
   */
  parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
    const report = jsonReport as PytestJsonReport | undefined;
    if (!report) {
      return this.parseFromStdout(rawOutput);
    }
    const summary: ExecutionSummary = {
      total: report.summary.total,
      passed: report.summary.passed,
      failed: report.summary.failed,
      skipped: report.summary.skipped,
      // pytest reports seconds; our summaries use milliseconds.
      duration: report.duration * 1000,
    };
    const tests: TestResult[] = report.tests.map((t) => ({
      id: this.extractScenarioId(t.nodeid),
      name: t.nodeid,
      status: t.outcome === 'error' ? 'error' : t.outcome,
      duration: (t.call?.duration || 0) * 1000,
      assertions: {
        passed: t.outcome === 'passed' ? 1 : 0,
        failed: t.outcome === 'failed' ? 1 : 0,
        details: [],
      },
      error: t.call?.crash
        ? { message: t.call.crash.message, stack: t.call.longrepr }
        : undefined,
    }));
    return {
      summary,
      tests,
      errors: report.summary.error > 0 ? [`${report.summary.error} tests had errors`] : [],
    };
  }

  /** Fallback parser: scrapes counts from pytest's "N passed, M failed" line. */
  private parseFromStdout(stdout: string): ExecutionResult {
    const summaryMatch = stdout.match(/(\d+) passed|(\d+) failed|(\d+) skipped|(\d+) error/g);
    let passed = 0, failed = 0, skipped = 0, errored = 0;
    if (summaryMatch) {
      for (const match of summaryMatch) {
        const [num, type] = match.split(' ');
        const count = parseInt(num, 10);
        if (type === 'passed') passed = count;
        if (type === 'failed') failed = count;
        if (type === 'skipped') skipped = count;
        if (type === 'error') errored = count; // previously matched but dropped
      }
    }
    return {
      summary: {
        total: passed + failed + skipped + errored,
        passed,
        failed,
        skipped,
        duration: 0,
      },
      tests: [],
      // Mirror parseResults(): surface errored tests in the errors list.
      errors: errored > 0 ? [`${errored} tests had errors`] : [],
    };
  }

  /** Derives a scenario id from a pytest node id (text after "test_"). */
  private extractScenarioId(nodeid: string): string {
    const match = nodeid.match(/test_([a-zA-Z0-9_-]+)/);
    return match ? match[1] : nodeid;
  }
}

126
src/runners/sandbox.ts Normal file
View file

@ -0,0 +1,126 @@
import { spawn, type ChildProcess, type SpawnOptions } from 'child_process';
import type { SandboxConfig, DEFAULT_SANDBOX_CONFIG } from './types.js';
/** Outcome of a sandboxedExec() invocation. */
export interface SandboxedExecResult {
// Child exit code; 1 when the process failed to spawn or exited via signal.
exitCode: number;
stdout: string;
stderr: string;
// True when the child was killed for exceeding the configured timeout.
timedOut: boolean;
}
/**
 * Spawns `command` with a controlled environment and a hard timeout,
 * capturing stdout/stderr. Never rejects: spawn failures surface as
 * exitCode 1 with the error message appended to stderr.
 *
 * When sandboxing is enabled, the child environment starts empty and only
 * inherits the configured allow-list plus `sandboxConfig.env.set`;
 * otherwise the full parent environment is inherited. Caller-provided
 * `env` entries always win.
 *
 * Fix: the SIGKILL escalation timer scheduled after SIGTERM was never
 * cleared — it fired on an already-dead child and kept the event loop
 * alive for ~1s after every timeout, and leaked when the child exited
 * between SIGTERM and SIGKILL. Both timers are now cleared as soon as the
 * child settles.
 */
export async function sandboxedExec(
  command: string,
  args: string[],
  options: {
    cwd: string;
    timeout: number;
    env?: Record<string, string>;
    sandboxConfig?: SandboxConfig;
  }
): Promise<SandboxedExecResult> {
  const { cwd, timeout, env = {}, sandboxConfig } = options;
  const spawnEnv: Record<string, string> = {};
  if (sandboxConfig?.enabled) {
    // Clean slate: inherit only the allow-listed variables, then overlay
    // the sandbox-mandated settings.
    for (const key of sandboxConfig.env.inherit) {
      if (process.env[key]) {
        spawnEnv[key] = process.env[key]!;
      }
    }
    Object.assign(spawnEnv, sandboxConfig.env.set);
  } else {
    Object.assign(spawnEnv, process.env);
  }
  Object.assign(spawnEnv, env); // caller-provided vars always win
  const spawnOptions: SpawnOptions = {
    cwd,
    env: spawnEnv,
    stdio: ['pipe', 'pipe', 'pipe'],
  };
  return new Promise((resolve) => {
    let stdout = '';
    let stderr = '';
    let timedOut = false;
    let killTimer: ReturnType<typeof setTimeout> | undefined;
    const child: ChildProcess = spawn(command, args, spawnOptions);
    const timeoutId = setTimeout(() => {
      timedOut = true;
      child.kill('SIGTERM');
      // Escalate if the child ignores SIGTERM; cleared on settle so it
      // cannot fire on a dead process or hold the event loop open.
      killTimer = setTimeout(() => child.kill('SIGKILL'), 1000);
    }, timeout);
    const clearTimers = () => {
      clearTimeout(timeoutId);
      if (killTimer) {
        clearTimeout(killTimer);
      }
    };
    child.stdout?.on('data', (data: Buffer) => {
      stdout += data.toString();
    });
    child.stderr?.on('data', (data: Buffer) => {
      stderr += data.toString();
    });
    child.on('close', (code) => {
      clearTimers();
      resolve({
        exitCode: code ?? 1,
        stdout,
        stderr,
        timedOut,
      });
    });
    child.on('error', (err) => {
      clearTimers();
      resolve({
        exitCode: 1,
        stdout,
        stderr: stderr + '\n' + err.message,
        timedOut: false,
      });
    });
  });
}
/**
 * Wraps `command` in an OS-level sandbox when one is available.
 *
 * macOS: rewrites the invocation to `sandbox-exec` with a generated
 * Seatbelt profile. Other platforms (and disabled configs) get the
 * command back unmodified.
 *
 * Fix: removed the dead `sandboxArgs` array — it was built (including a
 * '--deny-network-outbound' flag) but never used; the darwin branch always
 * delegated network policy to buildSandboxProfile().
 */
export function buildSandboxCommand(
  command: string,
  args: string[],
  config: SandboxConfig
): { command: string; args: string[] } {
  if (!config.enabled) {
    return { command, args };
  }
  if (process.platform === 'darwin') {
    return {
      command: 'sandbox-exec',
      args: ['-p', buildSandboxProfile(config), command, ...args],
    };
  }
  // No sandbox backend for this platform; run the command unmodified.
  return { command, args };
}
/**
 * Generates a macOS Seatbelt profile for sandbox-exec: allow-by-default,
 * optionally denying outbound network, and denying writes to each
 * configured read-only path (the root path '/' is skipped to avoid
 * locking down the whole filesystem).
 */
function buildSandboxProfile(config: SandboxConfig): string {
  const networkRules = config.network.allowOutbound
    ? []
    : ['(deny network-outbound (remote ip "*:*"))'];
  const fsRules = config.filesystem.readOnly
    .filter((path) => path !== '/')
    .map((path) => `(deny file-write* (subpath "${path}"))`);
  return ['(version 1)', '(allow default)', ...networkRules, ...fsRules].join('\n');
}

95
src/runners/types.ts Normal file
View file

@ -0,0 +1,95 @@
/** Test frameworks the runner layer knows how to drive. */
export type TestFramework = 'pytest' | 'vitest' | 'jest';
/** Caller-facing knobs for a single test run. */
export interface ExecutionOptions {
framework: TestFramework;
// When true, the run is wrapped per the provided SandboxConfig.
sandbox: boolean;
// Whole-run timeout in milliseconds.
timeout: number;
parallel: boolean;
// Only run tests matching these patterns (framework-specific matching).
filter?: string[];
cwd?: string;
env?: Record<string, string>;
}
/** Normalized outcome of a run, framework-independent. */
export interface ExecutionResult {
summary: ExecutionSummary;
tests: TestResult[];
// Run-level problems (timeouts, report-level errors), not per-test failures.
errors: string[];
traceId?: string;
}
/** Aggregate counts for a run; duration is in milliseconds. */
export interface ExecutionSummary {
total: number;
passed: number;
failed: number;
skipped: number;
duration: number;
}
/** One normalized test outcome. */
export interface TestResult {
// Scenario id extracted from the framework's test name.
id: string;
name: string;
status: 'passed' | 'failed' | 'skipped' | 'error';
// Milliseconds.
duration: number;
assertions: {
passed: number;
failed: number;
details: AssertionResult[];
};
error?: { message: string; stack?: string };
stdout?: string;
stderr?: string;
}
/** One assertion outcome inside a test. */
export interface AssertionResult {
description: string;
passed: boolean;
expected?: unknown;
actual?: unknown;
}
/** Sandbox policy: environment allow-list, filesystem and network limits. */
export interface SandboxConfig {
enabled: boolean;
autoAllowBashIfSandboxed: boolean;
network: {
allowLocalBinding: boolean;
allowOutbound: boolean;
};
filesystem: {
// Paths the sandboxed process may read but not write.
readOnly: string[];
writable: string[];
};
env: {
// Parent environment variables the child may inherit.
inherit: string[];
// Variables forced into the child environment.
set: Record<string, string>;
};
}
// Default policy: sandbox on, no outbound network, writes confined to
// /tmp and ./test-output, minimal inherited environment plus CI markers.
export const DEFAULT_SANDBOX_CONFIG: SandboxConfig = {
enabled: true,
autoAllowBashIfSandboxed: true,
network: {
allowLocalBinding: true,
allowOutbound: false,
},
filesystem: {
readOnly: ['/'],
writable: ['/tmp', './test-output'],
},
env: {
inherit: ['PATH', 'HOME', 'USER'],
set: { CI: 'true', NODE_ENV: 'test' },
},
};
/** Everything a Runner needs for one invocation. */
export interface RunnerConfig {
testDir: string;
// Where the normalized ExecutionResult JSON is written.
outputFile: string;
options: ExecutionOptions;
// Absent when the caller disabled sandboxing.
sandboxConfig?: SandboxConfig;
}
/** Contract implemented by each framework adapter. */
export interface Runner {
name: TestFramework;
run(config: RunnerConfig): Promise<ExecutionResult>;
// Parses raw output (and an optional structured report) into a result.
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult;
}

View file

@ -0,0 +1,213 @@
import { readFile, writeFile, mkdir } from 'fs/promises';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
import { sandboxedExec } from './sandbox.js';
/**
 * Shape of the JSON report emitted by `vitest run --reporter=json`.
 * JestRunner reuses this type because `jest --json` shares the schema.
 * NOTE(review): only the fields parseResults() reads are modeled — confirm
 * against the reporter version in use.
 */
interface VitestJsonReport {
numTotalTestSuites: number;
numPassedTestSuites: number;
numFailedTestSuites: number;
numTotalTests: number;
numPassedTests: number;
numFailedTests: number;
numSkippedTests: number;
// Epoch milliseconds; duration is derived as endTime - startTime.
startTime: number;
endTime: number;
testResults: VitestTestFile[];
}
/** One test file's entry in the report. */
interface VitestTestFile {
name: string;
status: 'passed' | 'failed';
startTime: number;
endTime: number;
assertionResults: VitestAssertion[];
}
/** One individual test case within a file. */
interface VitestAssertion {
ancestorTitles: string[];
fullName: string;
status: 'passed' | 'failed' | 'skipped';
title: string;
// Milliseconds.
duration: number;
failureMessages: string[];
}
/**
 * Drives `vitest run` over a directory and normalizes its JSON report,
 * falling back to scraping stdout counts when no report was produced.
 */
export class VitestRunner implements Runner {
  name = 'vitest' as const;

  /**
   * Executes vitest via npx (optionally sandboxed), parses the JSON report
   * if present, notes timeouts, and optionally persists the normalized
   * result to `config.outputFile`.
   */
  async run(config: RunnerConfig): Promise<ExecutionResult> {
    const { testDir, outputFile, options, sandboxConfig } = config;
    const reportFile = join(testDir, '.vitest_report.json');
    const cliArgs = ['vitest', 'run', '--reporter=json', `--outputFile=${reportFile}`];
    if (options.filter && options.filter.length > 0) {
      cliArgs.push('--testNamePattern', options.filter.join('|'));
    }
    cliArgs.push(testDir);
    const proc = await sandboxedExec('npx', cliArgs, {
      cwd: options.cwd || process.cwd(),
      timeout: options.timeout,
      env: options.env,
      sandboxConfig,
    });
    let parsedReport: VitestJsonReport | undefined;
    if (existsSync(reportFile)) {
      try {
        parsedReport = JSON.parse(await readFile(reportFile, 'utf-8'));
      } catch {
        // Unreadable report: the stdout fallback below still applies.
      }
    }
    const outcome = this.parseResults(proc.stdout + proc.stderr, parsedReport);
    if (proc.timedOut) {
      outcome.errors.push(`Test execution timed out after ${options.timeout}ms`);
    }
    if (outputFile) {
      await mkdir(dirname(outputFile), { recursive: true });
      await writeFile(outputFile, JSON.stringify(outcome, null, 2));
    }
    return outcome;
  }

  /** Normalizes a vitest/jest JSON report; falls back to stdout scraping. */
  parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
    const report = jsonReport as VitestJsonReport | undefined;
    if (!report) {
      return this.parseFromStdout(rawOutput);
    }
    // Flatten per-file assertion results into one normalized test list.
    const tests: TestResult[] = report.testResults.flatMap((file) =>
      file.assertionResults.map((a) => ({
        id: this.extractScenarioId(a.fullName),
        name: a.fullName,
        status: a.status === 'skipped' ? 'skipped' : a.status,
        duration: a.duration,
        assertions: {
          passed: a.status === 'passed' ? 1 : 0,
          failed: a.status === 'failed' ? 1 : 0,
          details: [],
        },
        error: a.failureMessages.length > 0
          ? { message: a.failureMessages.join('\n') }
          : undefined,
      }))
    );
    return {
      summary: {
        total: report.numTotalTests,
        passed: report.numPassedTests,
        failed: report.numFailedTests,
        skipped: report.numSkippedTests,
        duration: report.endTime - report.startTime,
      },
      tests,
      errors: [],
    };
  }

  /** Fallback parser: pulls "N passed/failed/skipped" counts from stdout. */
  private parseFromStdout(stdout: string): ExecutionResult {
    const countOf = (label: string): number => {
      const m = stdout.match(new RegExp(`(\\d+) ${label}`));
      return m ? parseInt(m[1], 10) : 0;
    };
    const passed = countOf('passed');
    const failed = countOf('failed');
    const skipped = countOf('skipped');
    return {
      summary: {
        total: passed + failed + skipped,
        passed,
        failed,
        skipped,
        duration: 0,
      },
      tests: [],
      errors: [],
    };
  }

  /** Derives a scenario id from a test's full name (text after "test_"/"test "). */
  private extractScenarioId(fullName: string): string {
    const match = fullName.match(/test[_\s]([a-zA-Z0-9_-]+)/i);
    return match ? match[1] : fullName.replace(/\s+/g, '_');
  }
}
export class JestRunner implements Runner {
name = 'jest' as const;
async run(config: RunnerConfig): Promise<ExecutionResult> {
const { testDir, outputFile, options, sandboxConfig } = config;
const reportFile = join(testDir, '.jest_report.json');
const args = [
'jest',
'--json',
`--outputFile=${reportFile}`,
];
if (options.filter && options.filter.length > 0) {
args.push('--testNamePattern', options.filter.join('|'));
}
args.push(testDir);
const result = await sandboxedExec('npx', args, {
cwd: options.cwd || process.cwd(),
timeout: options.timeout,
env: options.env,
sandboxConfig: sandboxConfig,
});
let report: VitestJsonReport | undefined;
if (existsSync(reportFile)) {
try {
const content = await readFile(reportFile, 'utf-8');
report = JSON.parse(content);
} catch (e) {
}
}
const executionResult = this.parseResults(result.stdout + result.stderr, report);
if (result.timedOut) {
executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
}
if (outputFile) {
await mkdir(dirname(outputFile), { recursive: true });
await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
}
return executionResult;
}
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
const vitestRunner = new VitestRunner();
return vitestRunner.parseResults(rawOutput, jsonReport);
}
}