mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-15 07:04:47 +00:00
improvements and promptfoo
This commit is contained in:
parent
6698c12e5b
commit
ff5300f4e0
13 changed files with 1082 additions and 117 deletions
32
rubrics/code-quality.yaml
Normal file
32
rubrics/code-quality.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
name: code-quality
description: Evaluates generated code for quality and maintainability
passingThreshold: 0.7

criteria:
  - name: readability
    weight: 0.3
    description: Code is easy to read and understand
    examples:
      good: "Clear variable names, logical flow, proper indentation"
      bad: "Single-letter variables, deeply nested logic, inconsistent style"

  - name: correctness
    weight: 0.4
    description: Code correctly implements the intended behavior
    examples:
      good: "Handles edge cases, correct algorithm, proper error handling"
      bad: "Missing edge cases, off-by-one errors, swallowed exceptions"

  - name: efficiency
    weight: 0.2
    description: Code uses appropriate data structures and algorithms
    examples:
      good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
      bad: "O(n²) when O(n) is possible, creates objects in tight loops"

  - name: maintainability
    weight: 0.1
    description: Code is easy to modify and extend
    examples:
      good: "Single responsibility, low coupling, clear interfaces"
      bad: "God functions, tight coupling, magic numbers"
|
||||||
32
rubrics/documentation.yaml
Normal file
32
rubrics/documentation.yaml
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
name: documentation
description: Evaluates quality of code documentation and docstrings
passingThreshold: 0.65

criteria:
  - name: completeness
    weight: 0.35
    description: Documentation covers all parameters, return values, and exceptions
    examples:
      good: "Fully documents args, returns, raises, and includes usage example"
      bad: "Missing parameter descriptions or return type"

  - name: accuracy
    weight: 0.35
    description: Documentation accurately describes the function's behavior
    examples:
      good: "Description matches implementation, types are correct"
      bad: "Outdated docs that don't match current behavior"

  - name: examples
    weight: 0.2
    description: Includes helpful usage examples
    examples:
      good: "Shows common use cases with expected outputs"
      bad: "No examples or only trivial ones"

  - name: style
    weight: 0.1
    description: Follows project/language documentation conventions
    examples:
      good: "Uses standard docstring format (Google, NumPy, or reStructuredText)"
      bad: "Inconsistent or non-standard format"
|
||||||
25
rubrics/error-messages.yaml
Normal file
25
rubrics/error-messages.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
name: error-messages
description: Evaluates quality of error messages
passingThreshold: 0.6

criteria:
  - name: clarity
    weight: 0.4
    description: Error message clearly explains what went wrong
    examples:
      good: "Invalid email format: 'not-an-email' is missing '@' symbol"
      bad: "Error: validation failed"

  - name: actionability
    weight: 0.4
    description: Error message suggests how to fix the problem
    examples:
      good: "File not found. Create the file or check the path spelling."
      bad: "ENOENT"

  - name: context
    weight: 0.2
    description: Error message includes relevant context (file, line, values)
    examples:
      good: "TypeError at line 42 in auth.py: expected str, got int (value=123)"
      bad: "type error"
|
||||||
257
src/cli/commands/pipeline.ts
Normal file
257
src/cli/commands/pipeline.ts
Normal file
|
|
@ -0,0 +1,257 @@
|
||||||
|
import { Command } from 'commander';
|
||||||
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||||
|
import { join, resolve } from 'path';
|
||||||
|
import { analyze } from '../../introspector/index.js';
|
||||||
|
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
|
||||||
|
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
|
||||||
|
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
|
||||||
|
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||||
|
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||||
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
|
|
||||||
|
const EVALUCLAUDE_DIR = '.evaluclaude';
|
||||||
|
|
||||||
|
interface PipelineOptions {
|
||||||
|
output?: string;
|
||||||
|
interactive?: boolean;
|
||||||
|
focus?: string;
|
||||||
|
maxScenarios: string;
|
||||||
|
testDir: string;
|
||||||
|
framework?: string;
|
||||||
|
skipAnalyze?: boolean;
|
||||||
|
skipRender?: boolean;
|
||||||
|
skipRun?: boolean;
|
||||||
|
promptfoo?: boolean;
|
||||||
|
quiet?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const pipelineCommand = new Command('pipeline')
|
||||||
|
.description('Run the full eval generation pipeline: introspect → analyze → render → run')
|
||||||
|
.argument('[path]', 'Path to the repository to analyze', '.')
|
||||||
|
.option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
|
||||||
|
.option('-i, --interactive', 'Enable interactive mode with clarifying questions')
|
||||||
|
.option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
|
||||||
|
.option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
|
||||||
|
.option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
|
||||||
|
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||||
|
.option('--skip-analyze', 'Skip analysis, use existing spec')
|
||||||
|
.option('--skip-render', 'Skip rendering, use existing tests')
|
||||||
|
.option('--skip-run', 'Skip test execution')
|
||||||
|
.option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
|
||||||
|
.option('--quiet', 'Suppress progress messages')
|
||||||
|
.action(async (repoPath: string, options: PipelineOptions) => {
|
||||||
|
const absolutePath = resolve(repoPath);
|
||||||
|
const log = options.quiet ? () => {} : console.log;
|
||||||
|
const outputDir = options.output || EVALUCLAUDE_DIR;
|
||||||
|
|
||||||
|
console.log('\n🚀 Evaluclaude Pipeline');
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log(` Repository: ${absolutePath}`);
|
||||||
|
console.log(` Output: ${outputDir}`);
|
||||||
|
console.log('═'.repeat(50) + '\n');
|
||||||
|
|
||||||
|
// Ensure output directories exist
|
||||||
|
mkdirSync(outputDir, { recursive: true });
|
||||||
|
mkdirSync(options.testDir, { recursive: true });
|
||||||
|
|
||||||
|
const specPath = join(outputDir, 'spec.json');
|
||||||
|
const tracesDir = join(outputDir, 'traces');
|
||||||
|
const resultsDir = join(outputDir, 'results');
|
||||||
|
|
||||||
|
mkdirSync(tracesDir, { recursive: true });
|
||||||
|
mkdirSync(resultsDir, { recursive: true });
|
||||||
|
|
||||||
|
let spec: EvalSpec;
|
||||||
|
|
||||||
|
// Step 1: Introspection + Analysis
|
||||||
|
if (options.skipAnalyze && existsSync(specPath)) {
|
||||||
|
log('📋 Using existing EvalSpec...');
|
||||||
|
spec = JSON.parse(readFileSync(specPath, 'utf-8'));
|
||||||
|
log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
|
||||||
|
} else {
|
||||||
|
log('🔬 Step 1: Introspecting codebase...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const repoSummary = await analyze({
|
||||||
|
root: absolutePath,
|
||||||
|
onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
|
||||||
|
});
|
||||||
|
|
||||||
|
log(` Files: ${repoSummary.files.length}`);
|
||||||
|
log(` Languages: ${repoSummary.languages.join(', ')}`);
|
||||||
|
log('');
|
||||||
|
|
||||||
|
log('🤖 Step 2: Generating EvalSpec with Claude...\n');
|
||||||
|
|
||||||
|
const focus = options.focus?.split(',').map(s => s.trim());
|
||||||
|
const maxScenarios = parseInt(options.maxScenarios, 10);
|
||||||
|
|
||||||
|
let result;
|
||||||
|
if (options.interactive) {
|
||||||
|
const { default: inquirer } = await import('inquirer');
|
||||||
|
|
||||||
|
result = await generateEvalSpecInteractive(
|
||||||
|
repoSummary,
|
||||||
|
async (question: string) => {
|
||||||
|
const { answer } = await inquirer.prompt([{
|
||||||
|
type: 'input',
|
||||||
|
name: 'answer',
|
||||||
|
message: `🤖 Claude asks: ${question}`,
|
||||||
|
}]);
|
||||||
|
return answer;
|
||||||
|
},
|
||||||
|
{ focus, maxScenarios }
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
result = await generateEvalSpec(repoSummary, {
|
||||||
|
interactive: false,
|
||||||
|
focus,
|
||||||
|
maxScenarios,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
spec = result.spec;
|
||||||
|
|
||||||
|
// Save the spec
|
||||||
|
writeFileSync(specPath, JSON.stringify(spec, null, 2));
|
||||||
|
|
||||||
|
log(`\n✅ EvalSpec generated!`);
|
||||||
|
log(` Scenarios: ${spec.scenarios.length}`);
|
||||||
|
log(` Tokens: ${result.tokensUsed}`);
|
||||||
|
log(` Saved: ${specPath}\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Render tests
|
||||||
|
if (!options.skipRender) {
|
||||||
|
log('📝 Step 3: Rendering test files...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
|
||||||
|
const renderResult = await renderSpec(spec, {
|
||||||
|
outputDir: options.testDir,
|
||||||
|
framework,
|
||||||
|
includeFixtures: true,
|
||||||
|
generateMocks: true,
|
||||||
|
dryRun: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
log(` Framework: ${framework}`);
|
||||||
|
log(` Files: ${renderResult.stats.fileCount}`);
|
||||||
|
log(` Scenarios: ${renderResult.stats.scenarioCount}`);
|
||||||
|
log(` Assertions: ${renderResult.stats.assertionCount}`);
|
||||||
|
log(` Output: ${options.testDir}\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: Run tests
|
||||||
|
if (!options.skipRun) {
|
||||||
|
log('🧪 Step 4: Running tests...\n');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
const tracer = createTracer(spec.repo.name);
|
||||||
|
|
||||||
|
tracer.recordIntrospection({
|
||||||
|
filesAnalyzed: spec.scenarios.map(s => s.target.module),
|
||||||
|
totalFunctions: spec.scenarios.length,
|
||||||
|
duration: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
tracer.recordGeneration({
|
||||||
|
scenariosGenerated: spec.scenarios.length,
|
||||||
|
filesWritten: [options.testDir],
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runTests(
|
||||||
|
options.testDir,
|
||||||
|
{
|
||||||
|
framework,
|
||||||
|
sandbox: true,
|
||||||
|
timeout: 300000,
|
||||||
|
parallel: false,
|
||||||
|
cwd: process.cwd(),
|
||||||
|
},
|
||||||
|
DEFAULT_SANDBOX_CONFIG
|
||||||
|
);
|
||||||
|
|
||||||
|
tracer.recordExecution({
|
||||||
|
testsPassed: result.summary.passed,
|
||||||
|
testsFailed: result.summary.failed,
|
||||||
|
testsSkipped: result.summary.skipped,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const test of result.tests) {
|
||||||
|
if (test.status === 'failed' || test.status === 'error') {
|
||||||
|
tracer.recordTestFailure({
|
||||||
|
scenarioId: test.id,
|
||||||
|
testName: test.name,
|
||||||
|
error: test.error?.message || 'Unknown error',
|
||||||
|
stack: test.error?.stack,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
const tracePath = await saveTrace(trace);
|
||||||
|
|
||||||
|
log(formatResults(result));
|
||||||
|
log(`📊 Trace saved: ${tracePath}`);
|
||||||
|
log(` View with: evaluclaude view ${trace.id}\n`);
|
||||||
|
|
||||||
|
// Save results
|
||||||
|
const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
|
||||||
|
writeFileSync(resultsPath, JSON.stringify(result, null, 2));
|
||||||
|
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: Generate Promptfoo config
|
||||||
|
if (options.promptfoo) {
|
||||||
|
log('📦 Step 5: Generating Promptfoo configuration...');
|
||||||
|
|
||||||
|
try {
|
||||||
|
const configPath = join(outputDir, 'promptfooconfig.yaml');
|
||||||
|
const providerPath = join(outputDir, 'providers', 'test-runner.py');
|
||||||
|
const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
|
||||||
|
|
||||||
|
await generatePromptfooConfig(spec, {
|
||||||
|
testDir: options.testDir,
|
||||||
|
outputPath: configPath,
|
||||||
|
framework,
|
||||||
|
includeTraceLinks: true,
|
||||||
|
providerPath,
|
||||||
|
});
|
||||||
|
|
||||||
|
await generateTestProvider(providerPath);
|
||||||
|
|
||||||
|
log(` Config: ${configPath}`);
|
||||||
|
log(` Provider: ${providerPath}`);
|
||||||
|
log(`\n Launch UI with: evaluclaude ui\n`);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log('✅ Pipeline complete!');
|
||||||
|
console.log('═'.repeat(50));
|
||||||
|
console.log(`\nNext steps:`);
|
||||||
|
console.log(` View traces: evaluclaude view --last`);
|
||||||
|
console.log(` List all traces: evaluclaude traces`);
|
||||||
|
if (options.promptfoo) {
|
||||||
|
console.log(` Launch UI: evaluclaude ui`);
|
||||||
|
console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
|
||||||
|
}
|
||||||
|
console.log('');
|
||||||
|
});
|
||||||
|
|
@ -10,6 +10,7 @@ import {
|
||||||
DEFAULT_SANDBOX_CONFIG
|
DEFAULT_SANDBOX_CONFIG
|
||||||
} from '../../runners/index.js';
|
} from '../../runners/index.js';
|
||||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||||
|
import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
|
||||||
import type { EvalSpec } from '../../analyzer/types.js';
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
|
|
||||||
export const runCommand = new Command('run')
|
export const runCommand = new Command('run')
|
||||||
|
|
@ -25,6 +26,7 @@ export const runCommand = new Command('run')
|
||||||
.option('-o, --output <file>', 'Output results to JSON file')
|
.option('-o, --output <file>', 'Output results to JSON file')
|
||||||
.option('--trace', 'Record execution trace', true)
|
.option('--trace', 'Record execution trace', true)
|
||||||
.option('--no-trace', 'Disable execution tracing')
|
.option('--no-trace', 'Disable execution tracing')
|
||||||
|
.option('--export-promptfoo', 'Export results in Promptfoo format', false)
|
||||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||||
.action(async (testDir: string, options) => {
|
.action(async (testDir: string, options) => {
|
||||||
try {
|
try {
|
||||||
|
|
@ -109,6 +111,16 @@ export const runCommand = new Command('run')
|
||||||
console.log(`\n📁 Results saved to: ${options.output}`);
|
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Export to Promptfoo format for UI viewing
|
||||||
|
if (options.exportPromptfoo) {
|
||||||
|
const exportPath = await exportToPromptfooFormat(result, spec, {
|
||||||
|
outputDir: '.evaluclaude/results',
|
||||||
|
evalId: `eval-${Date.now()}`,
|
||||||
|
});
|
||||||
|
console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
|
||||||
|
console.log(` View with: evaluclaude ui`);
|
||||||
|
}
|
||||||
|
|
||||||
if (tracer) {
|
if (tracer) {
|
||||||
const trace = tracer.finalize();
|
const trace = tracer.finalize();
|
||||||
const tracePath = await saveTrace(trace);
|
const tracePath = await saveTrace(trace);
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import { Command } from 'commander';
|
import { Command } from 'commander';
|
||||||
import { spawn, type ChildProcess } from 'child_process';
|
import { spawn } from 'child_process';
|
||||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||||
import { join, dirname } from 'path';
|
import { join, dirname, resolve as resolvePath } from 'path';
|
||||||
import type { EvalSpec } from '../../analyzer/types.js';
|
import type { EvalSpec } from '../../analyzer/types.js';
|
||||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||||
|
|
||||||
|
|
@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
|
||||||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||||
|
|
||||||
|
// If spec provided with --generate, create/update Promptfoo config
|
||||||
if (options.spec && options.generate) {
|
if (options.spec && options.generate) {
|
||||||
console.log('\n📄 Generating Promptfoo configuration...');
|
console.log('\n📄 Generating Promptfoo configuration...');
|
||||||
|
|
||||||
|
|
@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
|
||||||
outputPath: configPath,
|
outputPath: configPath,
|
||||||
framework: detectFramework(spec),
|
framework: detectFramework(spec),
|
||||||
includeTraceLinks: true,
|
includeTraceLinks: true,
|
||||||
|
providerPath: providerPath,
|
||||||
});
|
});
|
||||||
|
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
|
|
@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
|
||||||
console.log(` Provider: ${providerPath}`);
|
console.log(` Provider: ${providerPath}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for existing config, create default if missing
|
||||||
if (!existsSync(configPath)) {
|
if (!existsSync(configPath)) {
|
||||||
console.log('\n⚠️ No Promptfoo config found.');
|
console.log('\n⚠️ No Promptfoo config found.');
|
||||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
console.log(' Creating default configuration...\n');
|
||||||
console.log(' Or create one manually:');
|
|
||||||
console.log(` ${configPath}\n`);
|
|
||||||
|
|
||||||
await createDefaultConfig(configPath, providerPath);
|
await createDefaultConfig(configPath, providerPath);
|
||||||
console.log(` Created default config at ${configPath}`);
|
console.log(` Created: ${configPath}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for results to display
|
||||||
|
const resultsDir = join(EVALUCLAUDE_DIR, 'results');
|
||||||
|
const latestResults = join(resultsDir, 'latest.json');
|
||||||
|
|
||||||
|
if (!existsSync(latestResults)) {
|
||||||
|
console.log('\n⚠️ No evaluation results found.');
|
||||||
|
console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
|
||||||
|
console.log(' Or run the full pipeline:');
|
||||||
|
console.log(' evaluclaude pipeline <path> --promptfoo\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||||
console.log(` Config: ${configPath}\n`);
|
console.log(` Results: ${latestResults}\n`);
|
||||||
|
|
||||||
await launchPromptfooUI(port, configPath, options.open);
|
// Use promptfoo view with the results file
|
||||||
|
await launchPromptfooView(port, latestResults, options.open);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
|
|
@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
|
||||||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||||
.option('--view', 'Launch UI after evaluation', false)
|
.option('--view', 'Launch UI after evaluation', false)
|
||||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||||
|
.option('--no-cache', 'Disable Promptfoo caching', false)
|
||||||
.action(async (options) => {
|
.action(async (options) => {
|
||||||
try {
|
try {
|
||||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||||
|
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||||
|
|
||||||
|
// Generate config from spec if provided
|
||||||
if (options.spec) {
|
if (options.spec) {
|
||||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||||
|
|
||||||
|
if (!existsSync(options.spec)) {
|
||||||
|
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||||
|
|
||||||
await generatePromptfooConfig(spec, {
|
await generatePromptfooConfig(spec, {
|
||||||
|
|
@ -84,30 +106,57 @@ export const evalCommand = new Command('eval')
|
||||||
outputPath: configPath,
|
outputPath: configPath,
|
||||||
framework: detectFramework(spec),
|
framework: detectFramework(spec),
|
||||||
includeTraceLinks: true,
|
includeTraceLinks: true,
|
||||||
|
providerPath: providerPath,
|
||||||
});
|
});
|
||||||
|
|
||||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
|
|
||||||
|
console.log(` Config: ${configPath}`);
|
||||||
|
console.log(` Provider: ${providerPath}`);
|
||||||
|
console.log(` Scenarios: ${spec.scenarios.length}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!existsSync(configPath)) {
|
if (!existsSync(configPath)) {
|
||||||
console.error(`Error: Config not found: ${configPath}`);
|
console.error(`\nError: Config not found: ${configPath}`);
|
||||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
console.log('Run with --spec <file> to generate from EvalSpec, or create config manually.');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
// Ensure output directory exists
|
||||||
|
mkdirSync(options.output, { recursive: true });
|
||||||
|
|
||||||
|
console.log('\n🧪 Running Promptfoo evaluations...');
|
||||||
|
console.log(` Config: ${configPath}`);
|
||||||
|
console.log(` Output: ${options.output}\n`);
|
||||||
|
|
||||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||||
mkdirSync(dirname(outputFile), { recursive: true });
|
|
||||||
|
|
||||||
await runPromptfooEval(configPath, outputFile);
|
const exitCode = await runPromptfooEval(configPath, outputFile, !options.cache);
|
||||||
|
|
||||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
if (exitCode === 0) {
|
||||||
|
console.log(`\n✅ Evaluation complete!`);
|
||||||
|
console.log(`📁 Results: ${outputFile}`);
|
||||||
|
} else {
|
||||||
|
console.log(`\n⚠️ Evaluation finished with exit code ${exitCode}`);
|
||||||
|
console.log(`📁 Results: ${outputFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// List traces generated during evaluation
|
||||||
|
const tracesDir = join(EVALUCLAUDE_DIR, 'traces');
|
||||||
|
if (existsSync(tracesDir)) {
|
||||||
|
const { readdirSync } = await import('fs');
|
||||||
|
const traces = readdirSync(tracesDir).filter(f => f.endsWith('.json'));
|
||||||
|
if (traces.length > 0) {
|
||||||
|
console.log(`\n📊 Traces generated: ${traces.length}`);
|
||||||
|
console.log(` View with: evaluclaude view --last`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (options.view) {
|
if (options.view) {
|
||||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||||
|
} else {
|
||||||
|
console.log(`\n View results: evaluclaude ui`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||||
|
|
@ -115,6 +164,64 @@ export const evalCommand = new Command('eval')
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Launch Promptfoo view to display pre-computed results.
|
||||||
|
*/
|
||||||
|
async function launchPromptfooView(
|
||||||
|
port: number,
|
||||||
|
resultsFile: string,
|
||||||
|
openBrowser: boolean
|
||||||
|
): Promise<void> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
// Use 'promptfoo view' which opens the web UI showing results from the output directory
|
||||||
|
const resultsDir = dirname(resolvePath(resultsFile));
|
||||||
|
const args = ['promptfoo', 'view', '--port', String(port)];
|
||||||
|
|
||||||
|
if (openBrowser) {
|
||||||
|
args.push('-y');
|
||||||
|
} else {
|
||||||
|
args.push('-n');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pass the directory containing results
|
||||||
|
args.push(resultsDir);
|
||||||
|
|
||||||
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
||||||
|
const child = spawn('npx', args, {
|
||||||
|
stdio: 'inherit',
|
||||||
|
env: { ...process.env },
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('error', (error) => {
|
||||||
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo');
|
||||||
|
console.error(' Or run: npx promptfoo --version\n');
|
||||||
|
} else {
|
||||||
|
reject(error);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on('close', (code) => {
|
||||||
|
if (code === 0) {
|
||||||
|
resolve();
|
||||||
|
} else {
|
||||||
|
reject(new Error(`Promptfoo exited with code ${code}`));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle Ctrl+C gracefully
|
||||||
|
process.on('SIGINT', () => {
|
||||||
|
child.kill('SIGINT');
|
||||||
|
process.exit(0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Launch Promptfoo with a config file (for running evals).
|
||||||
|
*/
|
||||||
async function launchPromptfooUI(
|
async function launchPromptfooUI(
|
||||||
port: number,
|
port: number,
|
||||||
configPath: string,
|
configPath: string,
|
||||||
|
|
@ -129,7 +236,8 @@ async function launchPromptfooUI(
|
||||||
args.push('-n');
|
args.push('-n');
|
||||||
}
|
}
|
||||||
|
|
||||||
const configDir = dirname(configPath);
|
// Pass the directory containing the config
|
||||||
|
const configDir = dirname(resolvePath(configPath));
|
||||||
args.push(configDir);
|
args.push(configDir);
|
||||||
|
|
||||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
@ -141,7 +249,9 @@ async function launchPromptfooUI(
|
||||||
|
|
||||||
child.on('error', (error) => {
|
child.on('error', (error) => {
|
||||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo');
|
||||||
|
console.error(' Or run: npx promptfoo --version\n');
|
||||||
} else {
|
} else {
|
||||||
reject(error);
|
reject(error);
|
||||||
}
|
}
|
||||||
|
|
@ -155,6 +265,7 @@ async function launchPromptfooUI(
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Handle Ctrl+C gracefully
|
||||||
process.on('SIGINT', () => {
|
process.on('SIGINT', () => {
|
||||||
child.kill('SIGINT');
|
child.kill('SIGINT');
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
|
|
@ -162,16 +273,23 @@ async function launchPromptfooUI(
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
async function runPromptfooEval(
|
||||||
|
configPath: string,
|
||||||
|
outputFile: string,
|
||||||
|
noCache: boolean
|
||||||
|
): Promise<number> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
const args = [
|
const args = [
|
||||||
'promptfoo',
|
'promptfoo',
|
||||||
'eval',
|
'eval',
|
||||||
'-c', configPath,
|
'-c', configPath,
|
||||||
'-o', outputFile,
|
'-o', outputFile,
|
||||||
'--no-cache',
|
|
||||||
];
|
];
|
||||||
|
|
||||||
|
if (noCache) {
|
||||||
|
args.push('--no-cache');
|
||||||
|
}
|
||||||
|
|
||||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||||
|
|
||||||
const child = spawn('npx', args, {
|
const child = spawn('npx', args, {
|
||||||
|
|
@ -179,14 +297,18 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
||||||
env: { ...process.env },
|
env: { ...process.env },
|
||||||
});
|
});
|
||||||
|
|
||||||
child.on('error', reject);
|
child.on('error', (error) => {
|
||||||
|
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
console.error('\n❌ Promptfoo not found.');
|
||||||
|
console.error(' Install with: npm install -g promptfoo\n');
|
||||||
|
reject(error);
|
||||||
|
} else {
|
||||||
|
reject(error);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
child.on('close', (code) => {
|
child.on('close', (code) => {
|
||||||
if (code === 0) {
|
resolve(code ?? 1);
|
||||||
resolve();
|
|
||||||
} else {
|
|
||||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -194,6 +316,14 @@ async function runPromptfooEval(configPath: string, outputFile: string): Promise
|
||||||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||||
# Generated by evaluclaude
|
# Generated by evaluclaude
|
||||||
|
#
|
||||||
|
# To populate this config from an EvalSpec:
|
||||||
|
# evaluclaude eval --spec <evalspec.json>
|
||||||
|
#
|
||||||
|
# Or run the full pipeline:
|
||||||
|
# evaluclaude analyze <path> -o spec.json
|
||||||
|
# evaluclaude render spec.json -o tests/generated
|
||||||
|
# evaluclaude eval --spec spec.json
|
||||||
|
|
||||||
description: "Evaluclaude functional test evaluations"
|
description: "Evaluclaude functional test evaluations"
|
||||||
|
|
||||||
|
|
@ -204,12 +334,13 @@ providers:
|
||||||
test_dir: ./tests/generated
|
test_dir: ./tests/generated
|
||||||
framework: pytest
|
framework: pytest
|
||||||
timeout: 300
|
timeout: 300
|
||||||
|
sandbox: true
|
||||||
|
|
||||||
prompts:
|
prompts:
|
||||||
- "{{scenario_id}}"
|
- "{{scenario_id}}"
|
||||||
|
|
||||||
tests:
|
tests:
|
||||||
- description: "Example test"
|
- description: "Example test - replace with real scenarios"
|
||||||
vars:
|
vars:
|
||||||
scenario_id: "test_example"
|
scenario_id: "test_example"
|
||||||
assert:
|
assert:
|
||||||
|
|
@ -219,12 +350,19 @@ tests:
|
||||||
result = json.loads(output)
|
result = json.loads(output)
|
||||||
result.get('passed', 0) > 0
|
result.get('passed', 0) > 0
|
||||||
|
|
||||||
|
# Default test configuration
|
||||||
|
defaultTest:
|
||||||
|
metadata:
|
||||||
|
evaluclaude: true
|
||||||
|
tracesDir: .evaluclaude/traces
|
||||||
|
|
||||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||||
`;
|
`;
|
||||||
|
|
||||||
mkdirSync(dirname(configPath), { recursive: true });
|
mkdirSync(dirname(configPath), { recursive: true });
|
||||||
writeFileSync(configPath, defaultConfig);
|
writeFileSync(configPath, defaultConfig);
|
||||||
|
|
||||||
|
// Also generate the provider
|
||||||
await generateTestProvider(providerPath);
|
await generateTestProvider(providerPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,5 +370,8 @@ function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
||||||
if (spec.repo.languages.includes('python')) {
|
if (spec.repo.languages.includes('python')) {
|
||||||
return 'pytest';
|
return 'pytest';
|
||||||
}
|
}
|
||||||
|
if (spec.repo.languages.includes('typescript') || spec.repo.languages.includes('javascript')) {
|
||||||
|
return 'vitest';
|
||||||
|
}
|
||||||
return 'vitest';
|
return 'vitest';
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/g
|
||||||
import { runCommand } from './commands/run.js';
|
import { runCommand } from './commands/run.js';
|
||||||
import { viewCommand, tracesCommand } from './commands/view.js';
|
import { viewCommand, tracesCommand } from './commands/view.js';
|
||||||
import { uiCommand, evalCommand } from './commands/ui.js';
|
import { uiCommand, evalCommand } from './commands/ui.js';
|
||||||
|
import { pipelineCommand } from './commands/pipeline.js';
|
||||||
|
|
||||||
const program = new Command();
|
const program = new Command();
|
||||||
|
|
||||||
|
|
@ -16,15 +17,25 @@ program
|
||||||
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.')
|
.description('Zero-to-evals in one command. Claude analyzes codebases and generates functional tests.')
|
||||||
.version('0.1.0');
|
.version('0.1.0');
|
||||||
|
|
||||||
|
// Core pipeline command - the "zero to evals" experience
|
||||||
|
program.addCommand(pipelineCommand);
|
||||||
|
|
||||||
|
// Individual step commands
|
||||||
program.addCommand(introCommand);
|
program.addCommand(introCommand);
|
||||||
program.addCommand(analyzeCommand);
|
program.addCommand(analyzeCommand);
|
||||||
program.addCommand(renderCommand);
|
program.addCommand(renderCommand);
|
||||||
|
program.addCommand(runCommand);
|
||||||
|
|
||||||
|
// Grading commands
|
||||||
program.addCommand(gradeCommand);
|
program.addCommand(gradeCommand);
|
||||||
program.addCommand(listRubricsCommand);
|
program.addCommand(listRubricsCommand);
|
||||||
program.addCommand(calibrateCommand);
|
program.addCommand(calibrateCommand);
|
||||||
program.addCommand(runCommand);
|
|
||||||
|
// Observability commands
|
||||||
program.addCommand(viewCommand);
|
program.addCommand(viewCommand);
|
||||||
program.addCommand(tracesCommand);
|
program.addCommand(tracesCommand);
|
||||||
|
|
||||||
|
// Promptfoo integration commands
|
||||||
program.addCommand(uiCommand);
|
program.addCommand(uiCommand);
|
||||||
program.addCommand(evalCommand);
|
program.addCommand(evalCommand);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}
|
||||||
lines.push('─'.repeat(40));
|
lines.push('─'.repeat(40));
|
||||||
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
|
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
|
||||||
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
|
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
|
||||||
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`);
|
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped ?? 0}`);
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
||||||
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
|
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import { writeFile, mkdir } from 'fs/promises';
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
import { dirname, join } from 'path';
|
import { dirname, join, resolve } from 'path';
|
||||||
import * as yaml from 'js-yaml';
|
import * as yaml from 'js-yaml';
|
||||||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
||||||
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
||||||
|
|
@ -9,6 +9,7 @@ export interface ConfigOptions {
|
||||||
outputPath: string;
|
outputPath: string;
|
||||||
framework: 'pytest' | 'vitest' | 'jest';
|
framework: 'pytest' | 'vitest' | 'jest';
|
||||||
includeTraceLinks: boolean;
|
includeTraceLinks: boolean;
|
||||||
|
providerPath?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generatePromptfooConfig(
|
export async function generatePromptfooConfig(
|
||||||
|
|
@ -30,16 +31,23 @@ export async function generatePromptfooConfig(
|
||||||
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||||
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
||||||
|
|
||||||
|
// Provider path should be relative to the config file location
|
||||||
|
// Since config is at .evaluclaude/promptfooconfig.yaml, the provider is at ./providers/test-runner.py
|
||||||
|
const providerRelativePath = options.providerPath
|
||||||
|
? options.providerPath.replace('.evaluclaude/', './').replace(/^\.evaluclaude\//, './')
|
||||||
|
: './providers/test-runner.py';
|
||||||
|
|
||||||
return {
|
return {
|
||||||
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
||||||
providers: [
|
providers: [
|
||||||
{
|
{
|
||||||
id: `file://providers/test-runner.py`,
|
id: `file://${providerRelativePath}`,
|
||||||
label: 'functional-tests',
|
label: 'functional-tests',
|
||||||
config: {
|
config: {
|
||||||
test_dir: options.testDir,
|
test_dir: resolve(options.testDir),
|
||||||
framework: options.framework,
|
framework: options.framework,
|
||||||
timeout: 300,
|
timeout: 300,
|
||||||
|
sandbox: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
@ -48,11 +56,12 @@ function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||||
defaultTest: options.includeTraceLinks
|
defaultTest: options.includeTraceLinks
|
||||||
? {
|
? {
|
||||||
metadata: {
|
metadata: {
|
||||||
traceFile: '.evaluclaude/traces/{{evalId}}.json',
|
evaluclaude: true,
|
||||||
|
tracesDir: './traces',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
: undefined,
|
: undefined,
|
||||||
outputPath: '.evaluclaude/results/promptfoo-results.json',
|
outputPath: './results/promptfoo-results.json',
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -147,91 +156,50 @@ function buildAssertion(assertion: any): PromptfooAssertion {
|
||||||
|
|
||||||
export async function generateTestProvider(outputPath: string): Promise<void> {
|
export async function generateTestProvider(outputPath: string): Promise<void> {
|
||||||
const providerCode = `#!/usr/bin/env python3
|
const providerCode = `#!/usr/bin/env python3
|
||||||
"""Promptfoo provider that executes tests and returns structured results."""
|
"""
|
||||||
|
Promptfoo provider that executes tests and returns structured results.
|
||||||
|
|
||||||
|
This provider integrates with evaluclaude-harness test runners to execute
|
||||||
|
functional tests in a sandboxed environment and return results compatible
|
||||||
|
with Promptfoo's assertion system.
|
||||||
|
"""
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
|
||||||
|
def call_api(prompt: str, options: dict, context: dict) -> dict:
|
||||||
"""Runs tests and returns structured results."""
|
"""Runs tests and returns structured results."""
|
||||||
|
|
||||||
test_dir = options.get('config', {}).get('test_dir', './tests')
|
config = options.get('config', {})
|
||||||
framework = options.get('config', {}).get('framework', 'pytest')
|
test_dir = config.get('test_dir', './tests/generated')
|
||||||
timeout = options.get('config', {}).get('timeout', 300)
|
framework = config.get('framework', 'pytest')
|
||||||
|
timeout = config.get('timeout', 300)
|
||||||
|
sandbox = config.get('sandbox', True)
|
||||||
|
|
||||||
scenario_id = prompt.strip()
|
scenario_id = prompt.strip()
|
||||||
|
eval_id = f"eval-{uuid.uuid4().hex[:8]}"
|
||||||
|
|
||||||
|
# Ensure traces directory exists
|
||||||
|
traces_dir = Path('.evaluclaude/traces')
|
||||||
|
traces_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if framework == 'pytest':
|
if framework == 'pytest':
|
||||||
result = subprocess.run(
|
output = run_pytest(test_dir, scenario_id, timeout, eval_id)
|
||||||
[
|
|
||||||
'python', '-m', 'pytest',
|
|
||||||
'--json-report',
|
|
||||||
'--json-report-file=/tmp/pytest_results.json',
|
|
||||||
'-k', scenario_id,
|
|
||||||
test_dir
|
|
||||||
],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=timeout
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open('/tmp/pytest_results.json') as f:
|
|
||||||
report = json.load(f)
|
|
||||||
|
|
||||||
output = {
|
|
||||||
'passed': report.get('summary', {}).get('passed', 0),
|
|
||||||
'failed': report.get('summary', {}).get('failed', 0),
|
|
||||||
'skipped': report.get('summary', {}).get('skipped', 0),
|
|
||||||
'tests': report.get('tests', []),
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
'exit_code': result.returncode,
|
|
||||||
}
|
|
||||||
except FileNotFoundError:
|
|
||||||
output = {
|
|
||||||
'passed': 0,
|
|
||||||
'failed': 1,
|
|
||||||
'error': 'Failed to generate pytest report',
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
}
|
|
||||||
|
|
||||||
elif framework in ('vitest', 'jest'):
|
elif framework in ('vitest', 'jest'):
|
||||||
cmd = ['npx', framework, 'run', '--reporter=json']
|
output = run_js_tests(test_dir, scenario_id, timeout, framework, eval_id)
|
||||||
if scenario_id:
|
|
||||||
cmd.extend(['--testNamePattern', scenario_id])
|
|
||||||
cmd.append(test_dir)
|
|
||||||
|
|
||||||
result = subprocess.run(
|
|
||||||
cmd,
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=timeout
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
report = json.loads(result.stdout)
|
|
||||||
output = {
|
|
||||||
'passed': report.get('numPassedTests', 0),
|
|
||||||
'failed': report.get('numFailedTests', 0),
|
|
||||||
'skipped': report.get('numSkippedTests', 0),
|
|
||||||
'tests': report.get('testResults', []),
|
|
||||||
'exit_code': result.returncode,
|
|
||||||
}
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
output = {
|
|
||||||
'passed': 0,
|
|
||||||
'failed': 1,
|
|
||||||
'error': 'Failed to parse test output',
|
|
||||||
'stdout': result.stdout,
|
|
||||||
'stderr': result.stderr,
|
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
output = {'error': f'Unknown framework: {framework}'}
|
output = {'error': f'Unknown framework: {framework}', 'passed': 0, 'failed': 1}
|
||||||
|
|
||||||
|
# Add trace reference
|
||||||
|
output['eval_id'] = eval_id
|
||||||
|
output['trace_file'] = str(traces_dir / f"{eval_id}.json")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'output': json.dumps(output),
|
'output': json.dumps(output),
|
||||||
|
|
@ -240,32 +208,187 @@ def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return {
|
return {
|
||||||
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
|
'output': json.dumps({
|
||||||
|
'error': 'Test execution timed out',
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'eval_id': eval_id,
|
||||||
|
}),
|
||||||
'error': None,
|
'error': None,
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {
|
return {
|
||||||
'output': None,
|
'output': json.dumps({
|
||||||
|
'error': str(e),
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'eval_id': eval_id,
|
||||||
|
}),
|
||||||
'error': str(e),
|
'error': str(e),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run_pytest(test_dir: str, scenario_id: str, timeout: int, eval_id: str) -> dict:
|
||||||
|
"""Run pytest and return structured results."""
|
||||||
|
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
|
||||||
|
report_file = f.name
|
||||||
|
|
||||||
|
cmd = [
|
||||||
|
sys.executable, '-m', 'pytest',
|
||||||
|
'--json-report',
|
||||||
|
f'--json-report-file={report_file}',
|
||||||
|
'-v',
|
||||||
|
'--tb=short',
|
||||||
|
]
|
||||||
|
|
||||||
|
if scenario_id:
|
||||||
|
cmd.extend(['-k', scenario_id])
|
||||||
|
|
||||||
|
cmd.append(test_dir)
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=timeout,
|
||||||
|
cwd=os.getcwd(),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(report_file) as f:
|
||||||
|
report = json.load(f)
|
||||||
|
|
||||||
|
summary = report.get('summary', {})
|
||||||
|
tests = report.get('tests', [])
|
||||||
|
|
||||||
|
output = {
|
||||||
|
'passed': summary.get('passed', 0),
|
||||||
|
'failed': summary.get('failed', 0),
|
||||||
|
'skipped': summary.get('skipped', 0),
|
||||||
|
'total': summary.get('total', 0),
|
||||||
|
'duration': report.get('duration', 0) * 1000, # Convert to ms
|
||||||
|
'tests': [
|
||||||
|
{
|
||||||
|
'id': extract_scenario_id(t.get('nodeid', '')),
|
||||||
|
'name': t.get('nodeid', ''),
|
||||||
|
'status': t.get('outcome', 'unknown'),
|
||||||
|
'duration': (t.get('call', {}).get('duration', 0) or 0) * 1000,
|
||||||
|
'error': t.get('call', {}).get('crash', {}).get('message') if t.get('call', {}).get('crash') else None,
|
||||||
|
}
|
||||||
|
for t in tests
|
||||||
|
],
|
||||||
|
'exit_code': result.returncode,
|
||||||
|
}
|
||||||
|
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||||
|
output = {
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'error': f'Failed to parse pytest report: {e}',
|
||||||
|
'stdout': result.stdout[-2000:] if result.stdout else '',
|
||||||
|
'stderr': result.stderr[-2000:] if result.stderr else '',
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(report_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def run_js_tests(test_dir: str, scenario_id: str, timeout: int, framework: str, eval_id: str) -> dict:
|
||||||
|
"""Run vitest/jest and return structured results."""
|
||||||
|
with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
|
||||||
|
report_file = f.name
|
||||||
|
|
||||||
|
cmd = ['npx', framework, 'run', '--reporter=json', f'--outputFile={report_file}']
|
||||||
|
|
||||||
|
if scenario_id:
|
||||||
|
cmd.extend(['--testNamePattern', scenario_id])
|
||||||
|
|
||||||
|
cmd.append(test_dir)
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=timeout,
|
||||||
|
cwd=os.getcwd(),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(report_file) as f:
|
||||||
|
report = json.load(f)
|
||||||
|
|
||||||
|
output = {
|
||||||
|
'passed': report.get('numPassedTests', 0),
|
||||||
|
'failed': report.get('numFailedTests', 0),
|
||||||
|
'skipped': report.get('numSkippedTests', 0),
|
||||||
|
'total': report.get('numTotalTests', 0),
|
||||||
|
'tests': [],
|
||||||
|
'exit_code': result.returncode,
|
||||||
|
}
|
||||||
|
|
||||||
|
for test_file in report.get('testResults', []):
|
||||||
|
for assertion in test_file.get('assertionResults', []):
|
||||||
|
output['tests'].append({
|
||||||
|
'id': extract_scenario_id(assertion.get('fullName', '')),
|
||||||
|
'name': assertion.get('fullName', ''),
|
||||||
|
'status': assertion.get('status', 'unknown'),
|
||||||
|
'duration': assertion.get('duration', 0),
|
||||||
|
'error': assertion.get('failureMessages', [None])[0] if assertion.get('failureMessages') else None,
|
||||||
|
})
|
||||||
|
|
||||||
|
except (FileNotFoundError, json.JSONDecodeError) as e:
|
||||||
|
output = {
|
||||||
|
'passed': 0,
|
||||||
|
'failed': 1,
|
||||||
|
'error': f'Failed to parse {framework} report: {e}',
|
||||||
|
'stdout': result.stdout[-2000:] if result.stdout else '',
|
||||||
|
'stderr': result.stderr[-2000:] if result.stderr else '',
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(report_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def extract_scenario_id(nodeid: str) -> str:
|
||||||
|
"""Extract scenario ID from test name."""
|
||||||
|
import re
|
||||||
|
match = re.search(r'test[_\\s]([a-zA-Z0-9_-]+)', nodeid, re.IGNORECASE)
|
||||||
|
return match.group(1) if match else nodeid.replace(' ', '_')
|
||||||
|
|
||||||
|
|
||||||
|
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||||
|
"""Alias for call_api for backwards compatibility."""
|
||||||
|
return call_api(prompt, options, context)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# For testing the provider directly
|
|
||||||
import argparse
|
import argparse
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser(description='Run tests for Promptfoo')
|
||||||
parser.add_argument('--scenario', default='')
|
parser.add_argument('--scenario', default='', help='Scenario ID to filter')
|
||||||
parser.add_argument('--test-dir', default='./tests')
|
parser.add_argument('--test-dir', default='./tests/generated', help='Test directory')
|
||||||
parser.add_argument('--framework', default='pytest')
|
parser.add_argument('--framework', default='pytest', help='Test framework')
|
||||||
|
parser.add_argument('--timeout', type=int, default=300, help='Timeout in seconds')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
result = get_provider_response(
|
result = call_api(
|
||||||
args.scenario,
|
args.scenario,
|
||||||
{'config': {'test_dir': args.test_dir, 'framework': args.framework}},
|
{'config': {
|
||||||
|
'test_dir': args.test_dir,
|
||||||
|
'framework': args.framework,
|
||||||
|
'timeout': args.timeout,
|
||||||
|
}},
|
||||||
{}
|
{}
|
||||||
)
|
)
|
||||||
print(json.dumps(result, indent=2))
|
print(json.dumps(json.loads(result['output']), indent=2) if result['output'] else result['error'])
|
||||||
`;
|
`;
|
||||||
|
|
||||||
await mkdir(dirname(outputPath), { recursive: true });
|
await mkdir(dirname(outputPath), { recursive: true });
|
||||||
await writeFile(outputPath, providerCode);
|
await writeFile(outputPath, providerCode, { mode: 0o755 });
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,13 @@
|
||||||
export * from './types.js';
|
export * from './types.js';
|
||||||
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js';
|
export { generatePromptfooConfig, generateTestProvider, type ConfigOptions } from './config-generator.js';
|
||||||
|
export {
|
||||||
|
runTestsForPromptfoo,
|
||||||
|
savePromptfooResults,
|
||||||
|
type RunTestsForPromptfooOptions,
|
||||||
|
type PromptfooProviderResult,
|
||||||
|
} from './runner-bridge.js';
|
||||||
|
export {
|
||||||
|
exportToPromptfooFormat,
|
||||||
|
generateViewOnlyConfig,
|
||||||
|
type ExportOptions,
|
||||||
|
} from './results-exporter.js';
|
||||||
|
|
|
||||||
127
src/promptfoo/results-exporter.ts
Normal file
127
src/promptfoo/results-exporter.ts
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
/**
|
||||||
|
* Export test execution results to Promptfoo format for viewing in the UI.
|
||||||
|
*
|
||||||
|
* Instead of using Promptfoo to run tests (which requires a provider that
|
||||||
|
* responds quickly), we run tests ourselves and export results to Promptfoo's
|
||||||
|
* result format. This allows us to use Promptfoo's excellent visualization UI.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
|
import { join } from 'path';
|
||||||
|
import type { ExecutionResult } from '../runners/types.js';
|
||||||
|
import type { EvalSpec } from '../analyzer/types.js';
|
||||||
|
import type { PromptfooResult, PromptfooTestResult } from './types.js';
|
||||||
|
|
||||||
|
export interface ExportOptions {
|
||||||
|
outputDir: string;
|
||||||
|
evalId?: string;
|
||||||
|
includeSpec?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Export ExecutionResult to Promptfoo result format.
|
||||||
|
*/
|
||||||
|
export async function exportToPromptfooFormat(
|
||||||
|
result: ExecutionResult,
|
||||||
|
spec: EvalSpec | undefined,
|
||||||
|
options: ExportOptions
|
||||||
|
): Promise<string> {
|
||||||
|
const { outputDir, evalId = `eval-${Date.now()}` } = options;
|
||||||
|
|
||||||
|
const promptfooResult = buildPromptfooResult(result, spec, evalId);
|
||||||
|
|
||||||
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
const outputPath = join(outputDir, `${evalId}.json`);
|
||||||
|
await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
// Also write the latest.json symlink equivalent
|
||||||
|
const latestPath = join(outputDir, 'latest.json');
|
||||||
|
await writeFile(latestPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
return outputPath;
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildPromptfooResult(
|
||||||
|
result: ExecutionResult,
|
||||||
|
spec: EvalSpec | undefined,
|
||||||
|
evalId: string
|
||||||
|
): PromptfooResult {
|
||||||
|
const testResults: PromptfooTestResult[] = result.tests.map(test => {
|
||||||
|
// Try to find matching scenario from spec
|
||||||
|
const scenario = spec?.scenarios.find(s =>
|
||||||
|
s.id === test.id || test.name.includes(s.id)
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
prompt: {
|
||||||
|
raw: scenario?.id || test.id,
|
||||||
|
label: scenario?.name || test.name,
|
||||||
|
},
|
||||||
|
vars: {
|
||||||
|
scenario_id: scenario?.id || test.id,
|
||||||
|
target_module: scenario?.target.module || '',
|
||||||
|
target_function: scenario?.target.function || '',
|
||||||
|
description: scenario?.description || test.name,
|
||||||
|
},
|
||||||
|
response: {
|
||||||
|
output: test.status === 'passed'
|
||||||
|
? 'Test passed successfully'
|
||||||
|
: test.error?.message || 'Test failed',
|
||||||
|
},
|
||||||
|
gradingResult: {
|
||||||
|
pass: test.status === 'passed',
|
||||||
|
score: test.status === 'passed' ? 1 : 0,
|
||||||
|
reason: test.status === 'passed'
|
||||||
|
? 'All assertions passed'
|
||||||
|
: test.error?.message || 'Test failed',
|
||||||
|
componentResults: test.assertions.details.map(a => ({
|
||||||
|
pass: a.passed,
|
||||||
|
score: a.passed ? 1 : 0,
|
||||||
|
reason: a.description,
|
||||||
|
assertion: {
|
||||||
|
type: 'custom',
|
||||||
|
value: a.description,
|
||||||
|
},
|
||||||
|
})),
|
||||||
|
},
|
||||||
|
success: test.status === 'passed',
|
||||||
|
error: test.error?.message,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
version: 1,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
results: testResults,
|
||||||
|
stats: {
|
||||||
|
successes: result.summary.passed,
|
||||||
|
failures: result.summary.failed,
|
||||||
|
tokenUsage: {
|
||||||
|
total: 0,
|
||||||
|
prompt: 0,
|
||||||
|
completion: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a minimal Promptfoo config that just views results (no provider).
|
||||||
|
*/
|
||||||
|
export function generateViewOnlyConfig(spec: EvalSpec): string {
|
||||||
|
return `# Evaluclaude Results Config
|
||||||
|
# This config is for viewing results only - tests are run via evaluclaude run
|
||||||
|
|
||||||
|
description: "Test results for ${spec.repo.name}"
|
||||||
|
|
||||||
|
# No providers needed - we pre-run tests and import results
|
||||||
|
providers: []
|
||||||
|
|
||||||
|
prompts: []
|
||||||
|
|
||||||
|
tests: []
|
||||||
|
|
||||||
|
# Results are stored here by evaluclaude run --export-promptfoo
|
||||||
|
outputPath: .evaluclaude/results/latest.json
|
||||||
|
`;
|
||||||
|
}
|
||||||
194
src/promptfoo/runner-bridge.ts
Normal file
194
src/promptfoo/runner-bridge.ts
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
/**
|
||||||
|
* Bridge between our test runners and Promptfoo's provider interface.
|
||||||
|
*
|
||||||
|
* This module provides a unified way to run tests that works both:
|
||||||
|
* 1. Standalone via our `run` command
|
||||||
|
* 2. As a Promptfoo provider via the generated test-runner.py
|
||||||
|
*
|
||||||
|
* Results are stored in a format compatible with Promptfoo's expectations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { writeFile, mkdir } from 'fs/promises';
|
||||||
|
import { join, dirname } from 'path';
|
||||||
|
import { runTests, type ExecutionResult, type ExecutionOptions, DEFAULT_SANDBOX_CONFIG } from '../runners/index.js';
|
||||||
|
import { createTracer, saveTrace, type EvalTrace } from '../observability/index.js';
|
||||||
|
|
||||||
|
export interface PromptfooProviderResult {
|
||||||
|
output: string;
|
||||||
|
error: string | null;
|
||||||
|
tokenUsage?: {
|
||||||
|
total: number;
|
||||||
|
prompt: number;
|
||||||
|
completion: number;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface RunTestsForPromptfooOptions {
|
||||||
|
scenarioId: string;
|
||||||
|
testDir: string;
|
||||||
|
framework: 'pytest' | 'vitest' | 'jest';
|
||||||
|
timeout?: number;
|
||||||
|
sandbox?: boolean;
|
||||||
|
evalId?: string;
|
||||||
|
recordTrace?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run tests for a specific scenario and format results for Promptfoo.
|
||||||
|
*/
|
||||||
|
export async function runTestsForPromptfoo(
|
||||||
|
options: RunTestsForPromptfooOptions
|
||||||
|
): Promise<PromptfooProviderResult> {
|
||||||
|
const {
|
||||||
|
scenarioId,
|
||||||
|
testDir,
|
||||||
|
framework,
|
||||||
|
timeout = 300000,
|
||||||
|
sandbox = true,
|
||||||
|
evalId = `eval-${Date.now()}`,
|
||||||
|
recordTrace = true,
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
const tracer = recordTrace ? createTracer(evalId) : null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const execOptions: ExecutionOptions = {
|
||||||
|
framework,
|
||||||
|
sandbox,
|
||||||
|
timeout,
|
||||||
|
parallel: false,
|
||||||
|
filter: scenarioId ? [scenarioId] : undefined,
|
||||||
|
cwd: process.cwd(),
|
||||||
|
};
|
||||||
|
|
||||||
|
tracer?.recordIntrospection({
|
||||||
|
filesAnalyzed: [testDir],
|
||||||
|
duration: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await runTests(
|
||||||
|
testDir,
|
||||||
|
execOptions,
|
||||||
|
sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
|
||||||
|
);
|
||||||
|
|
||||||
|
// Record execution results in trace
|
||||||
|
if (tracer) {
|
||||||
|
tracer.recordExecution({
|
||||||
|
testsPassed: result.summary.passed,
|
||||||
|
testsFailed: result.summary.failed,
|
||||||
|
testsSkipped: result.summary.skipped,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (const test of result.tests) {
|
||||||
|
if (test.status === 'failed' || test.status === 'error') {
|
||||||
|
tracer.recordTestFailure({
|
||||||
|
scenarioId: test.id,
|
||||||
|
testName: test.name,
|
||||||
|
error: test.error?.message || 'Unknown error',
|
||||||
|
stack: test.error?.stack,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build Promptfoo-compatible output
|
||||||
|
const promptfooOutput = buildPromptfooOutput(result, scenarioId);
|
||||||
|
|
||||||
|
// Save trace if enabled
|
||||||
|
if (tracer) {
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
await saveTrace(trace);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
output: JSON.stringify(promptfooOutput),
|
||||||
|
error: null,
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
if (tracer) {
|
||||||
|
tracer.recordError(error instanceof Error ? error : new Error(String(error)));
|
||||||
|
const trace = tracer.finalize();
|
||||||
|
await saveTrace(trace);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
output: JSON.stringify({
|
||||||
|
passed: 0,
|
||||||
|
failed: 1,
|
||||||
|
error: error instanceof Error ? error.message : String(error),
|
||||||
|
}),
|
||||||
|
error: error instanceof Error ? error.message : String(error),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build Promptfoo-compatible output from ExecutionResult.
|
||||||
|
*/
|
||||||
|
function buildPromptfooOutput(
|
||||||
|
result: ExecutionResult,
|
||||||
|
scenarioId?: string
|
||||||
|
): Record<string, unknown> {
|
||||||
|
const matchingTests = scenarioId
|
||||||
|
? result.tests.filter(t => t.id === scenarioId || t.name.includes(scenarioId))
|
||||||
|
: result.tests;
|
||||||
|
|
||||||
|
return {
|
||||||
|
passed: matchingTests.filter(t => t.status === 'passed').length,
|
||||||
|
failed: matchingTests.filter(t => t.status === 'failed' || t.status === 'error').length,
|
||||||
|
skipped: matchingTests.filter(t => t.status === 'skipped').length,
|
||||||
|
total: matchingTests.length,
|
||||||
|
tests: matchingTests.map(t => ({
|
||||||
|
id: t.id,
|
||||||
|
name: t.name,
|
||||||
|
status: t.status,
|
||||||
|
duration: t.duration,
|
||||||
|
error: t.error?.message,
|
||||||
|
})),
|
||||||
|
summary: {
|
||||||
|
...result.summary,
|
||||||
|
matchedScenario: scenarioId,
|
||||||
|
},
|
||||||
|
errors: result.errors,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a Promptfoo-compatible results file from our execution results.
|
||||||
|
*/
|
||||||
|
export async function savePromptfooResults(
|
||||||
|
result: ExecutionResult,
|
||||||
|
evalId: string,
|
||||||
|
outputDir: string = '.evaluclaude/results'
|
||||||
|
): Promise<string> {
|
||||||
|
const promptfooResult = {
|
||||||
|
version: 1,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
evalId,
|
||||||
|
results: result.tests.map(t => ({
|
||||||
|
prompt: { raw: t.id, label: t.name },
|
||||||
|
vars: { scenario_id: t.id },
|
||||||
|
response: {
|
||||||
|
output: t.status === 'passed' ? 'PASS' : t.error?.message || 'FAIL',
|
||||||
|
},
|
||||||
|
gradingResult: {
|
||||||
|
pass: t.status === 'passed',
|
||||||
|
score: t.status === 'passed' ? 1 : 0,
|
||||||
|
reason: t.error?.message || (t.status === 'passed' ? 'Test passed' : 'Test failed'),
|
||||||
|
},
|
||||||
|
success: t.status === 'passed',
|
||||||
|
error: t.error?.message,
|
||||||
|
})),
|
||||||
|
stats: {
|
||||||
|
successes: result.summary.passed,
|
||||||
|
failures: result.summary.failed,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
const outputPath = join(outputDir, `promptfoo-${evalId}.json`);
|
||||||
|
await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));
|
||||||
|
|
||||||
|
return outputPath;
|
||||||
|
}
|
||||||
|
|
@ -73,8 +73,8 @@ export function formatResults(result: ExecutionResult): string {
|
||||||
lines.push(` Total: ${result.summary.total}`);
|
lines.push(` Total: ${result.summary.total}`);
|
||||||
lines.push(` ✅ Passed: ${result.summary.passed}`);
|
lines.push(` ✅ Passed: ${result.summary.passed}`);
|
||||||
lines.push(` ❌ Failed: ${result.summary.failed}`);
|
lines.push(` ❌ Failed: ${result.summary.failed}`);
|
||||||
lines.push(` ⏭️ Skipped: ${result.summary.skipped}`);
|
lines.push(` ⏭️ Skipped: ${result.summary.skipped ?? 0}`);
|
||||||
lines.push(` ⏱️ Duration: ${result.summary.duration}ms`);
|
lines.push(` ⏱️ Duration: ${result.summary.duration || 0}ms`);
|
||||||
|
|
||||||
if (result.errors.length > 0) {
|
if (result.errors.length > 0) {
|
||||||
lines.push('');
|
lines.push('');
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue