promptfoo ui and testcon

Harivansh Rathi 2026-01-11 18:28:03 -05:00
parent e0c36241b0
commit 6698c12e5b
18 changed files with 2169 additions and 0 deletions

src/cli/commands/run.ts (new file, 145 lines)

@@ -0,0 +1,145 @@
import { Command } from 'commander';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { dirname } from 'path';
import {
runTests,
formatResults,
detectTestFramework,
type TestFramework,
type ExecutionOptions,
DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import type { EvalSpec } from '../../analyzer/types.js';
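/**
 * `evaluclaude run` — executes previously generated tests, prints a summary,
 * optionally maps results back to EvalSpec scenarios, and records a trace.
 */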
export const runCommand = new Command('run')
.description('Run generated tests and collect results')
.argument('[test-dir]', 'Directory containing test files', './tests/generated')
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
.option('--sandbox', 'Run tests in sandbox mode', true)
.option('--no-sandbox', 'Disable sandbox mode')
.option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
.option('-p, --parallel', 'Run tests in parallel', false)
.option('--filter <patterns...>', 'Run only tests matching patterns')
.option('-o, --output <file>', 'Output results to JSON file')
.option('--trace', 'Record execution trace', true)
.option('--no-trace', 'Disable execution tracing')
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
.action(async (testDir: string, options) => {
try {
console.log(`\n🧪 Running tests from ${testDir}...\n`);
if (!existsSync(testDir)) {
console.error(`Error: Test directory not found: ${testDir}`);
process.exit(1);
}
const framework: TestFramework = options.framework || detectTestFramework(testDir);
console.log(` Framework: ${framework}`);
console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
console.log(` Timeout: ${options.timeout}ms`);
let spec: EvalSpec | undefined;
if (options.spec && existsSync(options.spec)) {
spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
}
const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
const execOptions: ExecutionOptions = {
framework,
sandbox: options.sandbox,
timeout: parseInt(options.timeout, 10),
parallel: options.parallel,
filter: options.filter,
cwd: process.cwd(),
};
if (tracer) {
tracer.recordIntrospection({
filesAnalyzed: [testDir],
duration: 0,
});
}
console.log('\n Running tests...\n');
const startTime = Date.now();
const result = await runTests(
testDir,
execOptions,
options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
);
if (tracer) {
tracer.recordExecution({
testsPassed: result.summary.passed,
testsFailed: result.summary.failed,
testsSkipped: result.summary.skipped,
});
for (const test of result.tests) {
if (test.status === 'failed' || test.status === 'error') {
tracer.recordTestFailure({
scenarioId: test.id,
testName: test.name,
error: test.error?.message || 'Unknown error',
stack: test.error?.stack,
});
}
}
}
console.log(formatResults(result));
if (spec) {
const mappedResults = mapResultsToScenarios(result, spec);
console.log(`\n📊 Scenario Coverage:`);
console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
console.log(` Unmapped: ${mappedResults.unmapped}`);
}
if (options.output) {
mkdirSync(dirname(options.output), { recursive: true });
writeFileSync(options.output, JSON.stringify(result, null, 2));
console.log(`\n📁 Results saved to: ${options.output}`);
}
if (tracer) {
const trace = tracer.finalize();
const tracePath = await saveTrace(trace);
console.log(`\n📊 Trace saved: ${tracePath}`);
console.log(` View with: evaluclaude view ${trace.id}`);
}
if (result.summary.failed > 0) {
process.exit(1);
}
} catch (error) {
console.error('Error running tests:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
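/**
 * Counts how many test results match a scenario id in the EvalSpec
 * (matched by test id) versus how many have no corresponding scenario.
 */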
function mapResultsToScenarios(
result: Awaited<ReturnType<typeof runTests>>,
spec: EvalSpec
): { covered: number; unmapped: number } {
const scenarioIds = new Set(spec.scenarios.map(s => s.id));
let covered = 0;
let unmapped = 0;
for (const test of result.tests) {
if (scenarioIds.has(test.id)) {
covered++;
} else {
unmapped++;
}
}
return { covered, unmapped };
}

src/cli/commands/ui.ts (new file, 236 lines)

@@ -0,0 +1,236 @@
import { Command } from 'commander';
import { spawn, type ChildProcess } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
const EVALUCLAUDE_DIR = '.evaluclaude';
const CONFIG_FILE = 'promptfooconfig.yaml';
const PROVIDERS_DIR = 'providers';
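/**
 * `evaluclaude ui` — optionally regenerates the Promptfoo config from an EvalSpec,
 * falls back to a default config if none exists, then launches the Promptfoo web UI.
 */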
export const uiCommand = new Command('ui')
.description('Launch the evaluation dashboard UI')
.option('-p, --port <port>', 'Port to run the UI on', '3000')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('--generate', 'Regenerate Promptfoo config from spec')
.option('--no-open', 'Do not auto-open browser')
.action(async (options) => {
try {
const port = parseInt(options.port, 10);
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...');
if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`);
process.exit(1);
}
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`);
console.log(` Provider: ${providerPath}`);
}
if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.');
console.log(' Run with --spec <file> --generate to create one.\n');
console.log(' Or create one manually:');
console.log(` ${configPath}\n`);
await createDefaultConfig(configPath, providerPath);
console.log(` Created default config at ${configPath}`);
}
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
console.log(` Config: ${configPath}\n`);
await launchPromptfooUI(port, configPath, options.open);
} catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
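/**
 * `evaluclaude eval` — runs Promptfoo evaluations against the generated config
 * and optionally opens the UI to browse the results.
 */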
export const evalCommand = new Command('eval')
.description('Run evaluations with Promptfoo and optionally launch UI')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
.option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000')
.action(async (options) => {
try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...');
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
await generateTestProvider(providerPath);
}
if (!existsSync(configPath)) {
console.error(`Error: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec.');
process.exit(1);
}
console.log('\n🧪 Running Promptfoo evaluations...\n');
const outputFile = join(options.output, `eval-${Date.now()}.json`);
mkdirSync(dirname(outputFile), { recursive: true });
await runPromptfooEval(configPath, outputFile);
console.log(`\n📁 Results saved: ${outputFile}`);
if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`);
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
}
} catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
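/**
 * Spawns `npx promptfoo view` on the given port, pointing it at the directory
 * that contains the config. Resolves when the process exits cleanly and
 * forwards Ctrl-C to the child process.
 */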
async function launchPromptfooUI(
port: number,
configPath: string,
openBrowser: boolean
): Promise<void> {
return new Promise((resolve, reject) => {
const args = ['promptfoo', 'view', '--port', String(port)];
if (openBrowser) {
args.push('-y');
} else {
args.push('-n');
}
const configDir = dirname(configPath);
args.push(configDir);
console.log(` Running: npx ${args.join(' ')}\n`);
const child = spawn('npx', args, {
stdio: 'inherit',
env: { ...process.env },
});
child.on('error', (error) => {
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
reject(new Error('promptfoo is not installed'));
} else {
reject(error);
}
});
child.on('close', (code) => {
if (code === 0) {
resolve();
} else {
reject(new Error(`Promptfoo exited with code ${code}`));
}
});
process.on('SIGINT', () => {
child.kill('SIGINT');
process.exit(0);
});
});
}
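/**
 * Spawns `npx promptfoo eval` with the given config, writes results to
 * outputFile, and rejects if the process exits with a non-zero code.
 */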
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
return new Promise((resolve, reject) => {
const args = [
'promptfoo',
'eval',
'-c', configPath,
'-o', outputFile,
'--no-cache',
];
console.log(` Running: npx ${args.join(' ')}\n`);
const child = spawn('npx', args, {
stdio: 'inherit',
env: { ...process.env },
});
child.on('error', reject);
child.on('close', (code) => {
if (code === 0) {
resolve();
} else {
reject(new Error(`Promptfoo eval exited with code ${code}`));
}
});
});
}
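/**
 * Writes a minimal placeholder promptfooconfig.yaml (one example test wired to
 * the functional-tests provider) and generates the provider script next to it.
 */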
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
const defaultConfig = `# Evaluclaude Promptfoo Configuration
# Generated by evaluclaude
description: "Evaluclaude functional test evaluations"
providers:
  - id: file://${providerPath}
    label: functional-tests
    config:
      test_dir: ./tests/generated
      framework: pytest
      timeout: 300
prompts:
  - "{{scenario_id}}"
tests:
  - description: "Example test"
    vars:
      scenario_id: "test_example"
    assert:
      - type: python
        value: |
          import json
          result = json.loads(output)
          result.get('passed', 0) > 0
outputPath: .evaluclaude/results/promptfoo-results.json
`;
mkdirSync(dirname(configPath), { recursive: true });
writeFileSync(configPath, defaultConfig);
await generateTestProvider(providerPath);
}
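/** Picks pytest for Python repos; defaults to vitest otherwise. */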
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
if (spec.repo.languages.includes('python')) {
return 'pytest';
}
return 'vitest';
}

src/cli/commands/view.ts (new file, 90 lines)

@@ -0,0 +1,90 @@
import { Command } from 'commander';
import {
loadTrace,
listTraces,
getLatestTrace,
formatTrace,
formatTraceList
} from '../../observability/index.js';
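/**
 * `evaluclaude view` — displays a single trace (by id or the most recent one)
 * or, with --list, a table of stored traces.
 */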
export const viewCommand = new Command('view')
.description('View evaluation traces')
.argument('[trace-id]', 'Specific trace ID to view')
.option('--last', 'View the most recent trace')
.option('--list', 'List all traces')
.option('--json', 'Output as raw JSON')
.option('-v, --verbose', 'Show verbose output including tool calls')
.option('--tools', 'Show tool call details')
.option('--questions', 'Show questions asked', true)
.option('--decisions', 'Show decisions made', true)
.option('-n, --limit <count>', 'Limit number of traces listed', '20')
.option('--eval <eval-id>', 'Filter traces by eval ID')
.action(async (traceId: string | undefined, options) => {
try {
if (options.list) {
const traces = await listTraces(options.eval);
const limited = traces.slice(0, parseInt(options.limit, 10));
if (traces.length === 0) {
console.log('\nNo traces found.');
console.log('Run `evaluclaude run` to generate traces.\n');
return;
}
console.log(formatTraceList(limited));
if (traces.length > limited.length) {
console.log(`Showing ${limited.length} of ${traces.length} traces.`);
console.log(`Use --limit to see more.\n`);
}
return;
}
let trace;
if (options.last || !traceId) {
trace = await getLatestTrace();
if (!trace) {
console.log('\nNo traces found.');
console.log('Run `evaluclaude run` to generate traces.\n');
return;
}
} else {
trace = await loadTrace(traceId);
if (!trace) {
console.error(`\nTrace not found: ${traceId}`);
console.log('Use `evaluclaude view --list` to see available traces.\n');
process.exit(1);
}
}
const output = formatTrace(trace, {
json: options.json,
verbose: options.verbose,
showToolCalls: options.tools || options.verbose,
showQuestions: options.questions,
showDecisions: options.decisions,
});
console.log(output);
} catch (error) {
console.error('Error viewing trace:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
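/** `evaluclaude traces` — shorthand for `evaluclaude view --list`. */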
export const tracesCommand = new Command('traces')
.description('List all evaluation traces (alias for view --list)')
.option('-n, --limit <count>', 'Limit number of traces', '20')
.option('--eval <eval-id>', 'Filter by eval ID')
.action(async (options) => {
const traces = await listTraces(options.eval);
const limited = traces.slice(0, parseInt(options.limit, 10));
if (traces.length === 0) {
console.log('\nNo traces found.');
return;
}
console.log(formatTraceList(limited));
});