promptfoo ui and testcon

This commit is contained in:
Harivansh Rathi 2026-01-11 18:28:03 -05:00
parent e0c36241b0
commit 6698c12e5b
18 changed files with 2169 additions and 0 deletions

145
src/cli/commands/run.ts Normal file
View file

@ -0,0 +1,145 @@
import { Command } from 'commander';
import { existsSync, readFileSync } from 'fs';
import { join } from 'path';
import {
runTests,
formatResults,
detectTestFramework,
type TestFramework,
type ExecutionOptions,
DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import type { EvalSpec } from '../../analyzer/types.js';
/**
 * `evaluclaude run` — execute generated tests and collect results.
 *
 * Pipeline: validate test dir → detect framework → optionally load an
 * EvalSpec for scenario mapping → run the tests (sandboxed by default) →
 * print results → write optional JSON output → persist an execution trace.
 * Exits with code 1 on any error or when any test fails.
 */
export const runCommand = new Command('run')
  .description('Run generated tests and collect results')
  .argument('[test-dir]', 'Directory containing test files', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
  .option('--sandbox', 'Run tests in sandbox mode', true)
  .option('--no-sandbox', 'Disable sandbox mode')
  .option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
  .option('-p, --parallel', 'Run tests in parallel', false)
  .option('--filter <patterns...>', 'Run only tests matching patterns')
  .option('-o, --output <file>', 'Output results to JSON file')
  .option('--trace', 'Record execution trace', true)
  .option('--no-trace', 'Disable execution tracing')
  .option('-w, --watch', 'Watch mode (rerun on changes)', false)
  .action(async (testDir: string, options) => {
    try {
      console.log(`\n🧪 Running tests from ${testDir}...\n`);
      if (!existsSync(testDir)) {
        console.error(`Error: Test directory not found: ${testDir}`);
        process.exit(1);
      }
      const framework: TestFramework = options.framework || detectTestFramework(testDir);
      console.log(` Framework: ${framework}`);
      console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
      console.log(` Timeout: ${options.timeout}ms`);
      let spec: EvalSpec | undefined;
      if (options.spec) {
        if (existsSync(options.spec)) {
          spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
          console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
        } else {
          // FIX: a missing --spec file was previously ignored silently.
          console.warn(` Warning: spec file not found, skipping result mapping: ${options.spec}`);
        }
      }
      const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
      const execOptions: ExecutionOptions = {
        framework,
        sandbox: options.sandbox,
        timeout: parseInt(options.timeout, 10),
        parallel: options.parallel,
        filter: options.filter,
        cwd: process.cwd(),
      };
      if (tracer) {
        tracer.recordIntrospection({
          filesAnalyzed: [testDir],
          duration: 0,
        });
      }
      console.log('\n Running tests...\n');
      const result = await runTests(
        testDir,
        execOptions,
        options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
      );
      if (tracer) {
        // FIX: record per-test failures BEFORE the summary counters.
        // Tracer.recordTestFailure() increments testsFailed as a side
        // effect, so the old order (summary first, then failures)
        // double-counted failures in the trace. recordExecution()'s
        // shallow merge now overwrites the counters with the runner's
        // authoritative summary afterwards.
        for (const test of result.tests) {
          if (test.status === 'failed' || test.status === 'error') {
            tracer.recordTestFailure({
              scenarioId: test.id,
              testName: test.name,
              error: test.error?.message || 'Unknown error',
              stack: test.error?.stack,
            });
          }
        }
        tracer.recordExecution({
          testsPassed: result.summary.passed,
          testsFailed: result.summary.failed,
          testsSkipped: result.summary.skipped,
        });
      }
      console.log(formatResults(result));
      if (spec) {
        const mappedResults = mapResultsToScenarios(result, spec);
        console.log(`\n📊 Scenario Coverage:`);
        console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
        console.log(` Unmapped: ${mappedResults.unmapped}`);
      }
      if (options.output) {
        const { writeFileSync, mkdirSync } = await import('fs');
        const { dirname } = await import('path');
        mkdirSync(dirname(options.output), { recursive: true });
        writeFileSync(options.output, JSON.stringify(result, null, 2));
        console.log(`\n📁 Results saved to: ${options.output}`);
      }
      if (tracer) {
        const trace = tracer.finalize();
        const tracePath = await saveTrace(trace);
        console.log(`\n📊 Trace saved: ${tracePath}`);
        console.log(` View with: evaluclaude view ${trace.id}`);
      }
      // Propagate test failure as a non-zero exit code for CI.
      if (result.summary.failed > 0) {
        process.exit(1);
      }
    } catch (error) {
      console.error('Error running tests:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
/**
 * Tally how many executed tests correspond to scenarios in the spec.
 * A test "covers" a scenario when its id equals a scenario id; every
 * other test is counted as unmapped.
 */
function mapResultsToScenarios(
  result: Awaited<ReturnType<typeof runTests>>,
  spec: EvalSpec
): { covered: number; unmapped: number } {
  const knownIds = new Set(spec.scenarios.map(s => s.id));
  const covered = result.tests.filter(t => knownIds.has(t.id)).length;
  return { covered, unmapped: result.tests.length - covered };
}

236
src/cli/commands/ui.ts Normal file
View file

@ -0,0 +1,236 @@
import { Command } from 'commander';
import { spawn, type ChildProcess } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, dirname } from 'path';
import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
// Layout of generated Promptfoo artifacts: everything lives under
// .evaluclaude/ — the YAML config at its root, provider scripts in a subdir.
const EVALUCLAUDE_DIR = '.evaluclaude';
const CONFIG_FILE = 'promptfooconfig.yaml';
const PROVIDERS_DIR = 'providers';
export const uiCommand = new Command('ui')
.description('Launch the evaluation dashboard UI')
.option('-p, --port <port>', 'Port to run the UI on', '3000')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('--generate', 'Regenerate Promptfoo config from spec')
.option('--no-open', 'Do not auto-open browser')
.action(async (options) => {
try {
const port = parseInt(options.port, 10);
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
if (options.spec && options.generate) {
console.log('\n📄 Generating Promptfoo configuration...');
if (!existsSync(options.spec)) {
console.error(`Error: Spec file not found: ${options.spec}`);
process.exit(1);
}
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
await generateTestProvider(providerPath);
console.log(` Config: ${configPath}`);
console.log(` Provider: ${providerPath}`);
}
if (!existsSync(configPath)) {
console.log('\n⚠ No Promptfoo config found.');
console.log(' Run with --spec <file> --generate to create one.\n');
console.log(' Or create one manually:');
console.log(` ${configPath}\n`);
await createDefaultConfig(configPath, providerPath);
console.log(` Created default config at ${configPath}`);
}
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
console.log(` Config: ${configPath}\n`);
await launchPromptfooUI(port, configPath, options.open);
} catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
export const evalCommand = new Command('eval')
.description('Run evaluations with Promptfoo and optionally launch UI')
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
.option('--view', 'Launch UI after evaluation', false)
.option('-p, --port <port>', 'Port for UI', '3000')
.action(async (options) => {
try {
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
if (options.spec) {
console.log('\n📄 Generating Promptfoo configuration from spec...');
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
await generatePromptfooConfig(spec, {
testDir: './tests/generated',
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
});
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
await generateTestProvider(providerPath);
}
if (!existsSync(configPath)) {
console.error(`Error: Config not found: ${configPath}`);
console.log('Run with --spec <file> to generate from EvalSpec.');
process.exit(1);
}
console.log('\n🧪 Running Promptfoo evaluations...\n');
const outputFile = join(options.output, `eval-${Date.now()}.json`);
mkdirSync(dirname(outputFile), { recursive: true });
await runPromptfooEval(configPath, outputFile);
console.log(`\n📁 Results saved: ${outputFile}`);
if (options.view) {
console.log(`\n🚀 Launching UI on port ${options.port}...`);
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
}
} catch (error) {
console.error('Error running eval:', error instanceof Error ? error.message : error);
process.exit(1);
}
});
/**
 * Spawn `npx promptfoo view` and settle when the process ends.
 *
 * @param port        Port for the Promptfoo web UI.
 * @param configPath  Path to promptfooconfig.yaml; its directory is passed
 *                    to promptfoo as the working-directory argument.
 * @param openBrowser Pass `-y` (auto-open browser) when true, `-n` otherwise.
 * @throws When the child exits non-zero, fails to spawn, or promptfoo/npx
 *         is not installed.
 */
async function launchPromptfooUI(
  port: number,
  configPath: string,
  openBrowser: boolean
): Promise<void> {
  return new Promise((resolve, reject) => {
    const args = ['promptfoo', 'view', '--port', String(port)];
    if (openBrowser) {
      args.push('-y');
    } else {
      args.push('-n');
    }
    const configDir = dirname(configPath);
    args.push(configDir);
    console.log(` Running: npx ${args.join(' ')}\n`);
    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });
    child.on('error', (error) => {
      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
        console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
        // FIX: a failed spawn emits 'error' but never 'close', so the
        // promise previously never settled on ENOENT and the CLI hung.
        reject(new Error('npx not found on PATH'));
      } else {
        reject(error);
      }
    });
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`Promptfoo exited with code ${code}`));
      }
    });
    // Forward a single Ctrl-C to the child. FIX: `once` instead of `on` —
    // the old code stacked a new SIGINT listener on every invocation.
    process.once('SIGINT', () => {
      child.kill('SIGINT');
      process.exit(0);
    });
  });
}
/**
 * Run `npx promptfoo eval` against the given config, writing JSON results
 * to outputFile. Caching is disabled so every run is fresh. Resolves on
 * exit code 0; rejects on non-zero exit or spawn failure.
 */
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
  const args = ['promptfoo', 'eval', '-c', configPath, '-o', outputFile, '--no-cache'];
  return new Promise((resolve, reject) => {
    console.log(` Running: npx ${args.join(' ')}\n`);
    const child = spawn('npx', args, {
      stdio: 'inherit',
      env: { ...process.env },
    });
    child.on('error', reject);
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`Promptfoo eval exited with code ${code}`));
      }
    });
  });
}
/**
 * Write a minimal, valid promptfooconfig.yaml plus the Python test-runner
 * provider so the UI has something to load on first run.
 *
 * FIX: the YAML template is now explicitly indented — the nested keys
 * under `providers[0]` (label/config) and `tests[0]` (vars/assert) must
 * be indented or the file does not parse as the intended structure.
 */
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
  const defaultConfig = `# Evaluclaude Promptfoo Configuration
# Generated by evaluclaude
description: "Evaluclaude functional test evaluations"

providers:
  - id: file://${providerPath}
    label: functional-tests
    config:
      test_dir: ./tests/generated
      framework: pytest
      timeout: 300

prompts:
  - "{{scenario_id}}"

tests:
  - description: "Example test"
    vars:
      scenario_id: "test_example"
    assert:
      - type: python
        value: |
          import json
          result = json.loads(output)
          result.get('passed', 0) > 0

outputPath: .evaluclaude/results/promptfoo-results.json
`;
  mkdirSync(dirname(configPath), { recursive: true });
  writeFileSync(configPath, defaultConfig);
  await generateTestProvider(providerPath);
}
/**
 * Pick a test framework from the spec's repo languages: Python repos get
 * pytest; everything else defaults to vitest.
 */
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
  return spec.repo.languages.includes('python') ? 'pytest' : 'vitest';
}

90
src/cli/commands/view.ts Normal file
View file

@ -0,0 +1,90 @@
import { Command } from 'commander';
import {
loadTrace,
listTraces,
getLatestTrace,
formatTrace,
formatTraceList
} from '../../observability/index.js';
/**
 * `evaluclaude view` — inspect recorded traces.
 * With --list, prints a summary table (capped by --limit); otherwise
 * shows a single trace: the given id, or the latest when no id is
 * supplied or --last is set.
 */
export const viewCommand = new Command('view')
  .description('View evaluation traces')
  .argument('[trace-id]', 'Specific trace ID to view')
  .option('--last', 'View the most recent trace')
  .option('--list', 'List all traces')
  .option('--json', 'Output as raw JSON')
  .option('-v, --verbose', 'Show verbose output including tool calls')
  .option('--tools', 'Show tool call details')
  .option('--questions', 'Show questions asked', true)
  .option('--decisions', 'Show decisions made', true)
  .option('-n, --limit <count>', 'Limit number of traces listed', '20')
  .option('--eval <eval-id>', 'Filter traces by eval ID')
  .action(async (traceId: string | undefined, options) => {
    try {
      // Listing mode: table of recent traces, truncated to --limit.
      if (options.list) {
        const all = await listTraces(options.eval);
        if (all.length === 0) {
          console.log('\nNo traces found.');
          console.log('Run `evaluclaude run` to generate traces.\n');
          return;
        }
        const shown = all.slice(0, parseInt(options.limit, 10));
        console.log(formatTraceList(shown));
        if (all.length > shown.length) {
          console.log(`Showing ${shown.length} of ${all.length} traces.`);
          console.log(`Use --limit to see more.\n`);
        }
        return;
      }
      // Single-trace mode: explicit id, or fall back to the latest.
      const useLatest = options.last || !traceId;
      const trace = useLatest ? await getLatestTrace() : await loadTrace(traceId!);
      if (!trace) {
        if (useLatest) {
          console.log('\nNo traces found.');
          console.log('Run `evaluclaude run` to generate traces.\n');
          return;
        }
        console.error(`\nTrace not found: ${traceId}`);
        console.log('Use `evaluclaude view --list` to see available traces.\n');
        process.exit(1);
      }
      console.log(formatTrace(trace, {
        json: options.json,
        verbose: options.verbose,
        showToolCalls: options.tools || options.verbose,
        showQuestions: options.questions,
        showDecisions: options.decisions,
      }));
    } catch (error) {
      console.error('Error viewing trace:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
/**
 * `evaluclaude traces` — convenience alias for `view --list`.
 *
 * FIX: wrapped in try/catch like every other command in this CLI, so a
 * failing trace store prints a clear message and exits 1 instead of
 * surfacing as an unhandled promise rejection.
 */
export const tracesCommand = new Command('traces')
  .description('List all evaluation traces (alias for view --list)')
  .option('-n, --limit <count>', 'Limit number of traces', '20')
  .option('--eval <eval-id>', 'Filter by eval ID')
  .action(async (options) => {
    try {
      const traces = await listTraces(options.eval);
      const limited = traces.slice(0, parseInt(options.limit, 10));
      if (traces.length === 0) {
        console.log('\nNo traces found.');
        return;
      }
      console.log(formatTraceList(limited));
    } catch (error) {
      console.error('Error listing traces:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });

View file

@ -5,6 +5,9 @@ import { introCommand } from './commands/intro.js';
import { analyzeCommand } from './commands/analyze.js';
import { renderCommand } from './commands/render.js';
import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/grade.js';
import { runCommand } from './commands/run.js';
import { viewCommand, tracesCommand } from './commands/view.js';
import { uiCommand, evalCommand } from './commands/ui.js';
const program = new Command();
@ -19,5 +22,10 @@ program.addCommand(renderCommand);
program.addCommand(gradeCommand);
program.addCommand(listRubricsCommand);
program.addCommand(calibrateCommand);
program.addCommand(runCommand);
program.addCommand(viewCommand);
program.addCommand(tracesCommand);
program.addCommand(uiCommand);
program.addCommand(evalCommand);
program.parse(process.argv);

View file

@ -2,3 +2,6 @@ export * from './introspector/index.js';
export * from './analyzer/index.js';
export * from './renderers/index.js';
export * from './graders/index.js';
export * from './runners/index.js';
export * from './observability/index.js';
export * from './promptfoo/index.js';

View file

@ -0,0 +1,15 @@
// Public surface of the observability module: trace data types, the
// Tracer recorder, the filesystem-backed trace store (plus its
// module-level convenience helpers), and text formatting used by the
// CLI `view`/`traces` commands.
export * from './types.js';
export { Tracer, createTracer } from './tracer.js';
export {
  TraceStore,
  traceStore,
  saveTrace,
  loadTrace,
  listTraces,
  getLatestTrace
} from './trace-store.js';
export {
  formatTrace,
  formatTraceList,
  type ViewOptions
} from './trace-viewer.js';

View file

@ -0,0 +1,117 @@
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
import { existsSync } from 'fs';
import { join } from 'path';
import type { EvalTrace, TraceListItem } from './types.js';
const DEFAULT_TRACES_DIR = '.evaluclaude/traces';
/**
 * Filesystem-backed store for evaluation traces.
 * Each trace is persisted as `<tracesDir>/<trace.id>.json`.
 */
export class TraceStore {
  private tracesDir: string;

  constructor(tracesDir: string = DEFAULT_TRACES_DIR) {
    this.tracesDir = tracesDir;
  }

  /** Persist a trace, creating the directory if needed; returns the file path. */
  async save(trace: EvalTrace): Promise<string> {
    await mkdir(this.tracesDir, { recursive: true });
    const filePath = join(this.tracesDir, `${trace.id}.json`);
    await writeFile(filePath, JSON.stringify(trace, null, 2));
    return filePath;
  }

  /** Load a trace by id, or null when no such file exists. */
  async load(traceId: string): Promise<EvalTrace | null> {
    const filePath = join(this.tracesDir, `${traceId}.json`);
    if (!existsSync(filePath)) {
      return null;
    }
    const content = await readFile(filePath, 'utf-8');
    return JSON.parse(content) as EvalTrace;
  }

  /**
   * List trace summaries, newest first, optionally filtered by evalId.
   *
   * IMPROVEMENT: files are now read in parallel (the original awaited
   * each read sequentially inside the loop). Corrupt or unreadable
   * trace files are skipped silently, preserving the original
   * best-effort behavior.
   */
  async list(evalId?: string): Promise<TraceListItem[]> {
    if (!existsSync(this.tracesDir)) {
      return [];
    }
    const files = await readdir(this.tracesDir);
    const parsed = await Promise.all(
      files
        .filter(f => f.endsWith('.json'))
        .map(async (file): Promise<TraceListItem | null> => {
          try {
            const content = await readFile(join(this.tracesDir, file), 'utf-8');
            const trace = JSON.parse(content) as EvalTrace;
            if (evalId && trace.evalId !== evalId) {
              return null;
            }
            return {
              id: trace.id,
              evalId: trace.evalId,
              startedAt: trace.startedAt,
              status: trace.status,
              duration: trace.duration,
              testsPassed: trace.execution.testsPassed,
              testsFailed: trace.execution.testsFailed,
            };
          } catch {
            // Corrupt/partial trace file: skip rather than fail the listing.
            return null;
          }
        })
    );
    return parsed
      .filter((t): t is TraceListItem => t !== null)
      .sort((a, b) => new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime());
  }

  /** Most recent trace by startedAt, or null when the store is empty. */
  async getLatest(): Promise<EvalTrace | null> {
    const traces = await this.list();
    if (traces.length === 0) {
      return null;
    }
    return this.load(traces[0].id);
  }

  /** Delete one trace file; returns false when it did not exist. */
  async delete(traceId: string): Promise<boolean> {
    const filePath = join(this.tracesDir, `${traceId}.json`);
    if (!existsSync(filePath)) {
      return false;
    }
    const { unlink } = await import('fs/promises');
    await unlink(filePath);
    return true;
  }

  /** Keep only the newest `keepCount` traces; returns how many were deleted. */
  async cleanup(keepCount: number = 50): Promise<number> {
    const traces = await this.list();
    let deleted = 0;
    for (const trace of traces.slice(keepCount)) {
      if (await this.delete(trace.id)) {
        deleted++;
      }
    }
    return deleted;
  }
}
/** Shared default store instance used by the module-level helpers below. */
export const traceStore = new TraceStore();

/** Persist a trace via the shared store; resolves to the written file path. */
export async function saveTrace(trace: EvalTrace): Promise<string> {
  return traceStore.save(trace);
}

/** Load a trace by id via the shared store (null when absent). */
export async function loadTrace(traceId: string): Promise<EvalTrace | null> {
  return traceStore.load(traceId);
}

/** List trace summaries via the shared store, optionally filtered by eval id. */
export async function listTraces(evalId?: string): Promise<TraceListItem[]> {
  return traceStore.list(evalId);
}

/** Fetch the most recent trace via the shared store. */
export async function getLatestTrace(): Promise<EvalTrace | null> {
  return traceStore.getLatest();
}

View file

@ -0,0 +1,226 @@
import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js';
/** Rendering flags for formatTrace(). */
export interface ViewOptions {
  /** Emit the raw trace as pretty-printed JSON instead of the text report. */
  json: boolean;
  /** Verbose mode; formatTrace passes this to tool-call rendering (adds inputs). */
  verbose: boolean;
  /** Include the tool-call section. */
  showToolCalls: boolean;
  /** Include the questions-asked section (on by default). */
  showQuestions: boolean;
  /** Include the decisions section (on by default). */
  showDecisions: boolean;
}

// Defaults merged under caller-provided partial options in formatTrace().
const DEFAULT_VIEW_OPTIONS: ViewOptions = {
  json: false,
  verbose: false,
  showToolCalls: false,
  showQuestions: true,
  showDecisions: true,
};
/**
 * Render an EvalTrace either as pretty-printed JSON (opts.json) or as a
 * sectioned, human-readable text report for the terminal.
 *
 * Fixed sections: header, introspection, analysis, generation, execution.
 * Optional sections (see ViewOptions): questions asked, key decisions
 * (first 10), tool calls (first 20). Test failures and recorded errors
 * are appended whenever present.
 *
 * @param trace   The trace to render.
 * @param options Partial view flags; merged over DEFAULT_VIEW_OPTIONS.
 * @returns The report as a single newline-joined string.
 */
export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}): string {
  const opts = { ...DEFAULT_VIEW_OPTIONS, ...options };
  // JSON mode short-circuits all text formatting.
  if (opts.json) {
    return JSON.stringify(trace, null, 2);
  }
  const lines: string[] = [];
  // Header: trace id between double-rule separators, then run metadata.
  lines.push('');
  lines.push('═'.repeat(60));
  lines.push(`📊 Trace: ${trace.id}`);
  lines.push('═'.repeat(60));
  lines.push('');
  lines.push(` Status: ${formatStatus(trace.status)}`);
  lines.push(` Started: ${formatDate(trace.startedAt)}`);
  lines.push(` Duration: ${formatDuration(trace.duration)}`);
  lines.push(` Eval ID: ${trace.evalId}`);
  lines.push('');
  // Introspection summary.
  lines.push('📂 Introspection');
  lines.push('─'.repeat(40));
  lines.push(` Files analyzed: ${trace.introspection.filesAnalyzed.length}`);
  lines.push(` Functions found: ${trace.introspection.totalFunctions}`);
  lines.push(` Classes found: ${trace.introspection.totalClasses}`);
  lines.push(` Duration: ${formatDuration(trace.introspection.duration)}`);
  lines.push('');
  // Analysis summary: counts plus token usage.
  lines.push('🤖 Analysis');
  lines.push('─'.repeat(40));
  lines.push(` Tool calls: ${trace.analysis.toolCalls.length}`);
  lines.push(` Questions asked: ${trace.analysis.questionsAsked.length}`);
  lines.push(` Decisions made: ${trace.analysis.decisions.length}`);
  lines.push(` Prompt tokens: ${trace.analysis.promptTokens.toLocaleString()}`);
  lines.push(` Completion tokens: ${trace.analysis.completionTokens.toLocaleString()}`);
  lines.push('');
  // Generation summary.
  lines.push('📝 Generation');
  lines.push('─'.repeat(40));
  lines.push(` Scenarios: ${trace.generation.scenariosGenerated}`);
  lines.push(` Files written: ${trace.generation.filesWritten.length}`);
  lines.push('');
  // Execution summary.
  lines.push('🧪 Execution');
  lines.push('─'.repeat(40));
  lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
  lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
  lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`);
  lines.push('');
  // Optional: every question asked during analysis.
  if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
    lines.push('❓ Questions Asked');
    lines.push('─'.repeat(40));
    for (const q of trace.analysis.questionsAsked) {
      lines.push(formatQuestion(q));
    }
    lines.push('');
  }
  // Optional: decisions, capped at the first 10.
  if (opts.showDecisions && trace.analysis.decisions.length > 0) {
    lines.push('🎯 Key Decisions');
    lines.push('─'.repeat(40));
    for (const d of trace.analysis.decisions.slice(0, 10)) {
      lines.push(formatDecision(d));
    }
    if (trace.analysis.decisions.length > 10) {
      lines.push(` ... and ${trace.analysis.decisions.length - 10} more`);
    }
    lines.push('');
  }
  // Optional: tool calls, capped at the first 20; verbose adds inputs.
  if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) {
    lines.push('🔧 Tool Calls');
    lines.push('─'.repeat(40));
    for (const tc of trace.analysis.toolCalls.slice(0, 20)) {
      lines.push(formatToolCall(tc, opts.verbose));
    }
    if (trace.analysis.toolCalls.length > 20) {
      lines.push(` ... and ${trace.analysis.toolCalls.length - 20} more`);
    }
    lines.push('');
  }
  // Always shown when present: per-test failure details.
  if (trace.execution.failures.length > 0) {
    lines.push('❌ Test Failures');
    lines.push('─'.repeat(40));
    for (const f of trace.execution.failures) {
      lines.push(formatFailure(f));
    }
    lines.push('');
  }
  // Always shown when present: errors recorded during the run.
  if (trace.errors.length > 0) {
    lines.push('⚠️ Errors');
    lines.push('─'.repeat(40));
    for (const e of trace.errors) {
      lines.push(` [${formatDate(e.timestamp)}]`);
      lines.push(` ${e.message}`);
      if (e.context) {
        lines.push(` Context: ${e.context}`);
      }
      lines.push('');
    }
  }
  lines.push('═'.repeat(60));
  lines.push('');
  return lines.join('\n');
}
/** Human-readable status label with an icon; unknown values pass through. */
function formatStatus(status: EvalTrace['status']): string {
  const labels: Record<string, string> = {
    success: '✅ Success',
    partial: '⚠️ Partial',
    failed: '❌ Failed',
  };
  return labels[status] ?? status;
}
/** Render an ISO-8601 timestamp in the environment's locale format. */
const formatDate = (iso: string): string => new Date(iso).toLocaleString();
/**
 * Render a millisecond duration as `500ms`, `12.3s`, or `2m 5s`.
 *
 * FIX: the seconds component is carried into the minutes when it rounds
 * up to 60 — the original printed e.g. `1m 60s` for 119999 ms because it
 * rounded the remainder independently of the minute count.
 */
function formatDuration(ms: number): string {
  if (ms < 1000) {
    return `${ms}ms`;
  }
  if (ms < 60000) {
    return `${(ms / 1000).toFixed(1)}s`;
  }
  let minutes = Math.floor(ms / 60000);
  let seconds = Math.round((ms % 60000) / 1000);
  if (seconds === 60) {
    minutes += 1;
    seconds = 0;
  }
  return `${minutes}m ${seconds}s`;
}
/** Two-line Q/A rendering of a recorded question, ending with a blank line. */
function formatQuestion(q: Question): string {
  const answerLine = q.answer ? ` A: ${q.answer}` : ` A: (no answer)`;
  return [` Q: ${q.question}`, answerLine, ''].join('\n');
}
/** One decision entry: icon + type + subject, then reason and confidence. */
function formatDecision(d: Decision): string {
  let icon: string;
  if (d.type === 'include') {
    icon = '✓';
  } else if (d.type === 'exclude') {
    icon = '✗';
  } else {
    icon = '→';
  }
  const pct = (d.confidence * 100).toFixed(0);
  return [
    ` ${icon} [${d.type}] ${d.subject}`,
    ` Reason: ${d.reasoning}`,
    ` Confidence: ${pct}%`,
    '',
  ].join('\n');
}
/** One tool-call line; verbose adds a truncated JSON dump of the input. */
function formatToolCall(tc: ToolCall, verbose: boolean): string {
  const elapsed = formatDuration(tc.duration);
  return verbose
    ? ` [${tc.tool}] (${elapsed})\n Input: ${JSON.stringify(tc.input).slice(0, 100)}...\n`
    : ` ${tc.tool} (${elapsed})`;
}
/**
 * Multi-line rendering of one test failure: name, scenario, error, and —
 * only when BOTH are present — the expected/actual pair.
 */
function formatFailure(f: TestFailure): string {
  const parts = [
    `${f.testName}`,
    ` Scenario: ${f.scenarioId}`,
    ` Error: ${f.error}`,
  ];
  if (f.expected !== undefined && f.actual !== undefined) {
    parts.push(` Expected: ${JSON.stringify(f.expected)}`);
    parts.push(` Actual: ${JSON.stringify(f.actual)}`);
  }
  parts.push('');
  return parts.join('\n');
}
/**
 * Tabular listing of trace summaries: a fixed header, then one row per
 * trace with a status icon, padded pass/fail counts, and duration.
 */
export function formatTraceList(traces: Array<{
  id: string;
  startedAt: string;
  status: string;
  duration: number;
  testsPassed: number;
  testsFailed: number;
}>): string {
  const header = [
    '',
    '📋 Recent Traces',
    '═'.repeat(80),
    '',
    'ID Status Passed Failed Duration',
    '─'.repeat(80),
  ];
  const rows = traces.map(t => {
    const statusIcon = t.status === 'success' ? '✅' : t.status === 'partial' ? '⚠️ ' : '❌';
    const cells = [
      t.id.slice(0, 36),
      statusIcon,
      String(t.testsPassed).padStart(6),
      String(t.testsFailed).padStart(6),
      formatDuration(t.duration).padStart(8),
    ];
    return cells.join(' ');
  });
  return [...header, ...rows, ''].join('\n');
}

168
src/observability/tracer.ts Normal file
View file

@ -0,0 +1,168 @@
import { randomUUID } from 'crypto';
import type {
EvalTrace,
ToolCall,
Question,
Decision,
TraceError,
TestFailure,
IntrospectionTrace,
GenerationTrace,
ExecutionTrace,
} from './types.js';
/**
 * Mutable recorder for a single evaluation run. Construct one per eval,
 * call the record* methods as phases progress, then finalize() to stamp
 * completion time/duration and obtain the finished EvalTrace.
 */
export class Tracer {
  // The trace under construction; mutated in place by every record* method.
  private trace: EvalTrace;
  // The tool call opened by recordToolStart() and not yet closed.
  private currentToolCall?: { name: string; input: unknown; startTime: number };
  // Wall-clock start, used by finalize() to compute total duration.
  private startTime: number;

  constructor(evalId: string) {
    this.startTime = Date.now();
    // Start from an all-empty trace with an optimistic 'success' status;
    // recordError()/finalize() downgrade it as problems appear.
    this.trace = {
      id: randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '',
      duration: 0,
      status: 'success',
      introspection: {
        filesAnalyzed: [],
        totalFunctions: 0,
        totalClasses: 0,
        duration: 0,
      },
      analysis: {
        promptTokens: 0,
        completionTokens: 0,
        toolCalls: [],
        questionsAsked: [],
        decisions: [],
      },
      generation: {
        scenariosGenerated: 0,
        filesWritten: [],
      },
      execution: {
        testsPassed: 0,
        testsFailed: 0,
        testsSkipped: 0,
        failures: [],
      },
      errors: [],
    };
  }

  /** Unique id (UUID) of the trace being recorded. */
  get traceId(): string {
    return this.trace.id;
  }

  /** Mark the start of a tool call; pairs with recordToolEnd(). */
  recordToolStart(name: string, input: unknown): void {
    this.currentToolCall = { name, input, startTime: Date.now() };
  }

  /**
   * Close the pending tool call and append it to the trace.
   * NOTE: only one call can be pending at a time — if `name` does not
   * match the pending call (or none is pending) the end is dropped
   * silently, so nested/overlapping tool calls are not recorded.
   */
  recordToolEnd(name: string, output: unknown): void {
    if (this.currentToolCall?.name === name) {
      const toolCall: ToolCall = {
        timestamp: new Date().toISOString(),
        tool: name,
        input: this.currentToolCall.input,
        output,
        duration: Date.now() - this.currentToolCall.startTime,
      };
      this.trace.analysis.toolCalls.push(toolCall);
      this.currentToolCall = undefined;
    }
  }

  /** Append a question, overwriting its timestamp with the current time. */
  recordQuestion(question: Question): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      timestamp: new Date().toISOString(),
    });
  }

  /** Attach an answer to a previously recorded question (no-op for unknown ids). */
  recordAnswer(questionId: string, answer: string): void {
    const question = this.trace.analysis.questionsAsked.find(q => q.id === questionId);
    if (question) {
      question.answer = answer;
    }
  }

  /** Append a decision; confidence is clamped into [0, 1]. */
  recordDecision(
    type: Decision['type'],
    subject: string,
    reasoning: string,
    confidence: number
  ): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence: Math.max(0, Math.min(1, confidence)),
    });
  }

  /** Shallow-merge fields into the introspection section. */
  recordIntrospection(data: Partial<IntrospectionTrace>): void {
    Object.assign(this.trace.introspection, data);
  }

  /** Shallow-merge fields into the generation section. */
  recordGeneration(data: Partial<GenerationTrace>): void {
    Object.assign(this.trace.generation, data);
  }

  /**
   * Shallow-merge fields into the execution section. NOTE: this
   * overwrites any counters previously bumped by recordTestFailure()
   * or recordTestPass() — callers must mind the ordering.
   */
  recordExecution(data: Partial<ExecutionTrace>): void {
    Object.assign(this.trace.execution, data);
  }

  /** Append a failure detail AND increment the failed-test counter. */
  recordTestFailure(failure: TestFailure): void {
    this.trace.execution.failures.push(failure);
    this.trace.execution.testsFailed++;
  }

  /** Increment the passed-test counter. */
  recordTestPass(): void {
    this.trace.execution.testsPassed++;
  }

  /** Accumulate token usage across multiple model calls. */
  recordTokenUsage(promptTokens: number, completionTokens: number): void {
    this.trace.analysis.promptTokens += promptTokens;
    this.trace.analysis.completionTokens += completionTokens;
  }

  /** Append an error; downgrades status from 'success' to 'partial'. */
  recordError(error: Error, context?: string): void {
    const traceError: TraceError = {
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    };
    this.trace.errors.push(traceError);
    if (this.trace.status === 'success') {
      this.trace.status = 'partial';
    }
  }

  /** Force the overall status, overriding the automatic downgrades. */
  setStatus(status: EvalTrace['status']): void {
    this.trace.status = status;
  }

  /**
   * Stamp completion time and total duration and return the trace.
   * Status becomes 'failed' when errors were recorded and no test
   * passed. Returns the live object, not a copy.
   */
  finalize(): EvalTrace {
    this.trace.completedAt = new Date().toISOString();
    this.trace.duration = Date.now() - this.startTime;
    if (this.trace.errors.length > 0 && this.trace.execution.testsPassed === 0) {
      this.trace.status = 'failed';
    }
    return this.trace;
  }

  /** Snapshot of the trace. NOTE: shallow copy — nested sections are shared. */
  getTrace(): EvalTrace {
    return { ...this.trace };
  }
}

/** Convenience factory for a new Tracer bound to `evalId`. */
export function createTracer(evalId: string): Tracer {
  return new Tracer(evalId);
}

100
src/observability/types.ts Normal file
View file

@ -0,0 +1,100 @@
/**
 * Complete record of one evaluation run, built incrementally by Tracer
 * and persisted as JSON by TraceStore.
 */
export interface EvalTrace {
  /** Unique id of this trace (UUID). */
  id: string;
  /** Id of the evaluation this trace belongs to. */
  evalId: string;
  /** ISO-8601 start timestamp. */
  startedAt: string;
  /** ISO-8601 completion timestamp; empty string until finalized. */
  completedAt: string;
  /** Total wall-clock duration in milliseconds. */
  duration: number;
  /** 'success' with no errors; 'partial' after recorded errors; 'failed' when errors exist and nothing passed. */
  status: 'success' | 'partial' | 'failed';
  introspection: IntrospectionTrace;
  analysis: AnalysisTrace;
  generation: GenerationTrace;
  execution: ExecutionTrace;
  /** Errors recorded during the run. */
  errors: TraceError[];
}

/** Summary of the code-introspection phase. */
export interface IntrospectionTrace {
  filesAnalyzed: string[];
  totalFunctions: number;
  totalClasses: number;
  /** Phase duration in milliseconds. */
  duration: number;
}

/** Summary of the analysis phase: token usage plus recorded events. */
export interface AnalysisTrace {
  promptTokens: number;
  completionTokens: number;
  toolCalls: ToolCall[];
  questionsAsked: Question[];
  decisions: Decision[];
}

/** Summary of the test-generation phase. */
export interface GenerationTrace {
  scenariosGenerated: number;
  filesWritten: string[];
}

/** Summary of the test-execution phase. */
export interface ExecutionTrace {
  testsPassed: number;
  testsFailed: number;
  testsSkipped: number;
  /** Per-test failure details. */
  failures: TestFailure[];
}

/** One completed tool invocation recorded during analysis. */
export interface ToolCall {
  /** ISO-8601 time the call was recorded (at tool end). */
  timestamp: string;
  tool: string;
  input: unknown;
  output: unknown;
  /** Milliseconds between tool start and end. */
  duration: number;
}

/** A question raised during analysis, optionally answered later. */
export interface Question {
  id: string;
  timestamp: string;
  question: string;
  options?: string[];
  answer?: string;
  defaultAnswer?: string;
}

/** A recorded analysis decision with its rationale. */
export interface Decision {
  timestamp: string;
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  subject: string;
  reasoning: string;
  /** Clamped to [0, 1] by Tracer.recordDecision(). */
  confidence: number;
}

/** Details of one failed or errored test. */
export interface TestFailure {
  scenarioId: string;
  testName: string;
  error: string;
  stack?: string;
  expected?: unknown;
  actual?: unknown;
}

/** A non-fatal error captured during the run. */
export interface TraceError {
  timestamp: string;
  message: string;
  stack?: string;
  context?: string;
}

/** Generic timeline event. NOTE(review): no producer is visible in this module — confirm usage. */
export interface TraceEvent {
  timestamp: string;
  type: 'tool_start' | 'tool_end' | 'question' | 'decision' | 'error' | 'info';
  data: unknown;
}

/** Lightweight row for trace listings (produced by TraceStore.list()). */
export interface TraceListItem {
  id: string;
  evalId: string;
  startedAt: string;
  status: EvalTrace['status'];
  duration: number;
  testsPassed: number;
  testsFailed: number;
}

View file

@ -0,0 +1,271 @@
import { writeFile, mkdir } from 'fs/promises';
import { dirname, join } from 'path';
import * as yaml from 'js-yaml';
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
/** Options controlling how an EvalSpec is rendered to promptfooconfig.yaml. */
export interface ConfigOptions {
  /** Directory containing the generated test files (passed to the provider). */
  testDir: string;
  /** Where to write the YAML config. */
  outputPath: string;
  /** Test framework the provider should invoke. */
  framework: 'pytest' | 'vitest' | 'jest';
  /** When true, adds defaultTest metadata linking results to trace files. */
  includeTraceLinks: boolean;
}
/**
 * Render an EvalSpec into a promptfooconfig YAML file.
 * Creates the output directory if needed and returns the YAML text.
 */
export async function generatePromptfooConfig(
  spec: EvalSpec,
  options: ConfigOptions
): Promise<string> {
  const rendered = yaml.dump(buildConfig(spec, options), {
    lineWidth: 120,
    quotingType: '"',
  });
  await mkdir(dirname(options.outputPath), { recursive: true });
  await writeFile(options.outputPath, rendered);
  return rendered;
}
/**
 * Assemble the full Promptfoo config object: one python test-runner
 * provider, one test per scenario, and optional trace-link metadata.
 * The provider id is relative to the config file's directory.
 */
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
  const provider = {
    id: `file://providers/test-runner.py`,
    label: 'functional-tests',
    config: {
      test_dir: options.testDir,
      framework: options.framework,
      timeout: 300,
    },
  };
  const defaultTest = options.includeTraceLinks
    ? { metadata: { traceFile: '.evaluclaude/traces/{{evalId}}.json' } }
    : undefined;
  return {
    description: `Evaluclaude functional tests for ${spec.repo.name}`,
    providers: [provider],
    prompts: ['{{scenario_id}}'],
    tests: spec.scenarios.map(scenario => buildTest(scenario, options)),
    defaultTest,
    outputPath: '.evaluclaude/results/promptfoo-results.json',
  };
}
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
const assertions = scenario.assertions
.filter(a => a.type !== 'llm-rubric')
.map(a => buildAssertion(a));
const llmRubrics = scenario.assertions
.filter(a => a.type === 'llm-rubric')
.map(a => ({
type: 'llm-rubric' as const,
value: (a as any).rubric,
threshold: (a as any).passingThreshold ?? 0.7,
}));
return {
description: scenario.description,
vars: {
scenario_id: scenario.id,
target_module: scenario.target.module,
target_function: scenario.target.function,
input_args: scenario.input.args,
input_kwargs: scenario.input.kwargs,
},
assert: [...assertions, ...llmRubrics],
metadata: {
category: scenario.category,
priority: scenario.priority,
tags: scenario.tags,
},
};
}
/**
 * Translate one EvalSpec assertion into its Promptfoo equivalent.
 * Value-comparison types map directly; behavioral types (typeof, throws,
 * truthy, falsy, custom) become inline python expressions evaluated
 * against the provider output. Unknown types degrade to an always-true
 * python check rather than failing the run.
 */
function buildAssertion(assertion: any): PromptfooAssertion {
  const kind: string = assertion.type;
  if (kind === 'equals') {
    return { type: 'equals', value: assertion.expected };
  }
  if (kind === 'contains') {
    return { type: 'contains', value: assertion.value };
  }
  if (kind === 'matches') {
    return { type: 'regex', value: assertion.pattern };
  }
  if (kind === 'typeof') {
    return { type: 'python', value: `type(output).__name__ == '${assertion.expected}'` };
  }
  if (kind === 'throws') {
    return { type: 'python', value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))` };
  }
  if (kind === 'truthy') {
    return { type: 'python', value: 'bool(output)' };
  }
  if (kind === 'falsy') {
    return { type: 'python', value: 'not bool(output)' };
  }
  if (kind === 'custom') {
    return { type: 'python', value: assertion.check };
  }
  // Fallback: a no-op assertion that always passes.
  return { type: 'python', value: 'True' };
}
/**
 * Writes a standalone Python promptfoo provider script to `outputPath`,
 * creating parent directories as needed.
 *
 * The generated script exposes `get_provider_response(prompt, options, context)`:
 * it treats the prompt as a scenario id, runs pytest (via pytest-json-report)
 * or vitest/jest filtered to that id, and returns the counts/raw output as a
 * JSON string in the provider's `output` field. Timeouts and unexpected
 * exceptions are reported inside the payload rather than thrown.
 *
 * NOTE(review): the embedded Python below appears to have lost its leading
 * indentation (significant whitespace) — verify the literal against the
 * original source before shipping, or the generated script will not parse.
 */
export async function generateTestProvider(outputPath: string): Promise<void> {
// The entire provider is emitted verbatim from this template literal.
const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results."""
import subprocess
import json
import sys
import os
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
"""Runs tests and returns structured results."""
test_dir = options.get('config', {}).get('test_dir', './tests')
framework = options.get('config', {}).get('framework', 'pytest')
timeout = options.get('config', {}).get('timeout', 300)
scenario_id = prompt.strip()
try:
if framework == 'pytest':
result = subprocess.run(
[
'python', '-m', 'pytest',
'--json-report',
'--json-report-file=/tmp/pytest_results.json',
'-k', scenario_id,
test_dir
],
capture_output=True,
text=True,
timeout=timeout
)
try:
with open('/tmp/pytest_results.json') as f:
report = json.load(f)
output = {
'passed': report.get('summary', {}).get('passed', 0),
'failed': report.get('summary', {}).get('failed', 0),
'skipped': report.get('summary', {}).get('skipped', 0),
'tests': report.get('tests', []),
'stdout': result.stdout,
'stderr': result.stderr,
'exit_code': result.returncode,
}
except FileNotFoundError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to generate pytest report',
'stdout': result.stdout,
'stderr': result.stderr,
}
elif framework in ('vitest', 'jest'):
cmd = ['npx', framework, 'run', '--reporter=json']
if scenario_id:
cmd.extend(['--testNamePattern', scenario_id])
cmd.append(test_dir)
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=timeout
)
try:
report = json.loads(result.stdout)
output = {
'passed': report.get('numPassedTests', 0),
'failed': report.get('numFailedTests', 0),
'skipped': report.get('numSkippedTests', 0),
'tests': report.get('testResults', []),
'exit_code': result.returncode,
}
except json.JSONDecodeError:
output = {
'passed': 0,
'failed': 1,
'error': 'Failed to parse test output',
'stdout': result.stdout,
'stderr': result.stderr,
}
else:
output = {'error': f'Unknown framework: {framework}'}
return {
'output': json.dumps(output),
'error': None,
}
except subprocess.TimeoutExpired:
return {
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
'error': None,
}
except Exception as e:
return {
'output': None,
'error': str(e),
}
if __name__ == '__main__':
# For testing the provider directly
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--scenario', default='')
parser.add_argument('--test-dir', default='./tests')
parser.add_argument('--framework', default='pytest')
args = parser.parse_args()
result = get_provider_response(
args.scenario,
{'config': {'test_dir': args.test_dir, 'framework': args.framework}},
{}
)
print(json.dumps(result, indent=2))
`;
// Ensure the destination directory exists before writing the script.
await mkdir(dirname(outputPath), { recursive: true });
await writeFile(outputPath, providerCode);
}

2
src/promptfoo/index.ts Normal file
View file

@ -0,0 +1,2 @@
export * from './types.js';
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js';

89
src/promptfoo/types.ts Normal file
View file

@ -0,0 +1,89 @@
/** Root promptfoo configuration document generated for an eval run. */
export interface PromptfooConfig {
description?: string;
providers: PromptfooProvider[];
prompts: string[];
tests: PromptfooTest[];
// Defaults merged into every test (e.g. shared metadata such as trace links).
defaultTest?: PromptfooDefaultTest;
// Where promptfoo writes its results JSON.
outputPath?: string;
}
/** One provider entry; `id` may be a file:// path to a custom script. */
export interface PromptfooProvider {
id: string;
label?: string;
// Provider-specific settings (e.g. test_dir/framework/timeout for the test runner).
config?: Record<string, unknown>;
}
/** One test case: template variables plus the assertions run on the output. */
export interface PromptfooTest {
description?: string;
vars?: Record<string, unknown>;
assert?: PromptfooAssertion[];
options?: Record<string, unknown>;
metadata?: Record<string, unknown>;
}
/** A single assertion (equals/contains/regex/python/llm-rubric/...). */
export interface PromptfooAssertion {
type: string;
value?: unknown;
// Minimum score for graded assertions such as llm-rubric.
threshold?: number;
weight?: number;
provider?: string;
}
/** Test fields applied to every generated test unless overridden. */
export interface PromptfooDefaultTest {
assert?: PromptfooAssertion[];
options?: Record<string, unknown>;
metadata?: Record<string, unknown>;
}
/** Top-level results document produced by a promptfoo run. */
export interface PromptfooResult {
version: number;
timestamp: string;
results: PromptfooTestResult[];
stats: {
successes: number;
failures: number;
tokenUsage: {
total: number;
prompt: number;
completion: number;
};
};
}
/** Outcome of a single test case within a promptfoo run. */
export interface PromptfooTestResult {
prompt: {
raw: string;
label: string;
};
vars: Record<string, unknown>;
response: {
output: string;
tokenUsage?: {
total: number;
prompt: number;
completion: number;
};
};
// Aggregated grading outcome; componentResults breaks it down per assertion.
gradingResult: {
pass: boolean;
score: number;
reason?: string;
componentResults?: Array<{
pass: boolean;
score: number;
reason: string;
assertion: PromptfooAssertion;
}>;
};
success: boolean;
error?: string;
}
/** High-level settings for driving an eval (paths, framework, UI, watch). */
export interface EvalConfig {
specPath: string;
testDir: string;
outputDir: string;
framework: 'pytest' | 'vitest' | 'jest';
uiPort: number;
watch: boolean;
}

101
src/runners/index.ts Normal file
View file

@ -0,0 +1,101 @@
import { existsSync, readdirSync, readFileSync } from 'fs';
import { join } from 'path';
import type { Runner, TestFramework, RunnerConfig, ExecutionResult, ExecutionOptions, SandboxConfig } from './types.js';
import { DEFAULT_SANDBOX_CONFIG } from './types.js';
import { PytestRunner } from './pytest-runner.js';
import { VitestRunner, JestRunner } from './vitest-runner.js';
export * from './types.js';
export { PytestRunner } from './pytest-runner.js';
export { VitestRunner, JestRunner } from './vitest-runner.js';
export { sandboxedExec } from './sandbox.js';
// Framework name -> Runner implementation. createRunner() resolves through
// this table; add new frameworks here.
const runnerRegistry: Record<TestFramework, new () => Runner> = {
pytest: PytestRunner,
vitest: VitestRunner,
jest: JestRunner,
};
/**
 * Instantiates the Runner registered for `framework`.
 * @throws Error when no runner is registered under that name.
 */
export function createRunner(framework: TestFramework): Runner {
  const ctor = runnerRegistry[framework];
  if (ctor) {
    return new ctor();
  }
  throw new Error(`Unknown test framework: ${framework}`);
}
/**
 * Convenience wrapper: builds a RunnerConfig for `testDir` and executes it
 * with the runner matching `options.framework`. Results are also written to
 * a timestamped JSON file under `.evaluclaude/results/`.
 */
export async function runTests(
  testDir: string,
  options: ExecutionOptions,
  sandboxConfig: SandboxConfig = DEFAULT_SANDBOX_CONFIG
): Promise<ExecutionResult> {
  // Sandbox settings are only forwarded when the caller asked for sandboxing.
  const effectiveSandbox = options.sandbox ? sandboxConfig : undefined;
  const outputFile = `.evaluclaude/results/${options.framework}-${Date.now()}.json`;
  return createRunner(options.framework).run({
    testDir,
    outputFile,
    options,
    sandboxConfig: effectiveSandbox,
  });
}
/**
 * Heuristically picks a test framework for the files in `testDir`.
 *
 * Rules: more `.py` files than `.ts`/`.js` files => pytest; otherwise jest
 * when the parent directory's package.json declares a jest dependency;
 * vitest as the final fallback.
 *
 * Fix: the original called `require('fs')`/`require('path')` inside an ES
 * module (this file uses `import`/`export` with `.js` specifiers), which
 * throws `ReferenceError: require is not defined` under Node ESM. It also
 * scanned the directory twice. Uses the named fs/path imports instead and
 * reads the directory once.
 */
export function detectTestFramework(testDir: string): TestFramework {
  const entries = readdirSync(testDir);
  const pythonFiles = entries.filter((f) => f.endsWith('.py'));
  const tsFiles = entries.filter((f) => f.endsWith('.ts') || f.endsWith('.js'));
  if (pythonFiles.length > tsFiles.length) {
    return 'pytest';
  }
  const packageJsonPath = join(testDir, '..', 'package.json');
  if (existsSync(packageJsonPath)) {
    try {
      const pkg = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
      if (pkg.devDependencies?.jest || pkg.dependencies?.jest) {
        return 'jest';
      }
    } catch {
      // Unreadable or malformed package.json: fall through to the default.
    }
  }
  return 'vitest';
}
/**
 * Renders an ExecutionResult as a human-readable, multi-line summary:
 * header with counts/duration, an optional errors section, and an optional
 * list of failed/errored tests with their messages.
 */
export function formatResults(result: ExecutionResult): string {
  const header = [
    '',
    '📊 Test Execution Results',
    '═'.repeat(40),
    ` Total: ${result.summary.total}`,
    ` ✅ Passed: ${result.summary.passed}`,
    ` ❌ Failed: ${result.summary.failed}`,
    ` ⏭️ Skipped: ${result.summary.skipped}`,
    ` ⏱️ Duration: ${result.summary.duration}ms`,
  ];
  const errorSection =
    result.errors.length > 0
      ? ['', '⚠️ Errors:', ...result.errors.map((error) => `${error}`)]
      : [];
  const failures = result.tests.filter((t) => t.status === 'failed' || t.status === 'error');
  const failureSection: string[] = [];
  if (failures.length > 0) {
    failureSection.push('', '❌ Failed Tests:');
    for (const test of failures) {
      failureSection.push(`${test.name}`);
      if (test.error) {
        failureSection.push(`  ${test.error.message}`);
      }
    }
  }
  // Trailing '' yields the final newline the original emitted.
  return [...header, ...errorSection, ...failureSection, ''].join('\n');
}

View file

@ -0,0 +1,164 @@
import { readFile, writeFile, mkdir } from 'fs/promises';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
import { sandboxedExec } from './sandbox.js';
/**
 * Shape of the JSON file written by the pytest-json-report plugin
 * (`--json-report-file`). Only the fields parseResults() reads are modeled;
 * NOTE(review): confirm field coverage against the plugin's documented schema.
 */
interface PytestJsonReport {
created: number;
// Total run duration in seconds (converted to ms downstream).
duration: number;
exitcode: number;
root: string;
environment: Record<string, string>;
summary: {
passed: number;
failed: number;
error: number;
skipped: number;
total: number;
collected: number;
};
tests: PytestTestResult[];
}
/** One test entry from the pytest-json-report `tests` array. */
interface PytestTestResult {
// pytest node id, e.g. "tests/test_mod.py::test_name".
nodeid: string;
outcome: 'passed' | 'failed' | 'skipped' | 'error';
keywords: string[];
setup?: { duration: number; outcome: string };
// Phase data for the test body; crash/traceback present on failure.
call?: {
duration: number;
outcome: string;
crash?: { message: string; path: string; lineno: number };
traceback?: Array<{ path: string; lineno: number; message: string }>;
longrepr?: string;
};
teardown?: { duration: number; outcome: string };
}
/**
 * Runs pytest over a directory and normalizes its results.
 *
 * Relies on the pytest-json-report plugin for a machine-readable report;
 * when the report is missing or unreadable it falls back to scraping the
 * summary line from stdout.
 *
 * Fix: parseFromStdout's regex matched the "N error" count but the loop
 * silently dropped it, so errored tests vanished from the summary. Errors
 * are now counted into `total` and surfaced via `errors`, consistent with
 * how parseResults() reports report-level errors. Also removed an unused
 * local (`lines`).
 */
export class PytestRunner implements Runner {
  name = 'pytest' as const;

  /**
   * Executes pytest (optionally sandboxed), parses the JSON report if one
   * was produced, appends a timeout note when the run was killed, and
   * optionally persists the normalized result to `config.outputFile`.
   */
  async run(config: RunnerConfig): Promise<ExecutionResult> {
    const { testDir, outputFile, options, sandboxConfig } = config;
    // pytest-json-report writes its report here.
    const reportFile = join(testDir, '.pytest_report.json');
    const args = [
      '-v',
      '--tb=short',
      '--json-report',
      `--json-report-file=${reportFile}`,
    ];
    if (options.parallel) {
      // Requires pytest-xdist for -n auto.
      args.push('-n', 'auto');
    }
    if (options.filter && options.filter.length > 0) {
      args.push('-k', options.filter.join(' or '));
    }
    args.push(testDir);
    const result = await sandboxedExec('python', ['-m', 'pytest', ...args], {
      cwd: options.cwd || process.cwd(),
      timeout: options.timeout,
      env: options.env,
      sandboxConfig: sandboxConfig,
    });
    let report: PytestJsonReport | undefined;
    if (existsSync(reportFile)) {
      try {
        const content = await readFile(reportFile, 'utf-8');
        report = JSON.parse(content);
      } catch {
        // Unreadable report: fall back to stdout parsing below.
      }
    }
    const executionResult = this.parseResults(result.stdout + result.stderr, report);
    if (result.timedOut) {
      executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
    }
    if (outputFile) {
      await mkdir(dirname(outputFile), { recursive: true });
      await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
    }
    return executionResult;
  }

  /**
   * Normalizes pytest output. Prefers the structured JSON report; falls back
   * to scraping the stdout summary line when no report is available.
   */
  parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
    const report = jsonReport as PytestJsonReport | undefined;
    if (!report) {
      return this.parseFromStdout(rawOutput);
    }
    const summary: ExecutionSummary = {
      total: report.summary.total,
      passed: report.summary.passed,
      failed: report.summary.failed,
      skipped: report.summary.skipped,
      // pytest reports seconds; our summaries use milliseconds.
      duration: report.duration * 1000,
    };
    const tests: TestResult[] = report.tests.map((t) => ({
      id: this.extractScenarioId(t.nodeid),
      name: t.nodeid,
      status: t.outcome === 'error' ? 'error' : t.outcome,
      duration: (t.call?.duration || 0) * 1000,
      assertions: {
        passed: t.outcome === 'passed' ? 1 : 0,
        failed: t.outcome === 'failed' ? 1 : 0,
        details: [],
      },
      error: t.call?.crash
        ? { message: t.call.crash.message, stack: t.call.longrepr }
        : undefined,
    }));
    return {
      summary,
      tests,
      errors: report.summary.error > 0 ? [`${report.summary.error} tests had errors`] : [],
    };
  }

  /** Fallback parser: scrapes counts from pytest's "N passed, M failed" line. */
  private parseFromStdout(stdout: string): ExecutionResult {
    const summaryMatch = stdout.match(/(\d+) passed|(\d+) failed|(\d+) skipped|(\d+) error/g);
    let passed = 0, failed = 0, skipped = 0, errored = 0;
    if (summaryMatch) {
      for (const match of summaryMatch) {
        const [num, type] = match.split(' ');
        const count = parseInt(num, 10);
        if (type === 'passed') passed = count;
        if (type === 'failed') failed = count;
        if (type === 'skipped') skipped = count;
        if (type === 'error') errored = count; // previously matched but dropped
      }
    }
    return {
      summary: {
        total: passed + failed + skipped + errored,
        passed,
        failed,
        skipped,
        duration: 0,
      },
      tests: [],
      // Mirror parseResults(): surface errored tests in the errors list.
      errors: errored > 0 ? [`${errored} tests had errors`] : [],
    };
  }

  /** Derives a scenario id from a pytest node id (text after "test_"). */
  private extractScenarioId(nodeid: string): string {
    const match = nodeid.match(/test_([a-zA-Z0-9_-]+)/);
    return match ? match[1] : nodeid;
  }
}

126
src/runners/sandbox.ts Normal file
View file

@ -0,0 +1,126 @@
import { spawn, type ChildProcess, type SpawnOptions } from 'child_process';
import type { SandboxConfig, DEFAULT_SANDBOX_CONFIG } from './types.js';
/** Outcome of a sandboxedExec() invocation. */
export interface SandboxedExecResult {
// Child exit code; 1 when the process failed to spawn or exited via signal.
exitCode: number;
stdout: string;
stderr: string;
// True when the child was killed for exceeding the configured timeout.
timedOut: boolean;
}
/**
 * Spawns `command` with a controlled environment and a hard timeout,
 * capturing stdout/stderr. Never rejects: spawn failures surface as
 * exitCode 1 with the error message appended to stderr.
 *
 * When sandboxing is enabled, the child environment starts empty and only
 * inherits the configured allow-list plus `sandboxConfig.env.set`;
 * otherwise the full parent environment is inherited. Caller-provided
 * `env` entries always win.
 *
 * Fix: the SIGKILL escalation timer scheduled after SIGTERM was never
 * cleared — it fired on an already-dead child and kept the event loop
 * alive for ~1s after every timeout, and leaked when the child exited
 * between SIGTERM and SIGKILL. Both timers are now cleared as soon as the
 * child settles.
 */
export async function sandboxedExec(
  command: string,
  args: string[],
  options: {
    cwd: string;
    timeout: number;
    env?: Record<string, string>;
    sandboxConfig?: SandboxConfig;
  }
): Promise<SandboxedExecResult> {
  const { cwd, timeout, env = {}, sandboxConfig } = options;
  const spawnEnv: Record<string, string> = {};
  if (sandboxConfig?.enabled) {
    // Clean slate: inherit only the allow-listed variables, then overlay
    // the sandbox-mandated settings.
    for (const key of sandboxConfig.env.inherit) {
      if (process.env[key]) {
        spawnEnv[key] = process.env[key]!;
      }
    }
    Object.assign(spawnEnv, sandboxConfig.env.set);
  } else {
    Object.assign(spawnEnv, process.env);
  }
  Object.assign(spawnEnv, env); // caller-provided vars always win
  const spawnOptions: SpawnOptions = {
    cwd,
    env: spawnEnv,
    stdio: ['pipe', 'pipe', 'pipe'],
  };
  return new Promise((resolve) => {
    let stdout = '';
    let stderr = '';
    let timedOut = false;
    let killTimer: ReturnType<typeof setTimeout> | undefined;
    const child: ChildProcess = spawn(command, args, spawnOptions);
    const timeoutId = setTimeout(() => {
      timedOut = true;
      child.kill('SIGTERM');
      // Escalate if the child ignores SIGTERM; cleared on settle so it
      // cannot fire on a dead process or hold the event loop open.
      killTimer = setTimeout(() => child.kill('SIGKILL'), 1000);
    }, timeout);
    const clearTimers = () => {
      clearTimeout(timeoutId);
      if (killTimer) {
        clearTimeout(killTimer);
      }
    };
    child.stdout?.on('data', (data: Buffer) => {
      stdout += data.toString();
    });
    child.stderr?.on('data', (data: Buffer) => {
      stderr += data.toString();
    });
    child.on('close', (code) => {
      clearTimers();
      resolve({
        exitCode: code ?? 1,
        stdout,
        stderr,
        timedOut,
      });
    });
    child.on('error', (err) => {
      clearTimers();
      resolve({
        exitCode: 1,
        stdout,
        stderr: stderr + '\n' + err.message,
        timedOut: false,
      });
    });
  });
}
/**
 * Wraps `command` in an OS-level sandbox when one is available.
 *
 * macOS: rewrites the invocation to `sandbox-exec` with a generated
 * Seatbelt profile. Other platforms (and disabled configs) get the
 * command back unmodified.
 *
 * Fix: removed the dead `sandboxArgs` array — it was built (including a
 * '--deny-network-outbound' flag) but never used; the darwin branch always
 * delegated network policy to buildSandboxProfile().
 */
export function buildSandboxCommand(
  command: string,
  args: string[],
  config: SandboxConfig
): { command: string; args: string[] } {
  if (!config.enabled) {
    return { command, args };
  }
  if (process.platform === 'darwin') {
    return {
      command: 'sandbox-exec',
      args: ['-p', buildSandboxProfile(config), command, ...args],
    };
  }
  // No sandbox backend for this platform; run the command unmodified.
  return { command, args };
}
/**
 * Generates a macOS Seatbelt profile for sandbox-exec: allow-by-default,
 * optionally denying outbound network, and denying writes to each
 * configured read-only path (the root path '/' is skipped to avoid
 * locking down the whole filesystem).
 */
function buildSandboxProfile(config: SandboxConfig): string {
  const networkRules = config.network.allowOutbound
    ? []
    : ['(deny network-outbound (remote ip "*:*"))'];
  const fsRules = config.filesystem.readOnly
    .filter((path) => path !== '/')
    .map((path) => `(deny file-write* (subpath "${path}"))`);
  return ['(version 1)', '(allow default)', ...networkRules, ...fsRules].join('\n');
}

95
src/runners/types.ts Normal file
View file

@ -0,0 +1,95 @@
/** Test frameworks the runner layer knows how to drive. */
export type TestFramework = 'pytest' | 'vitest' | 'jest';
/** Caller-facing knobs for a single test run. */
export interface ExecutionOptions {
framework: TestFramework;
// When true, the run is wrapped per the provided SandboxConfig.
sandbox: boolean;
// Whole-run timeout in milliseconds.
timeout: number;
parallel: boolean;
// Only run tests matching these patterns (framework-specific matching).
filter?: string[];
cwd?: string;
env?: Record<string, string>;
}
/** Normalized outcome of a run, framework-independent. */
export interface ExecutionResult {
summary: ExecutionSummary;
tests: TestResult[];
// Run-level problems (timeouts, report-level errors), not per-test failures.
errors: string[];
traceId?: string;
}
/** Aggregate counts for a run; duration is in milliseconds. */
export interface ExecutionSummary {
total: number;
passed: number;
failed: number;
skipped: number;
duration: number;
}
/** One normalized test outcome. */
export interface TestResult {
// Scenario id extracted from the framework's test name.
id: string;
name: string;
status: 'passed' | 'failed' | 'skipped' | 'error';
// Milliseconds.
duration: number;
assertions: {
passed: number;
failed: number;
details: AssertionResult[];
};
error?: { message: string; stack?: string };
stdout?: string;
stderr?: string;
}
/** One assertion outcome inside a test. */
export interface AssertionResult {
description: string;
passed: boolean;
expected?: unknown;
actual?: unknown;
}
/** Sandbox policy: environment allow-list, filesystem and network limits. */
export interface SandboxConfig {
enabled: boolean;
autoAllowBashIfSandboxed: boolean;
network: {
allowLocalBinding: boolean;
allowOutbound: boolean;
};
filesystem: {
// Paths the sandboxed process may read but not write.
readOnly: string[];
writable: string[];
};
env: {
// Parent environment variables the child may inherit.
inherit: string[];
// Variables forced into the child environment.
set: Record<string, string>;
};
}
// Default policy: sandbox on, no outbound network, writes confined to
// /tmp and ./test-output, minimal inherited environment plus CI markers.
export const DEFAULT_SANDBOX_CONFIG: SandboxConfig = {
enabled: true,
autoAllowBashIfSandboxed: true,
network: {
allowLocalBinding: true,
allowOutbound: false,
},
filesystem: {
readOnly: ['/'],
writable: ['/tmp', './test-output'],
},
env: {
inherit: ['PATH', 'HOME', 'USER'],
set: { CI: 'true', NODE_ENV: 'test' },
},
};
/** Everything a Runner needs for one invocation. */
export interface RunnerConfig {
testDir: string;
// Where the normalized ExecutionResult JSON is written.
outputFile: string;
options: ExecutionOptions;
// Absent when the caller disabled sandboxing.
sandboxConfig?: SandboxConfig;
}
/** Contract implemented by each framework adapter. */
export interface Runner {
name: TestFramework;
run(config: RunnerConfig): Promise<ExecutionResult>;
// Parses raw output (and an optional structured report) into a result.
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult;
}

View file

@ -0,0 +1,213 @@
import { readFile, writeFile, mkdir } from 'fs/promises';
import { existsSync } from 'fs';
import { join, dirname } from 'path';
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
import { sandboxedExec } from './sandbox.js';
/**
 * Shape of the JSON report emitted by `vitest run --reporter=json`.
 * JestRunner reuses this type because `jest --json` shares the schema.
 * NOTE(review): only the fields parseResults() reads are modeled — confirm
 * against the reporter version in use.
 */
interface VitestJsonReport {
numTotalTestSuites: number;
numPassedTestSuites: number;
numFailedTestSuites: number;
numTotalTests: number;
numPassedTests: number;
numFailedTests: number;
numSkippedTests: number;
// Epoch milliseconds; duration is derived as endTime - startTime.
startTime: number;
endTime: number;
testResults: VitestTestFile[];
}
/** One test file's entry in the report. */
interface VitestTestFile {
name: string;
status: 'passed' | 'failed';
startTime: number;
endTime: number;
assertionResults: VitestAssertion[];
}
/** One individual test case within a file. */
interface VitestAssertion {
ancestorTitles: string[];
fullName: string;
status: 'passed' | 'failed' | 'skipped';
title: string;
// Milliseconds.
duration: number;
failureMessages: string[];
}
/**
 * Drives `vitest run` over a directory and normalizes its JSON report,
 * falling back to scraping stdout counts when no report was produced.
 */
export class VitestRunner implements Runner {
  name = 'vitest' as const;

  /**
   * Executes vitest via npx (optionally sandboxed), parses the JSON report
   * if present, notes timeouts, and optionally persists the normalized
   * result to `config.outputFile`.
   */
  async run(config: RunnerConfig): Promise<ExecutionResult> {
    const { testDir, outputFile, options, sandboxConfig } = config;
    const reportFile = join(testDir, '.vitest_report.json');
    const cliArgs = ['vitest', 'run', '--reporter=json', `--outputFile=${reportFile}`];
    if (options.filter && options.filter.length > 0) {
      cliArgs.push('--testNamePattern', options.filter.join('|'));
    }
    cliArgs.push(testDir);
    const proc = await sandboxedExec('npx', cliArgs, {
      cwd: options.cwd || process.cwd(),
      timeout: options.timeout,
      env: options.env,
      sandboxConfig,
    });
    let parsedReport: VitestJsonReport | undefined;
    if (existsSync(reportFile)) {
      try {
        parsedReport = JSON.parse(await readFile(reportFile, 'utf-8'));
      } catch {
        // Unreadable report: the stdout fallback below still applies.
      }
    }
    const outcome = this.parseResults(proc.stdout + proc.stderr, parsedReport);
    if (proc.timedOut) {
      outcome.errors.push(`Test execution timed out after ${options.timeout}ms`);
    }
    if (outputFile) {
      await mkdir(dirname(outputFile), { recursive: true });
      await writeFile(outputFile, JSON.stringify(outcome, null, 2));
    }
    return outcome;
  }

  /** Normalizes a vitest/jest JSON report; falls back to stdout scraping. */
  parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
    const report = jsonReport as VitestJsonReport | undefined;
    if (!report) {
      return this.parseFromStdout(rawOutput);
    }
    // Flatten per-file assertion results into one normalized test list.
    const tests: TestResult[] = report.testResults.flatMap((file) =>
      file.assertionResults.map((a) => ({
        id: this.extractScenarioId(a.fullName),
        name: a.fullName,
        status: a.status === 'skipped' ? 'skipped' : a.status,
        duration: a.duration,
        assertions: {
          passed: a.status === 'passed' ? 1 : 0,
          failed: a.status === 'failed' ? 1 : 0,
          details: [],
        },
        error: a.failureMessages.length > 0
          ? { message: a.failureMessages.join('\n') }
          : undefined,
      }))
    );
    return {
      summary: {
        total: report.numTotalTests,
        passed: report.numPassedTests,
        failed: report.numFailedTests,
        skipped: report.numSkippedTests,
        duration: report.endTime - report.startTime,
      },
      tests,
      errors: [],
    };
  }

  /** Fallback parser: pulls "N passed/failed/skipped" counts from stdout. */
  private parseFromStdout(stdout: string): ExecutionResult {
    const countOf = (label: string): number => {
      const m = stdout.match(new RegExp(`(\\d+) ${label}`));
      return m ? parseInt(m[1], 10) : 0;
    };
    const passed = countOf('passed');
    const failed = countOf('failed');
    const skipped = countOf('skipped');
    return {
      summary: {
        total: passed + failed + skipped,
        passed,
        failed,
        skipped,
        duration: 0,
      },
      tests: [],
      errors: [],
    };
  }

  /** Derives a scenario id from a test's full name (text after "test_"/"test "). */
  private extractScenarioId(fullName: string): string {
    const match = fullName.match(/test[_\s]([a-zA-Z0-9_-]+)/i);
    return match ? match[1] : fullName.replace(/\s+/g, '_');
  }
}
export class JestRunner implements Runner {
name = 'jest' as const;
async run(config: RunnerConfig): Promise<ExecutionResult> {
const { testDir, outputFile, options, sandboxConfig } = config;
const reportFile = join(testDir, '.jest_report.json');
const args = [
'jest',
'--json',
`--outputFile=${reportFile}`,
];
if (options.filter && options.filter.length > 0) {
args.push('--testNamePattern', options.filter.join('|'));
}
args.push(testDir);
const result = await sandboxedExec('npx', args, {
cwd: options.cwd || process.cwd(),
timeout: options.timeout,
env: options.env,
sandboxConfig: sandboxConfig,
});
let report: VitestJsonReport | undefined;
if (existsSync(reportFile)) {
try {
const content = await readFile(reportFile, 'utf-8');
report = JSON.parse(content);
} catch (e) {
}
}
const executionResult = this.parseResults(result.stdout + result.stderr, report);
if (result.timedOut) {
executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
}
if (outputFile) {
await mkdir(dirname(outputFile), { recursive: true });
await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
}
return executionResult;
}
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
const vitestRunner = new VitestRunner();
return vitestRunner.parseResults(rawOutput, jsonReport);
}
}