diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts new file mode 100644 index 0000000..28f372f --- /dev/null +++ b/src/cli/commands/run.ts @@ -0,0 +1,145 @@ +import { Command } from 'commander'; +import { existsSync, readFileSync } from 'fs'; +import { join } from 'path'; +import { + runTests, + formatResults, + detectTestFramework, + type TestFramework, + type ExecutionOptions, + DEFAULT_SANDBOX_CONFIG +} from '../../runners/index.js'; +import { createTracer, saveTrace } from '../../observability/index.js'; +import type { EvalSpec } from '../../analyzer/types.js'; + +export const runCommand = new Command('run') + .description('Run generated tests and collect results') + .argument('[test-dir]', 'Directory containing test files', './tests/generated') + .option('-f, --framework ', 'Test framework (pytest, vitest, jest)') + .option('-s, --spec ', 'Path to EvalSpec JSON for result mapping') + .option('--sandbox', 'Run tests in sandbox mode', true) + .option('--no-sandbox', 'Disable sandbox mode') + .option('-t, --timeout ', 'Test timeout in milliseconds', '300000') + .option('-p, --parallel', 'Run tests in parallel', false) + .option('--filter ', 'Run only tests matching patterns') + .option('-o, --output ', 'Output results to JSON file') + .option('--trace', 'Record execution trace', true) + .option('--no-trace', 'Disable execution tracing') + .option('-w, --watch', 'Watch mode (rerun on changes)', false) + .action(async (testDir: string, options) => { + try { + console.log(`\n๐Ÿงช Running tests from ${testDir}...\n`); + + if (!existsSync(testDir)) { + console.error(`Error: Test directory not found: ${testDir}`); + process.exit(1); + } + + const framework: TestFramework = options.framework || detectTestFramework(testDir); + console.log(` Framework: ${framework}`); + console.log(` Sandbox: ${options.sandbox ? 
'enabled' : 'disabled'}`); + console.log(` Timeout: ${options.timeout}ms`); + + let spec: EvalSpec | undefined; + if (options.spec && existsSync(options.spec)) { + spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec; + console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`); + } + + const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null; + + const execOptions: ExecutionOptions = { + framework, + sandbox: options.sandbox, + timeout: parseInt(options.timeout, 10), + parallel: options.parallel, + filter: options.filter, + cwd: process.cwd(), + }; + + if (tracer) { + tracer.recordIntrospection({ + filesAnalyzed: [testDir], + duration: 0, + }); + } + + console.log('\n Running tests...\n'); + const startTime = Date.now(); + + const result = await runTests( + testDir, + execOptions, + options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined + ); + + if (tracer) { + tracer.recordExecution({ + testsPassed: result.summary.passed, + testsFailed: result.summary.failed, + testsSkipped: result.summary.skipped, + }); + + for (const test of result.tests) { + if (test.status === 'failed' || test.status === 'error') { + tracer.recordTestFailure({ + scenarioId: test.id, + testName: test.name, + error: test.error?.message || 'Unknown error', + stack: test.error?.stack, + }); + } + } + } + + console.log(formatResults(result)); + + if (spec) { + const mappedResults = mapResultsToScenarios(result, spec); + console.log(`\n๐Ÿ“Š Scenario Coverage:`); + console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`); + console.log(` Unmapped: ${mappedResults.unmapped}`); + } + + if (options.output) { + const { writeFileSync, mkdirSync } = await import('fs'); + const { dirname } = await import('path'); + mkdirSync(dirname(options.output), { recursive: true }); + writeFileSync(options.output, JSON.stringify(result, null, 2)); + console.log(`\n๐Ÿ“ Results saved to: ${options.output}`); + } + + if (tracer) { + const trace = 
tracer.finalize(); + const tracePath = await saveTrace(trace); + console.log(`\n๐Ÿ“Š Trace saved: ${tracePath}`); + console.log(` View with: evaluclaude view ${trace.id}`); + } + + if (result.summary.failed > 0) { + process.exit(1); + } + } catch (error) { + console.error('Error running tests:', error instanceof Error ? error.message : error); + process.exit(1); + } + }); + +function mapResultsToScenarios( + result: Awaited>, + spec: EvalSpec +): { covered: number; unmapped: number } { + const scenarioIds = new Set(spec.scenarios.map(s => s.id)); + let covered = 0; + let unmapped = 0; + + for (const test of result.tests) { + if (scenarioIds.has(test.id)) { + covered++; + } else { + unmapped++; + } + } + + return { covered, unmapped }; +} diff --git a/src/cli/commands/ui.ts b/src/cli/commands/ui.ts new file mode 100644 index 0000000..9be91eb --- /dev/null +++ b/src/cli/commands/ui.ts @@ -0,0 +1,236 @@ +import { Command } from 'commander'; +import { spawn, type ChildProcess } from 'child_process'; +import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs'; +import { join, dirname } from 'path'; +import type { EvalSpec } from '../../analyzer/types.js'; +import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js'; + +const EVALUCLAUDE_DIR = '.evaluclaude'; +const CONFIG_FILE = 'promptfooconfig.yaml'; +const PROVIDERS_DIR = 'providers'; + +export const uiCommand = new Command('ui') + .description('Launch the evaluation dashboard UI') + .option('-p, --port ', 'Port to run the UI on', '3000') + .option('-s, --spec ', 'Path to EvalSpec JSON file') + .option('--generate', 'Regenerate Promptfoo config from spec') + .option('--no-open', 'Do not auto-open browser') + .action(async (options) => { + try { + const port = parseInt(options.port, 10); + const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE); + const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); + + if (options.spec && options.generate) { + 
console.log('\n๐Ÿ“„ Generating Promptfoo configuration...'); + + if (!existsSync(options.spec)) { + console.error(`Error: Spec file not found: ${options.spec}`); + process.exit(1); + } + + const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8')); + + await generatePromptfooConfig(spec, { + testDir: './tests/generated', + outputPath: configPath, + framework: detectFramework(spec), + includeTraceLinks: true, + }); + + await generateTestProvider(providerPath); + + console.log(` Config: ${configPath}`); + console.log(` Provider: ${providerPath}`); + } + + if (!existsSync(configPath)) { + console.log('\nโš ๏ธ No Promptfoo config found.'); + console.log(' Run with --spec --generate to create one.\n'); + console.log(' Or create one manually:'); + console.log(` ${configPath}\n`); + + await createDefaultConfig(configPath, providerPath); + console.log(` Created default config at ${configPath}`); + } + + console.log(`\n๐Ÿš€ Starting Promptfoo UI on port ${port}...`); + console.log(` Config: ${configPath}\n`); + + await launchPromptfooUI(port, configPath, options.open); + } catch (error) { + console.error('Error launching UI:', error instanceof Error ? 
error.message : error); + process.exit(1); + } + }); + +export const evalCommand = new Command('eval') + .description('Run evaluations with Promptfoo and optionally launch UI') + .option('-s, --spec ', 'Path to EvalSpec JSON file') + .option('-c, --config ', 'Path to promptfooconfig.yaml') + .option('-o, --output ', 'Output path for results', '.evaluclaude/results') + .option('--view', 'Launch UI after evaluation', false) + .option('-p, --port ', 'Port for UI', '3000') + .action(async (options) => { + try { + const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE); + + if (options.spec) { + console.log('\n๐Ÿ“„ Generating Promptfoo configuration from spec...'); + const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8')); + + await generatePromptfooConfig(spec, { + testDir: './tests/generated', + outputPath: configPath, + framework: detectFramework(spec), + includeTraceLinks: true, + }); + + const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py'); + await generateTestProvider(providerPath); + } + + if (!existsSync(configPath)) { + console.error(`Error: Config not found: ${configPath}`); + console.log('Run with --spec to generate from EvalSpec.'); + process.exit(1); + } + + console.log('\n๐Ÿงช Running Promptfoo evaluations...\n'); + + const outputFile = join(options.output, `eval-${Date.now()}.json`); + mkdirSync(dirname(outputFile), { recursive: true }); + + await runPromptfooEval(configPath, outputFile); + + console.log(`\n๐Ÿ“ Results saved: ${outputFile}`); + + if (options.view) { + console.log(`\n๐Ÿš€ Launching UI on port ${options.port}...`); + await launchPromptfooUI(parseInt(options.port, 10), configPath, true); + } + } catch (error) { + console.error('Error running eval:', error instanceof Error ? 
error.message : error); + process.exit(1); + } + }); + +async function launchPromptfooUI( + port: number, + configPath: string, + openBrowser: boolean +): Promise { + return new Promise((resolve, reject) => { + const args = ['promptfoo', 'view', '--port', String(port)]; + + if (openBrowser) { + args.push('-y'); + } else { + args.push('-n'); + } + + const configDir = dirname(configPath); + args.push(configDir); + + console.log(` Running: npx ${args.join(' ')}\n`); + + const child = spawn('npx', args, { + stdio: 'inherit', + env: { ...process.env }, + }); + + child.on('error', (error) => { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + console.error('\nโŒ Promptfoo not found. Install with: npm install -g promptfoo'); + } else { + reject(error); + } + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`Promptfoo exited with code ${code}`)); + } + }); + + process.on('SIGINT', () => { + child.kill('SIGINT'); + process.exit(0); + }); + }); +} + +async function runPromptfooEval(configPath: string, outputFile: string): Promise { + return new Promise((resolve, reject) => { + const args = [ + 'promptfoo', + 'eval', + '-c', configPath, + '-o', outputFile, + '--no-cache', + ]; + + console.log(` Running: npx ${args.join(' ')}\n`); + + const child = spawn('npx', args, { + stdio: 'inherit', + env: { ...process.env }, + }); + + child.on('error', reject); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + } else { + reject(new Error(`Promptfoo eval exited with code ${code}`)); + } + }); + }); +} + +async function createDefaultConfig(configPath: string, providerPath: string): Promise { + const defaultConfig = `# Evaluclaude Promptfoo Configuration +# Generated by evaluclaude + +description: "Evaluclaude functional test evaluations" + +providers: + - id: file://${providerPath} + label: functional-tests + config: + test_dir: ./tests/generated + framework: pytest + timeout: 300 + +prompts: + - 
"{{scenario_id}}" + +tests: + - description: "Example test" + vars: + scenario_id: "test_example" + assert: + - type: python + value: | + import json + result = json.loads(output) + result.get('passed', 0) > 0 + +outputPath: .evaluclaude/results/promptfoo-results.json +`; + + mkdirSync(dirname(configPath), { recursive: true }); + writeFileSync(configPath, defaultConfig); + + await generateTestProvider(providerPath); +} + +function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' { + if (spec.repo.languages.includes('python')) { + return 'pytest'; + } + return 'vitest'; +} diff --git a/src/cli/commands/view.ts b/src/cli/commands/view.ts new file mode 100644 index 0000000..2020aaa --- /dev/null +++ b/src/cli/commands/view.ts @@ -0,0 +1,90 @@ +import { Command } from 'commander'; +import { + loadTrace, + listTraces, + getLatestTrace, + formatTrace, + formatTraceList +} from '../../observability/index.js'; + +export const viewCommand = new Command('view') + .description('View evaluation traces') + .argument('[trace-id]', 'Specific trace ID to view') + .option('--last', 'View the most recent trace') + .option('--list', 'List all traces') + .option('--json', 'Output as raw JSON') + .option('-v, --verbose', 'Show verbose output including tool calls') + .option('--tools', 'Show tool call details') + .option('--questions', 'Show questions asked', true) + .option('--decisions', 'Show decisions made', true) + .option('-n, --limit ', 'Limit number of traces listed', '20') + .option('--eval ', 'Filter traces by eval ID') + .action(async (traceId: string | undefined, options) => { + try { + if (options.list) { + const traces = await listTraces(options.eval); + const limited = traces.slice(0, parseInt(options.limit, 10)); + + if (traces.length === 0) { + console.log('\nNo traces found.'); + console.log('Run `evaluclaude run` to generate traces.\n'); + return; + } + + console.log(formatTraceList(limited)); + + if (traces.length > limited.length) { + 
console.log(`Showing ${limited.length} of ${traces.length} traces.`); + console.log(`Use --limit to see more.\n`); + } + return; + } + + let trace; + + if (options.last || !traceId) { + trace = await getLatestTrace(); + if (!trace) { + console.log('\nNo traces found.'); + console.log('Run `evaluclaude run` to generate traces.\n'); + return; + } + } else { + trace = await loadTrace(traceId); + if (!trace) { + console.error(`\nTrace not found: ${traceId}`); + console.log('Use `evaluclaude view --list` to see available traces.\n'); + process.exit(1); + } + } + + const output = formatTrace(trace, { + json: options.json, + verbose: options.verbose, + showToolCalls: options.tools || options.verbose, + showQuestions: options.questions, + showDecisions: options.decisions, + }); + + console.log(output); + } catch (error) { + console.error('Error viewing trace:', error instanceof Error ? error.message : error); + process.exit(1); + } + }); + +export const tracesCommand = new Command('traces') + .description('List all evaluation traces (alias for view --list)') + .option('-n, --limit ', 'Limit number of traces', '20') + .option('--eval ', 'Filter by eval ID') + .action(async (options) => { + const traces = await listTraces(options.eval); + const limited = traces.slice(0, parseInt(options.limit, 10)); + + if (traces.length === 0) { + console.log('\nNo traces found.'); + return; + } + + console.log(formatTraceList(limited)); + }); diff --git a/src/cli/index.ts b/src/cli/index.ts index 3b6807e..bdc914f 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -5,6 +5,9 @@ import { introCommand } from './commands/intro.js'; import { analyzeCommand } from './commands/analyze.js'; import { renderCommand } from './commands/render.js'; import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/grade.js'; +import { runCommand } from './commands/run.js'; +import { viewCommand, tracesCommand } from './commands/view.js'; +import { uiCommand, evalCommand } from 
'./commands/ui.js'; const program = new Command(); @@ -19,5 +22,10 @@ program.addCommand(renderCommand); program.addCommand(gradeCommand); program.addCommand(listRubricsCommand); program.addCommand(calibrateCommand); +program.addCommand(runCommand); +program.addCommand(viewCommand); +program.addCommand(tracesCommand); +program.addCommand(uiCommand); +program.addCommand(evalCommand); program.parse(process.argv); diff --git a/src/index.ts b/src/index.ts index 225f0ef..dcd9e23 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,3 +2,6 @@ export * from './introspector/index.js'; export * from './analyzer/index.js'; export * from './renderers/index.js'; export * from './graders/index.js'; +export * from './runners/index.js'; +export * from './observability/index.js'; +export * from './promptfoo/index.js'; diff --git a/src/observability/index.ts b/src/observability/index.ts new file mode 100644 index 0000000..63f05b0 --- /dev/null +++ b/src/observability/index.ts @@ -0,0 +1,15 @@ +export * from './types.js'; +export { Tracer, createTracer } from './tracer.js'; +export { + TraceStore, + traceStore, + saveTrace, + loadTrace, + listTraces, + getLatestTrace +} from './trace-store.js'; +export { + formatTrace, + formatTraceList, + type ViewOptions +} from './trace-viewer.js'; diff --git a/src/observability/trace-store.ts b/src/observability/trace-store.ts new file mode 100644 index 0000000..4755cfd --- /dev/null +++ b/src/observability/trace-store.ts @@ -0,0 +1,117 @@ +import { mkdir, readdir, readFile, writeFile } from 'fs/promises'; +import { existsSync } from 'fs'; +import { join } from 'path'; +import type { EvalTrace, TraceListItem } from './types.js'; + +const DEFAULT_TRACES_DIR = '.evaluclaude/traces'; + +export class TraceStore { + private tracesDir: string; + + constructor(tracesDir: string = DEFAULT_TRACES_DIR) { + this.tracesDir = tracesDir; + } + + async save(trace: EvalTrace): Promise { + await mkdir(this.tracesDir, { recursive: true }); + const filePath = 
join(this.tracesDir, `${trace.id}.json`); + await writeFile(filePath, JSON.stringify(trace, null, 2)); + return filePath; + } + + async load(traceId: string): Promise { + const filePath = join(this.tracesDir, `${traceId}.json`); + if (!existsSync(filePath)) { + return null; + } + const content = await readFile(filePath, 'utf-8'); + return JSON.parse(content) as EvalTrace; + } + + async list(evalId?: string): Promise { + if (!existsSync(this.tracesDir)) { + return []; + } + + const files = await readdir(this.tracesDir); + const jsonFiles = files.filter(f => f.endsWith('.json')); + + const traces: TraceListItem[] = []; + + for (const file of jsonFiles) { + try { + const content = await readFile(join(this.tracesDir, file), 'utf-8'); + const trace = JSON.parse(content) as EvalTrace; + + if (evalId && trace.evalId !== evalId) { + continue; + } + + traces.push({ + id: trace.id, + evalId: trace.evalId, + startedAt: trace.startedAt, + status: trace.status, + duration: trace.duration, + testsPassed: trace.execution.testsPassed, + testsFailed: trace.execution.testsFailed, + }); + } catch (e) { + } + } + + return traces.sort((a, b) => + new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime() + ); + } + + async getLatest(): Promise { + const traces = await this.list(); + if (traces.length === 0) { + return null; + } + return this.load(traces[0].id); + } + + async delete(traceId: string): Promise { + const filePath = join(this.tracesDir, `${traceId}.json`); + if (!existsSync(filePath)) { + return false; + } + const { unlink } = await import('fs/promises'); + await unlink(filePath); + return true; + } + + async cleanup(keepCount: number = 50): Promise { + const traces = await this.list(); + const toDelete = traces.slice(keepCount); + + let deleted = 0; + for (const trace of toDelete) { + if (await this.delete(trace.id)) { + deleted++; + } + } + + return deleted; + } +} + +export const traceStore = new TraceStore(); + +export async function saveTrace(trace: EvalTrace): 
Promise { + return traceStore.save(trace); +} + +export async function loadTrace(traceId: string): Promise { + return traceStore.load(traceId); +} + +export async function listTraces(evalId?: string): Promise { + return traceStore.list(evalId); +} + +export async function getLatestTrace(): Promise { + return traceStore.getLatest(); +} diff --git a/src/observability/trace-viewer.ts b/src/observability/trace-viewer.ts new file mode 100644 index 0000000..e1c3a52 --- /dev/null +++ b/src/observability/trace-viewer.ts @@ -0,0 +1,226 @@ +import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js'; + +export interface ViewOptions { + json: boolean; + verbose: boolean; + showToolCalls: boolean; + showQuestions: boolean; + showDecisions: boolean; +} + +const DEFAULT_VIEW_OPTIONS: ViewOptions = { + json: false, + verbose: false, + showToolCalls: false, + showQuestions: true, + showDecisions: true, +}; + +export function formatTrace(trace: EvalTrace, options: Partial = {}): string { + const opts = { ...DEFAULT_VIEW_OPTIONS, ...options }; + + if (opts.json) { + return JSON.stringify(trace, null, 2); + } + + const lines: string[] = []; + + lines.push(''); + lines.push('โ•'.repeat(60)); + lines.push(`๐Ÿ“Š Trace: ${trace.id}`); + lines.push('โ•'.repeat(60)); + lines.push(''); + + lines.push(` Status: ${formatStatus(trace.status)}`); + lines.push(` Started: ${formatDate(trace.startedAt)}`); + lines.push(` Duration: ${formatDuration(trace.duration)}`); + lines.push(` Eval ID: ${trace.evalId}`); + lines.push(''); + + lines.push('๐Ÿ“‚ Introspection'); + lines.push('โ”€'.repeat(40)); + lines.push(` Files analyzed: ${trace.introspection.filesAnalyzed.length}`); + lines.push(` Functions found: ${trace.introspection.totalFunctions}`); + lines.push(` Classes found: ${trace.introspection.totalClasses}`); + lines.push(` Duration: ${formatDuration(trace.introspection.duration)}`); + lines.push(''); + + lines.push('๐Ÿค– Analysis'); + lines.push('โ”€'.repeat(40)); + 
lines.push(` Tool calls: ${trace.analysis.toolCalls.length}`); + lines.push(` Questions asked: ${trace.analysis.questionsAsked.length}`); + lines.push(` Decisions made: ${trace.analysis.decisions.length}`); + lines.push(` Prompt tokens: ${trace.analysis.promptTokens.toLocaleString()}`); + lines.push(` Completion tokens: ${trace.analysis.completionTokens.toLocaleString()}`); + lines.push(''); + + lines.push('๐Ÿ“ Generation'); + lines.push('โ”€'.repeat(40)); + lines.push(` Scenarios: ${trace.generation.scenariosGenerated}`); + lines.push(` Files written: ${trace.generation.filesWritten.length}`); + lines.push(''); + + lines.push('๐Ÿงช Execution'); + lines.push('โ”€'.repeat(40)); + lines.push(` โœ… Passed: ${trace.execution.testsPassed}`); + lines.push(` โŒ Failed: ${trace.execution.testsFailed}`); + lines.push(` โญ๏ธ Skipped: ${trace.execution.testsSkipped}`); + lines.push(''); + + if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) { + lines.push('โ“ Questions Asked'); + lines.push('โ”€'.repeat(40)); + for (const q of trace.analysis.questionsAsked) { + lines.push(formatQuestion(q)); + } + lines.push(''); + } + + if (opts.showDecisions && trace.analysis.decisions.length > 0) { + lines.push('๐ŸŽฏ Key Decisions'); + lines.push('โ”€'.repeat(40)); + for (const d of trace.analysis.decisions.slice(0, 10)) { + lines.push(formatDecision(d)); + } + if (trace.analysis.decisions.length > 10) { + lines.push(` ... and ${trace.analysis.decisions.length - 10} more`); + } + lines.push(''); + } + + if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) { + lines.push('๐Ÿ”ง Tool Calls'); + lines.push('โ”€'.repeat(40)); + for (const tc of trace.analysis.toolCalls.slice(0, 20)) { + lines.push(formatToolCall(tc, opts.verbose)); + } + if (trace.analysis.toolCalls.length > 20) { + lines.push(` ... 
and ${trace.analysis.toolCalls.length - 20} more`); + } + lines.push(''); + } + + if (trace.execution.failures.length > 0) { + lines.push('โŒ Test Failures'); + lines.push('โ”€'.repeat(40)); + for (const f of trace.execution.failures) { + lines.push(formatFailure(f)); + } + lines.push(''); + } + + if (trace.errors.length > 0) { + lines.push('โš ๏ธ Errors'); + lines.push('โ”€'.repeat(40)); + for (const e of trace.errors) { + lines.push(` [${formatDate(e.timestamp)}]`); + lines.push(` ${e.message}`); + if (e.context) { + lines.push(` Context: ${e.context}`); + } + lines.push(''); + } + } + + lines.push('โ•'.repeat(60)); + lines.push(''); + + return lines.join('\n'); +} + +function formatStatus(status: EvalTrace['status']): string { + switch (status) { + case 'success': + return 'โœ… Success'; + case 'partial': + return 'โš ๏ธ Partial'; + case 'failed': + return 'โŒ Failed'; + default: + return status; + } +} + +function formatDate(iso: string): string { + return new Date(iso).toLocaleString(); +} + +function formatDuration(ms: number): string { + if (ms < 1000) { + return `${ms}ms`; + } + if (ms < 60000) { + return `${(ms / 1000).toFixed(1)}s`; + } + const minutes = Math.floor(ms / 60000); + const seconds = ((ms % 60000) / 1000).toFixed(0); + return `${minutes}m ${seconds}s`; +} + +function formatQuestion(q: Question): string { + const lines: string[] = []; + lines.push(` Q: ${q.question}`); + if (q.answer) { + lines.push(` A: ${q.answer}`); + } else { + lines.push(` A: (no answer)`); + } + lines.push(''); + return lines.join('\n'); +} + +function formatDecision(d: Decision): string { + const icon = d.type === 'include' ? 'โœ“' : d.type === 'exclude' ? 
'โœ—' : 'โ†’'; + return ` ${icon} [${d.type}] ${d.subject}\n Reason: ${d.reasoning}\n Confidence: ${(d.confidence * 100).toFixed(0)}%\n`; +} + +function formatToolCall(tc: ToolCall, verbose: boolean): string { + const duration = formatDuration(tc.duration); + if (verbose) { + return ` [${tc.tool}] (${duration})\n Input: ${JSON.stringify(tc.input).slice(0, 100)}...\n`; + } + return ` ${tc.tool} (${duration})`; +} + +function formatFailure(f: TestFailure): string { + const lines: string[] = []; + lines.push(` โ€ข ${f.testName}`); + lines.push(` Scenario: ${f.scenarioId}`); + lines.push(` Error: ${f.error}`); + if (f.expected !== undefined && f.actual !== undefined) { + lines.push(` Expected: ${JSON.stringify(f.expected)}`); + lines.push(` Actual: ${JSON.stringify(f.actual)}`); + } + lines.push(''); + return lines.join('\n'); +} + +export function formatTraceList(traces: Array<{ + id: string; + startedAt: string; + status: string; + duration: number; + testsPassed: number; + testsFailed: number; +}>): string { + const lines: string[] = []; + + lines.push(''); + lines.push('๐Ÿ“‹ Recent Traces'); + lines.push('โ•'.repeat(80)); + lines.push(''); + lines.push('ID Status Passed Failed Duration'); + lines.push('โ”€'.repeat(80)); + + for (const t of traces) { + const statusIcon = t.status === 'success' ? 'โœ…' : t.status === 'partial' ? 
'โš ๏ธ ' : 'โŒ'; + const id = t.id.slice(0, 36); + const passed = String(t.testsPassed).padStart(6); + const failed = String(t.testsFailed).padStart(6); + const duration = formatDuration(t.duration).padStart(8); + lines.push(`${id} ${statusIcon} ${passed} ${failed} ${duration}`); + } + + lines.push(''); + return lines.join('\n'); +} diff --git a/src/observability/tracer.ts b/src/observability/tracer.ts new file mode 100644 index 0000000..c538413 --- /dev/null +++ b/src/observability/tracer.ts @@ -0,0 +1,168 @@ +import { randomUUID } from 'crypto'; +import type { + EvalTrace, + ToolCall, + Question, + Decision, + TraceError, + TestFailure, + IntrospectionTrace, + GenerationTrace, + ExecutionTrace, +} from './types.js'; + +export class Tracer { + private trace: EvalTrace; + private currentToolCall?: { name: string; input: unknown; startTime: number }; + private startTime: number; + + constructor(evalId: string) { + this.startTime = Date.now(); + this.trace = { + id: randomUUID(), + evalId, + startedAt: new Date().toISOString(), + completedAt: '', + duration: 0, + status: 'success', + introspection: { + filesAnalyzed: [], + totalFunctions: 0, + totalClasses: 0, + duration: 0, + }, + analysis: { + promptTokens: 0, + completionTokens: 0, + toolCalls: [], + questionsAsked: [], + decisions: [], + }, + generation: { + scenariosGenerated: 0, + filesWritten: [], + }, + execution: { + testsPassed: 0, + testsFailed: 0, + testsSkipped: 0, + failures: [], + }, + errors: [], + }; + } + + get traceId(): string { + return this.trace.id; + } + + recordToolStart(name: string, input: unknown): void { + this.currentToolCall = { name, input, startTime: Date.now() }; + } + + recordToolEnd(name: string, output: unknown): void { + if (this.currentToolCall?.name === name) { + const toolCall: ToolCall = { + timestamp: new Date().toISOString(), + tool: name, + input: this.currentToolCall.input, + output, + duration: Date.now() - this.currentToolCall.startTime, + }; + 
this.trace.analysis.toolCalls.push(toolCall); + this.currentToolCall = undefined; + } + } + + recordQuestion(question: Question): void { + this.trace.analysis.questionsAsked.push({ + ...question, + timestamp: new Date().toISOString(), + }); + } + + recordAnswer(questionId: string, answer: string): void { + const question = this.trace.analysis.questionsAsked.find(q => q.id === questionId); + if (question) { + question.answer = answer; + } + } + + recordDecision( + type: Decision['type'], + subject: string, + reasoning: string, + confidence: number + ): void { + this.trace.analysis.decisions.push({ + timestamp: new Date().toISOString(), + type, + subject, + reasoning, + confidence: Math.max(0, Math.min(1, confidence)), + }); + } + + recordIntrospection(data: Partial): void { + Object.assign(this.trace.introspection, data); + } + + recordGeneration(data: Partial): void { + Object.assign(this.trace.generation, data); + } + + recordExecution(data: Partial): void { + Object.assign(this.trace.execution, data); + } + + recordTestFailure(failure: TestFailure): void { + this.trace.execution.failures.push(failure); + this.trace.execution.testsFailed++; + } + + recordTestPass(): void { + this.trace.execution.testsPassed++; + } + + recordTokenUsage(promptTokens: number, completionTokens: number): void { + this.trace.analysis.promptTokens += promptTokens; + this.trace.analysis.completionTokens += completionTokens; + } + + recordError(error: Error, context?: string): void { + const traceError: TraceError = { + timestamp: new Date().toISOString(), + message: error.message, + stack: error.stack, + context, + }; + this.trace.errors.push(traceError); + + if (this.trace.status === 'success') { + this.trace.status = 'partial'; + } + } + + setStatus(status: EvalTrace['status']): void { + this.trace.status = status; + } + + finalize(): EvalTrace { + this.trace.completedAt = new Date().toISOString(); + this.trace.duration = Date.now() - this.startTime; + + if (this.trace.errors.length > 0 
&& this.trace.execution.testsPassed === 0) { + this.trace.status = 'failed'; + } + + return this.trace; + } + + getTrace(): EvalTrace { + return { ...this.trace }; + } +} + +export function createTracer(evalId: string): Tracer { + return new Tracer(evalId); +} diff --git a/src/observability/types.ts b/src/observability/types.ts new file mode 100644 index 0000000..59d14cc --- /dev/null +++ b/src/observability/types.ts @@ -0,0 +1,100 @@ +export interface EvalTrace { + id: string; + evalId: string; + startedAt: string; + completedAt: string; + duration: number; + + status: 'success' | 'partial' | 'failed'; + + introspection: IntrospectionTrace; + analysis: AnalysisTrace; + generation: GenerationTrace; + execution: ExecutionTrace; + + errors: TraceError[]; +} + +export interface IntrospectionTrace { + filesAnalyzed: string[]; + totalFunctions: number; + totalClasses: number; + duration: number; +} + +export interface AnalysisTrace { + promptTokens: number; + completionTokens: number; + toolCalls: ToolCall[]; + questionsAsked: Question[]; + decisions: Decision[]; +} + +export interface GenerationTrace { + scenariosGenerated: number; + filesWritten: string[]; +} + +export interface ExecutionTrace { + testsPassed: number; + testsFailed: number; + testsSkipped: number; + failures: TestFailure[]; +} + +export interface ToolCall { + timestamp: string; + tool: string; + input: unknown; + output: unknown; + duration: number; +} + +export interface Question { + id: string; + timestamp: string; + question: string; + options?: string[]; + answer?: string; + defaultAnswer?: string; +} + +export interface Decision { + timestamp: string; + type: 'include' | 'exclude' | 'prioritize' | 'question'; + subject: string; + reasoning: string; + confidence: number; +} + +export interface TestFailure { + scenarioId: string; + testName: string; + error: string; + stack?: string; + expected?: unknown; + actual?: unknown; +} + +export interface TraceError { + timestamp: string; + message: 
string; + stack?: string; + context?: string; +} + +export interface TraceEvent { + timestamp: string; + type: 'tool_start' | 'tool_end' | 'question' | 'decision' | 'error' | 'info'; + data: unknown; +} + +export interface TraceListItem { + id: string; + evalId: string; + startedAt: string; + status: EvalTrace['status']; + duration: number; + testsPassed: number; + testsFailed: number; +} diff --git a/src/promptfoo/config-generator.ts b/src/promptfoo/config-generator.ts new file mode 100644 index 0000000..96536ab --- /dev/null +++ b/src/promptfoo/config-generator.ts @@ -0,0 +1,271 @@ +import { writeFile, mkdir } from 'fs/promises'; +import { dirname, join } from 'path'; +import * as yaml from 'js-yaml'; +import type { EvalSpec, EvalScenario } from '../analyzer/types.js'; +import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js'; + +export interface ConfigOptions { + testDir: string; + outputPath: string; + framework: 'pytest' | 'vitest' | 'jest'; + includeTraceLinks: boolean; +} + +export async function generatePromptfooConfig( + spec: EvalSpec, + options: ConfigOptions +): Promise { + const config = buildConfig(spec, options); + const yamlContent = yaml.dump(config, { + lineWidth: 120, + quotingType: '"', + }); + + await mkdir(dirname(options.outputPath), { recursive: true }); + await writeFile(options.outputPath, yamlContent); + + return yamlContent; +} + +function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig { + const tests = spec.scenarios.map(scenario => buildTest(scenario, options)); + + return { + description: `Evaluclaude functional tests for ${spec.repo.name}`, + providers: [ + { + id: `file://providers/test-runner.py`, + label: 'functional-tests', + config: { + test_dir: options.testDir, + framework: options.framework, + timeout: 300, + }, + }, + ], + prompts: ['{{scenario_id}}'], + tests, + defaultTest: options.includeTraceLinks + ? 
{ + metadata: { + traceFile: '.evaluclaude/traces/{{evalId}}.json', + }, + } + : undefined, + outputPath: '.evaluclaude/results/promptfoo-results.json', + }; +} + +function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest { + const assertions = scenario.assertions + .filter(a => a.type !== 'llm-rubric') + .map(a => buildAssertion(a)); + + const llmRubrics = scenario.assertions + .filter(a => a.type === 'llm-rubric') + .map(a => ({ + type: 'llm-rubric' as const, + value: (a as any).rubric, + threshold: (a as any).passingThreshold ?? 0.7, + })); + + return { + description: scenario.description, + vars: { + scenario_id: scenario.id, + target_module: scenario.target.module, + target_function: scenario.target.function, + input_args: scenario.input.args, + input_kwargs: scenario.input.kwargs, + }, + assert: [...assertions, ...llmRubrics], + metadata: { + category: scenario.category, + priority: scenario.priority, + tags: scenario.tags, + }, + }; +} + +function buildAssertion(assertion: any): PromptfooAssertion { + switch (assertion.type) { + case 'equals': + return { + type: 'equals', + value: assertion.expected, + }; + + case 'contains': + return { + type: 'contains', + value: assertion.value, + }; + + case 'matches': + return { + type: 'regex', + value: assertion.pattern, + }; + + case 'typeof': + return { + type: 'python', + value: `type(output).__name__ == '${assertion.expected}'`, + }; + + case 'throws': + return { + type: 'python', + value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))`, + }; + + case 'truthy': + return { + type: 'python', + value: 'bool(output)', + }; + + case 'falsy': + return { + type: 'python', + value: 'not bool(output)', + }; + + case 'custom': + return { + type: 'python', + value: assertion.check, + }; + + default: + return { + type: 'python', + value: 'True', + }; + } +} + +export async function generateTestProvider(outputPath: string): Promise { + const providerCode = `#!/usr/bin/env python3 
+"""Promptfoo provider that executes tests and returns structured results."""
+
+import subprocess
+import json
+
+def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
+    """Runs tests and returns structured results."""
+
+    test_dir = options.get('config', {}).get('test_dir', './tests')
+    framework = options.get('config', {}).get('framework', 'pytest')
+    timeout = options.get('config', {}).get('timeout', 300)
+
+    scenario_id = prompt.strip()
+
+    try:
+        if framework == 'pytest':
+            result = subprocess.run(
+                [
+                    'python', '-m', 'pytest',
+                    '--json-report',
+                    # NOTE(review): fixed /tmp path — concurrent runs will clobber each other
+                    '--json-report-file=/tmp/pytest_results.json',
+                    '-k', scenario_id,
+                    test_dir
+                ],
+                capture_output=True,
+                text=True,
+                timeout=timeout
+            )
+
+            try:
+                with open('/tmp/pytest_results.json') as f:
+                    report = json.load(f)
+
+                output = {
+                    'passed': report.get('summary', {}).get('passed', 0),
+                    'failed': report.get('summary', {}).get('failed', 0),
+                    'skipped': report.get('summary', {}).get('skipped', 0),
+                    'tests': report.get('tests', []),
+                    'stdout': result.stdout,
+                    'stderr': result.stderr,
+                    'exit_code': result.returncode,
+                }
+            except FileNotFoundError:
+                output = {
+                    'passed': 0,
+                    'failed': 1,
+                    'error': 'Failed to generate pytest report',
+                    'stdout': result.stdout,
+                    'stderr': result.stderr,
+                }
+
+        elif framework in ('vitest', 'jest'):
+            # jest has no 'run' subcommand and no '--reporter=json'; vitest needs 'run' to avoid watch mode
+            cmd = ['npx', 'vitest', 'run', '--reporter=json'] if framework == 'vitest' else ['npx', 'jest', '--json']
+            if scenario_id:
+                cmd.extend(['--testNamePattern', scenario_id])
+            cmd.append(test_dir)
+
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=timeout
+            )
+
+            try:
+                report = json.loads(result.stdout)
+                output = {
+                    'passed': report.get('numPassedTests', 0),
+                    'failed': report.get('numFailedTests', 0),
+                    'skipped': report.get('numSkippedTests', 0),
+                    'tests': report.get('testResults', []),
+                    'exit_code': result.returncode,
+                }
+            except json.JSONDecodeError:
+                output = {
+                    'passed': 0,
+                    'failed': 1,
+                    'error': 'Failed to parse test output',
'stdout': result.stdout, + 'stderr': result.stderr, + } + else: + output = {'error': f'Unknown framework: {framework}'} + + return { + 'output': json.dumps(output), + 'error': None, + } + + except subprocess.TimeoutExpired: + return { + 'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}), + 'error': None, + } + except Exception as e: + return { + 'output': None, + 'error': str(e), + } + +if __name__ == '__main__': + # For testing the provider directly + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--scenario', default='') + parser.add_argument('--test-dir', default='./tests') + parser.add_argument('--framework', default='pytest') + args = parser.parse_args() + + result = get_provider_response( + args.scenario, + {'config': {'test_dir': args.test_dir, 'framework': args.framework}}, + {} + ) + print(json.dumps(result, indent=2)) +`; + + await mkdir(dirname(outputPath), { recursive: true }); + await writeFile(outputPath, providerCode); +} diff --git a/src/promptfoo/index.ts b/src/promptfoo/index.ts new file mode 100644 index 0000000..42deb5c --- /dev/null +++ b/src/promptfoo/index.ts @@ -0,0 +1,2 @@ +export * from './types.js'; +export { generatePromptfooConfig, generateTestProvider } from './config-generator.js'; diff --git a/src/promptfoo/types.ts b/src/promptfoo/types.ts new file mode 100644 index 0000000..7966b55 --- /dev/null +++ b/src/promptfoo/types.ts @@ -0,0 +1,89 @@ +export interface PromptfooConfig { + description?: string; + providers: PromptfooProvider[]; + prompts: string[]; + tests: PromptfooTest[]; + defaultTest?: PromptfooDefaultTest; + outputPath?: string; +} + +export interface PromptfooProvider { + id: string; + label?: string; + config?: Record; +} + +export interface PromptfooTest { + description?: string; + vars?: Record; + assert?: PromptfooAssertion[]; + options?: Record; + metadata?: Record; +} + +export interface PromptfooAssertion { + type: string; + value?: unknown; + 
threshold?: number;
+  weight?: number;
+  provider?: string;
+}
+
+export interface PromptfooDefaultTest {
+  assert?: PromptfooAssertion[];
+  options?: Record<string, unknown>;
+  metadata?: Record<string, unknown>;
+}
+
+export interface PromptfooResult {
+  version: number;
+  timestamp: string;
+  results: PromptfooTestResult[];
+  stats: {
+    successes: number;
+    failures: number;
+    tokenUsage: {
+      total: number;
+      prompt: number;
+      completion: number;
+    };
+  };
+}
+
+export interface PromptfooTestResult {
+  prompt: {
+    raw: string;
+    label: string;
+  };
+  vars: Record<string, unknown>;
+  response: {
+    output: string;
+    tokenUsage?: {
+      total: number;
+      prompt: number;
+      completion: number;
+    };
+  };
+  gradingResult: {
+    pass: boolean;
+    score: number;
+    reason?: string;
+    componentResults?: Array<{
+      pass: boolean;
+      score: number;
+      reason: string;
+      assertion: PromptfooAssertion;
+    }>;
+  };
+  success: boolean;
+  error?: string;
+}
+
+export interface EvalConfig {
+  specPath: string;
+  testDir: string;
+  outputDir: string;
+  framework: 'pytest' | 'vitest' | 'jest';
+  uiPort: number;
+  watch: boolean;
+}
diff --git a/src/runners/index.ts b/src/runners/index.ts
new file mode 100644
index 0000000..ae861b9
--- /dev/null
+++ b/src/runners/index.ts
@@ -0,0 +1,104 @@
+import { existsSync, readdirSync, readFileSync } from 'fs';
+import { join } from 'path';
+import type { Runner, TestFramework, RunnerConfig, ExecutionResult, ExecutionOptions, SandboxConfig } from './types.js';
+import { PytestRunner } from './pytest-runner.js';
+import { VitestRunner, JestRunner } from './vitest-runner.js';
+import { DEFAULT_SANDBOX_CONFIG } from './types.js';
+
+export * from './types.js';
+export { PytestRunner } from './pytest-runner.js';
+export { VitestRunner, JestRunner } from './vitest-runner.js';
+export { sandboxedExec } from './sandbox.js';
+
+const runnerRegistry: Record<TestFramework, new () => Runner> = {
+  pytest: PytestRunner,
+  vitest: VitestRunner,
+  jest: JestRunner,
+};
+
+// Instantiates the runner registered for the given framework.
+export function createRunner(framework: TestFramework): Runner {
+  const RunnerClass = runnerRegistry[framework];
+  if (!RunnerClass) {
+    throw new Error(`Unknown test framework: ${framework}`);
+  }
+  return new RunnerClass();
+}
+
+export async function runTests(
+  testDir: string,
+  options: ExecutionOptions,
+  sandboxConfig: SandboxConfig = DEFAULT_SANDBOX_CONFIG
+): Promise<ExecutionResult> {
+  const runner = createRunner(options.framework);
+
+  const config: RunnerConfig = {
+    testDir,
+    outputFile: `.evaluclaude/results/${options.framework}-${Date.now()}.json`,
+    options,
+    sandboxConfig: options.sandbox ? sandboxConfig : undefined,
+  };
+
+  return runner.run(config);
+}
+
+export function detectTestFramework(testDir: string): TestFramework {
+  // This module is ESM (`import …/types.js`): `require` is unavailable, so use static fs/path imports.
+  const entries = readdirSync(testDir);
+
+  const pythonFiles = entries.filter((f) => f.endsWith('.py'));
+  const tsFiles = entries.filter((f) => f.endsWith('.ts') || f.endsWith('.js'));
+
+  if (pythonFiles.length > tsFiles.length) {
+    return 'pytest';
+  }
+
+  const packageJsonPath = join(testDir, '..', 'package.json');
+  if (existsSync(packageJsonPath)) {
+    try {
+      const pkg = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
+      if (pkg.devDependencies?.jest || pkg.dependencies?.jest) {
+        return 'jest';
+      }
+    } catch {
+      // unreadable/invalid package.json — fall through to the vitest default
+    }
+  }
+
+  return 'vitest';
+}
+
+export function formatResults(result: ExecutionResult): string {
+  const lines: string[] = [];
+
+  lines.push('');
+  lines.push('📊 Test Execution Results');
+  lines.push('═'.repeat(40));
+  lines.push(` Total: ${result.summary.total}`);
+  lines.push(` ✅ Passed: ${result.summary.passed}`);
+  lines.push(` ❌ Failed: ${result.summary.failed}`);
+  lines.push(` ⏭️ Skipped: ${result.summary.skipped}`);
+  lines.push(` ⏱️ Duration: ${result.summary.duration}ms`);
+
+  if (result.errors.length > 0) {
+    lines.push('');
+    lines.push('⚠️ Errors:');
+    for (const error of result.errors) {
+      lines.push(` • ${error}`);
+    }
+  }
+
+  const failures = result.tests.filter(t => t.status === 'failed' || t.status === 'error');
+  if (failures.length > 0) {
lines.push(''); + lines.push('โŒ Failed Tests:'); + for (const test of failures) { + lines.push(` โ€ข ${test.name}`); + if (test.error) { + lines.push(` ${test.error.message}`); + } + } + } + + lines.push(''); + return lines.join('\n'); +} diff --git a/src/runners/pytest-runner.ts b/src/runners/pytest-runner.ts new file mode 100644 index 0000000..c68f5fc --- /dev/null +++ b/src/runners/pytest-runner.ts @@ -0,0 +1,164 @@ +import { readFile, writeFile, mkdir } from 'fs/promises'; +import { existsSync } from 'fs'; +import { join, dirname } from 'path'; +import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js'; +import { sandboxedExec } from './sandbox.js'; + +interface PytestJsonReport { + created: number; + duration: number; + exitcode: number; + root: string; + environment: Record; + summary: { + passed: number; + failed: number; + error: number; + skipped: number; + total: number; + collected: number; + }; + tests: PytestTestResult[]; +} + +interface PytestTestResult { + nodeid: string; + outcome: 'passed' | 'failed' | 'skipped' | 'error'; + keywords: string[]; + setup?: { duration: number; outcome: string }; + call?: { + duration: number; + outcome: string; + crash?: { message: string; path: string; lineno: number }; + traceback?: Array<{ path: string; lineno: number; message: string }>; + longrepr?: string; + }; + teardown?: { duration: number; outcome: string }; +} + +export class PytestRunner implements Runner { + name = 'pytest' as const; + + async run(config: RunnerConfig): Promise { + const { testDir, outputFile, options, sandboxConfig } = config; + + const reportFile = join(testDir, '.pytest_report.json'); + + const args = [ + '-v', + '--tb=short', + '--json-report', + `--json-report-file=${reportFile}`, + ]; + + if (options.parallel) { + args.push('-n', 'auto'); + } + + if (options.filter && options.filter.length > 0) { + args.push('-k', options.filter.join(' or ')); + } + + args.push(testDir); + + const result 
= await sandboxedExec('python', ['-m', 'pytest', ...args], { + cwd: options.cwd || process.cwd(), + timeout: options.timeout, + env: options.env, + sandboxConfig: sandboxConfig, + }); + + let report: PytestJsonReport | undefined; + if (existsSync(reportFile)) { + try { + const content = await readFile(reportFile, 'utf-8'); + report = JSON.parse(content); + } catch (e) { + } + } + + const executionResult = this.parseResults(result.stdout + result.stderr, report); + + if (result.timedOut) { + executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`); + } + + if (outputFile) { + await mkdir(dirname(outputFile), { recursive: true }); + await writeFile(outputFile, JSON.stringify(executionResult, null, 2)); + } + + return executionResult; + } + + parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult { + const report = jsonReport as PytestJsonReport | undefined; + + if (!report) { + return this.parseFromStdout(rawOutput); + } + + const summary: ExecutionSummary = { + total: report.summary.total, + passed: report.summary.passed, + failed: report.summary.failed, + skipped: report.summary.skipped, + duration: report.duration * 1000, + }; + + const tests: TestResult[] = report.tests.map((t) => ({ + id: this.extractScenarioId(t.nodeid), + name: t.nodeid, + status: t.outcome === 'error' ? 'error' : t.outcome, + duration: (t.call?.duration || 0) * 1000, + assertions: { + passed: t.outcome === 'passed' ? 1 : 0, + failed: t.outcome === 'failed' ? 1 : 0, + details: [], + }, + error: t.call?.crash + ? { message: t.call.crash.message, stack: t.call.longrepr } + : undefined, + })); + + return { + summary, + tests, + errors: report.summary.error > 0 ? 
[`${report.summary.error} tests had errors`] : [],
+    };
+  }
+
+  private parseFromStdout(stdout: string): ExecutionResult {
+    const summaryMatch = stdout.match(/(\d+) passed|(\d+) failed|(\d+) skipped|(\d+) error/g); // fallback when no JSON report exists
+
+    let passed = 0, failed = 0, skipped = 0, errored = 0;
+
+    if (summaryMatch) {
+      for (const match of summaryMatch) {
+        const [num, type] = match.split(' ');
+        const count = parseInt(num, 10);
+        if (type === 'passed') passed = count;
+        if (type === 'failed') failed = count;
+        if (type === 'skipped') skipped = count;
+        if (type === 'error') errored = count;
+      }
+    }
+
+    return {
+      summary: {
+        total: passed + failed + skipped + errored,
+        passed,
+        failed,
+        skipped,
+        duration: 0,
+      },
+      tests: [],
+      errors: errored > 0 ? [`${errored} tests had errors`] : [],
+    };
+  }
+
+  private extractScenarioId(nodeid: string): string {
+    const match = nodeid.match(/test_([a-zA-Z0-9_-]+)/);
+    return match ? match[1] : nodeid;
+  }
+}
diff --git a/src/runners/sandbox.ts b/src/runners/sandbox.ts
new file mode 100644
index 0000000..178c023
--- /dev/null
+++ b/src/runners/sandbox.ts
@@ -0,0 +1,122 @@
+import { spawn, type ChildProcess, type SpawnOptions } from 'child_process';
+import type { SandboxConfig } from './types.js';
+
+export interface SandboxedExecResult {
+  exitCode: number;
+  stdout: string;
+  stderr: string;
+  timedOut: boolean;
+}
+
+export async function sandboxedExec(
+  command: string,
+  args: string[],
+  options: {
+    cwd: string;
+    timeout: number;
+    env?: Record<string, string>;
+    sandboxConfig?: SandboxConfig;
+  }
+): Promise<SandboxedExecResult> {
+  const { cwd, timeout, env = {}, sandboxConfig } = options;
+
+  const spawnEnv: Record<string, string> = {};
+
+  if (sandboxConfig?.enabled) {
+    for (const key of sandboxConfig.env.inherit) {
+      if (process.env[key]) {
+        spawnEnv[key] = process.env[key]!;
+      }
+    }
+    Object.assign(spawnEnv, sandboxConfig.env.set);
+  } else {
+    Object.assign(spawnEnv, process.env);
+  }
+
+  Object.assign(spawnEnv, env);
+
+  const spawnOptions: SpawnOptions = {
+    cwd,
+    env: spawnEnv,
+    stdio: ['pipe', 'pipe', 'pipe'],
+  };
+
+  return new Promise((resolve) => {
+    let stdout = '';
+    let stderr = '';
+    let timedOut = false;
+
+    // Apply the platform sandbox wrapper (previously built by buildSandboxCommand but never used).
+    const resolved = sandboxConfig ? buildSandboxCommand(command, args, sandboxConfig) : { command, args };
+    const child: ChildProcess = spawn(resolved.command, resolved.args, spawnOptions);
+
+    const timeoutId = setTimeout(() => {
+      timedOut = true;
+      child.kill('SIGTERM');
+      setTimeout(() => child.kill('SIGKILL'), 1000).unref();
+    }, timeout);
+
+    child.stdout?.on('data', (data: Buffer) => {
+      stdout += data.toString();
+    });
+
+    child.stderr?.on('data', (data: Buffer) => {
+      stderr += data.toString();
+    });
+
+    child.on('close', (code) => {
+      clearTimeout(timeoutId);
+      resolve({
+        exitCode: code ?? 1,
+        stdout,
+        stderr,
+        timedOut,
+      });
+    });
+
+    child.on('error', (err) => {
+      clearTimeout(timeoutId);
+      resolve({
+        exitCode: 1,
+        stdout,
+        stderr: stderr + '\n' + err.message,
+        timedOut: false,
+      });
+    });
+  });
+}
+
+export function buildSandboxCommand(
+  command: string,
+  args: string[],
+  config: SandboxConfig
+): { command: string; args: string[] } {
+  if (!config.enabled) {
+    return { command, args };
+  }
+
+  if (process.platform === 'darwin') {
+    return {
+      command: 'sandbox-exec',
+      args: ['-p', buildSandboxProfile(config), command, ...args],
+    };
+  }
+
+  return { command, args };
+}
+
+function buildSandboxProfile(config: SandboxConfig): string {
+  const rules: string[] = ['(version 1)', '(allow default)'];
+
+  if (!config.network.allowOutbound) {
+    rules.push('(deny network-outbound (remote ip "*:*"))');
+  }
+
+  for (const path of config.filesystem.readOnly) {
+    if (path !== '/') {
+      rules.push(`(deny file-write* (subpath "${path}"))`);
+    }
+  }
+
+  return rules.join('\n');
+}
diff --git a/src/runners/types.ts b/src/runners/types.ts
new file mode 100644
index 0000000..f1f82c4
--- /dev/null
+++ b/src/runners/types.ts
@@ -0,0 +1,95 @@
+export type TestFramework = 'pytest' | 'vitest' | 'jest';
+
+export interface ExecutionOptions {
+  framework: TestFramework;
sandbox: boolean; + timeout: number; + parallel: boolean; + filter?: string[]; + cwd?: string; + env?: Record; +} + +export interface ExecutionResult { + summary: ExecutionSummary; + tests: TestResult[]; + errors: string[]; + traceId?: string; +} + +export interface ExecutionSummary { + total: number; + passed: number; + failed: number; + skipped: number; + duration: number; +} + +export interface TestResult { + id: string; + name: string; + status: 'passed' | 'failed' | 'skipped' | 'error'; + duration: number; + assertions: { + passed: number; + failed: number; + details: AssertionResult[]; + }; + error?: { message: string; stack?: string }; + stdout?: string; + stderr?: string; +} + +export interface AssertionResult { + description: string; + passed: boolean; + expected?: unknown; + actual?: unknown; +} + +export interface SandboxConfig { + enabled: boolean; + autoAllowBashIfSandboxed: boolean; + network: { + allowLocalBinding: boolean; + allowOutbound: boolean; + }; + filesystem: { + readOnly: string[]; + writable: string[]; + }; + env: { + inherit: string[]; + set: Record; + }; +} + +export const DEFAULT_SANDBOX_CONFIG: SandboxConfig = { + enabled: true, + autoAllowBashIfSandboxed: true, + network: { + allowLocalBinding: true, + allowOutbound: false, + }, + filesystem: { + readOnly: ['/'], + writable: ['/tmp', './test-output'], + }, + env: { + inherit: ['PATH', 'HOME', 'USER'], + set: { CI: 'true', NODE_ENV: 'test' }, + }, +}; + +export interface RunnerConfig { + testDir: string; + outputFile: string; + options: ExecutionOptions; + sandboxConfig?: SandboxConfig; +} + +export interface Runner { + name: TestFramework; + run(config: RunnerConfig): Promise; + parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult; +} diff --git a/src/runners/vitest-runner.ts b/src/runners/vitest-runner.ts new file mode 100644 index 0000000..7bf61eb --- /dev/null +++ b/src/runners/vitest-runner.ts @@ -0,0 +1,213 @@ +import { readFile, writeFile, mkdir } from 
'fs/promises'; +import { existsSync } from 'fs'; +import { join, dirname } from 'path'; +import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js'; +import { sandboxedExec } from './sandbox.js'; + +interface VitestJsonReport { + numTotalTestSuites: number; + numPassedTestSuites: number; + numFailedTestSuites: number; + numTotalTests: number; + numPassedTests: number; + numFailedTests: number; + numSkippedTests: number; + startTime: number; + endTime: number; + testResults: VitestTestFile[]; +} + +interface VitestTestFile { + name: string; + status: 'passed' | 'failed'; + startTime: number; + endTime: number; + assertionResults: VitestAssertion[]; +} + +interface VitestAssertion { + ancestorTitles: string[]; + fullName: string; + status: 'passed' | 'failed' | 'skipped'; + title: string; + duration: number; + failureMessages: string[]; +} + +export class VitestRunner implements Runner { + name = 'vitest' as const; + + async run(config: RunnerConfig): Promise { + const { testDir, outputFile, options, sandboxConfig } = config; + + const reportFile = join(testDir, '.vitest_report.json'); + + const args = [ + 'vitest', + 'run', + '--reporter=json', + `--outputFile=${reportFile}`, + ]; + + if (options.filter && options.filter.length > 0) { + args.push('--testNamePattern', options.filter.join('|')); + } + + args.push(testDir); + + const result = await sandboxedExec('npx', args, { + cwd: options.cwd || process.cwd(), + timeout: options.timeout, + env: options.env, + sandboxConfig: sandboxConfig, + }); + + let report: VitestJsonReport | undefined; + if (existsSync(reportFile)) { + try { + const content = await readFile(reportFile, 'utf-8'); + report = JSON.parse(content); + } catch (e) { + } + } + + const executionResult = this.parseResults(result.stdout + result.stderr, report); + + if (result.timedOut) { + executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`); + } + + if (outputFile) { + await 
mkdir(dirname(outputFile), { recursive: true }); + await writeFile(outputFile, JSON.stringify(executionResult, null, 2)); + } + + return executionResult; + } + + parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult { + const report = jsonReport as VitestJsonReport | undefined; + + if (!report) { + return this.parseFromStdout(rawOutput); + } + + const summary: ExecutionSummary = { + total: report.numTotalTests, + passed: report.numPassedTests, + failed: report.numFailedTests, + skipped: report.numSkippedTests, + duration: report.endTime - report.startTime, + }; + + const tests: TestResult[] = []; + + for (const file of report.testResults) { + for (const assertion of file.assertionResults) { + tests.push({ + id: this.extractScenarioId(assertion.fullName), + name: assertion.fullName, + status: assertion.status === 'skipped' ? 'skipped' : assertion.status, + duration: assertion.duration, + assertions: { + passed: assertion.status === 'passed' ? 1 : 0, + failed: assertion.status === 'failed' ? 1 : 0, + details: [], + }, + error: assertion.failureMessages.length > 0 + ? { message: assertion.failureMessages.join('\n') } + : undefined, + }); + } + } + + return { + summary, + tests, + errors: [], + }; + } + + private parseFromStdout(stdout: string): ExecutionResult { + const passMatch = stdout.match(/(\d+) passed/); + const failMatch = stdout.match(/(\d+) failed/); + const skipMatch = stdout.match(/(\d+) skipped/); + + const passed = passMatch ? parseInt(passMatch[1], 10) : 0; + const failed = failMatch ? parseInt(failMatch[1], 10) : 0; + const skipped = skipMatch ? parseInt(skipMatch[1], 10) : 0; + + return { + summary: { + total: passed + failed + skipped, + passed, + failed, + skipped, + duration: 0, + }, + tests: [], + errors: [], + }; + } + + private extractScenarioId(fullName: string): string { + const match = fullName.match(/test[_\s]([a-zA-Z0-9_-]+)/i); + return match ? 
match[1] : fullName.replace(/\s+/g, '_'); + } +} + +export class JestRunner implements Runner { + name = 'jest' as const; + + async run(config: RunnerConfig): Promise { + const { testDir, outputFile, options, sandboxConfig } = config; + + const reportFile = join(testDir, '.jest_report.json'); + + const args = [ + 'jest', + '--json', + `--outputFile=${reportFile}`, + ]; + + if (options.filter && options.filter.length > 0) { + args.push('--testNamePattern', options.filter.join('|')); + } + + args.push(testDir); + + const result = await sandboxedExec('npx', args, { + cwd: options.cwd || process.cwd(), + timeout: options.timeout, + env: options.env, + sandboxConfig: sandboxConfig, + }); + + let report: VitestJsonReport | undefined; + if (existsSync(reportFile)) { + try { + const content = await readFile(reportFile, 'utf-8'); + report = JSON.parse(content); + } catch (e) { + } + } + + const executionResult = this.parseResults(result.stdout + result.stderr, report); + + if (result.timedOut) { + executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`); + } + + if (outputFile) { + await mkdir(dirname(outputFile), { recursive: true }); + await writeFile(outputFile, JSON.stringify(executionResult, null, 2)); + } + + return executionResult; + } + + parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult { + const vitestRunner = new VitestRunner(); + return vitestRunner.parseResults(rawOutput, jsonReport); + } +}