mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-15 06:04:41 +00:00
promptfoo UI and test commands
This commit is contained in:
parent
e0c36241b0
commit
6698c12e5b
18 changed files with 2169 additions and 0 deletions
145
src/cli/commands/run.ts
Normal file
145
src/cli/commands/run.ts
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import { Command } from 'commander';
|
||||
import { existsSync, readFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import {
|
||||
runTests,
|
||||
formatResults,
|
||||
detectTestFramework,
|
||||
type TestFramework,
|
||||
type ExecutionOptions,
|
||||
DEFAULT_SANDBOX_CONFIG
|
||||
} from '../../runners/index.js';
|
||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
export const runCommand = new Command('run')
|
||||
.description('Run generated tests and collect results')
|
||||
.argument('[test-dir]', 'Directory containing test files', './tests/generated')
|
||||
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
|
||||
.option('--sandbox', 'Run tests in sandbox mode', true)
|
||||
.option('--no-sandbox', 'Disable sandbox mode')
|
||||
.option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
|
||||
.option('-p, --parallel', 'Run tests in parallel', false)
|
||||
.option('--filter <patterns...>', 'Run only tests matching patterns')
|
||||
.option('-o, --output <file>', 'Output results to JSON file')
|
||||
.option('--trace', 'Record execution trace', true)
|
||||
.option('--no-trace', 'Disable execution tracing')
|
||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||
.action(async (testDir: string, options) => {
|
||||
try {
|
||||
console.log(`\n🧪 Running tests from ${testDir}...\n`);
|
||||
|
||||
if (!existsSync(testDir)) {
|
||||
console.error(`Error: Test directory not found: ${testDir}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const framework: TestFramework = options.framework || detectTestFramework(testDir);
|
||||
console.log(` Framework: ${framework}`);
|
||||
console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
|
||||
console.log(` Timeout: ${options.timeout}ms`);
|
||||
|
||||
let spec: EvalSpec | undefined;
|
||||
if (options.spec && existsSync(options.spec)) {
|
||||
spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
|
||||
console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
|
||||
}
|
||||
|
||||
const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
|
||||
|
||||
const execOptions: ExecutionOptions = {
|
||||
framework,
|
||||
sandbox: options.sandbox,
|
||||
timeout: parseInt(options.timeout, 10),
|
||||
parallel: options.parallel,
|
||||
filter: options.filter,
|
||||
cwd: process.cwd(),
|
||||
};
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordIntrospection({
|
||||
filesAnalyzed: [testDir],
|
||||
duration: 0,
|
||||
});
|
||||
}
|
||||
|
||||
console.log('\n Running tests...\n');
|
||||
const startTime = Date.now();
|
||||
|
||||
const result = await runTests(
|
||||
testDir,
|
||||
execOptions,
|
||||
options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
|
||||
);
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordExecution({
|
||||
testsPassed: result.summary.passed,
|
||||
testsFailed: result.summary.failed,
|
||||
testsSkipped: result.summary.skipped,
|
||||
});
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (test.status === 'failed' || test.status === 'error') {
|
||||
tracer.recordTestFailure({
|
||||
scenarioId: test.id,
|
||||
testName: test.name,
|
||||
error: test.error?.message || 'Unknown error',
|
||||
stack: test.error?.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(formatResults(result));
|
||||
|
||||
if (spec) {
|
||||
const mappedResults = mapResultsToScenarios(result, spec);
|
||||
console.log(`\n📊 Scenario Coverage:`);
|
||||
console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
|
||||
console.log(` Unmapped: ${mappedResults.unmapped}`);
|
||||
}
|
||||
|
||||
if (options.output) {
|
||||
const { writeFileSync, mkdirSync } = await import('fs');
|
||||
const { dirname } = await import('path');
|
||||
mkdirSync(dirname(options.output), { recursive: true });
|
||||
writeFileSync(options.output, JSON.stringify(result, null, 2));
|
||||
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||
}
|
||||
|
||||
if (tracer) {
|
||||
const trace = tracer.finalize();
|
||||
const tracePath = await saveTrace(trace);
|
||||
console.log(`\n📊 Trace saved: ${tracePath}`);
|
||||
console.log(` View with: evaluclaude view ${trace.id}`);
|
||||
}
|
||||
|
||||
if (result.summary.failed > 0) {
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running tests:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
function mapResultsToScenarios(
|
||||
result: Awaited<ReturnType<typeof runTests>>,
|
||||
spec: EvalSpec
|
||||
): { covered: number; unmapped: number } {
|
||||
const scenarioIds = new Set(spec.scenarios.map(s => s.id));
|
||||
let covered = 0;
|
||||
let unmapped = 0;
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (scenarioIds.has(test.id)) {
|
||||
covered++;
|
||||
} else {
|
||||
unmapped++;
|
||||
}
|
||||
}
|
||||
|
||||
return { covered, unmapped };
|
||||
}
|
||||
236
src/cli/commands/ui.ts
Normal file
236
src/cli/commands/ui.ts
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
import { Command } from 'commander';
|
||||
import { spawn, type ChildProcess } from 'child_process';
|
||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||
|
||||
const EVALUCLAUDE_DIR = '.evaluclaude';
|
||||
const CONFIG_FILE = 'promptfooconfig.yaml';
|
||||
const PROVIDERS_DIR = 'providers';
|
||||
|
||||
export const uiCommand = new Command('ui')
|
||||
.description('Launch the evaluation dashboard UI')
|
||||
.option('-p, --port <port>', 'Port to run the UI on', '3000')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('--generate', 'Regenerate Promptfoo config from spec')
|
||||
.option('--no-open', 'Do not auto-open browser')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const port = parseInt(options.port, 10);
|
||||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
|
||||
if (options.spec && options.generate) {
|
||||
console.log('\n📄 Generating Promptfoo configuration...');
|
||||
|
||||
if (!existsSync(options.spec)) {
|
||||
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
|
||||
console.log(` Config: ${configPath}`);
|
||||
console.log(` Provider: ${providerPath}`);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.log('\n⚠️ No Promptfoo config found.');
|
||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
||||
console.log(' Or create one manually:');
|
||||
console.log(` ${configPath}\n`);
|
||||
|
||||
await createDefaultConfig(configPath, providerPath);
|
||||
console.log(` Created default config at ${configPath}`);
|
||||
}
|
||||
|
||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||
console.log(` Config: ${configPath}\n`);
|
||||
|
||||
await launchPromptfooUI(port, configPath, options.open);
|
||||
} catch (error) {
|
||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const evalCommand = new Command('eval')
|
||||
.description('Run evaluations with Promptfoo and optionally launch UI')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
|
||||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||
.option('--view', 'Launch UI after evaluation', false)
|
||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
|
||||
if (options.spec) {
|
||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.error(`Error: Config not found: ${configPath}`);
|
||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
||||
|
||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||
mkdirSync(dirname(outputFile), { recursive: true });
|
||||
|
||||
await runPromptfooEval(configPath, outputFile);
|
||||
|
||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
||||
|
||||
if (options.view) {
|
||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
async function launchPromptfooUI(
|
||||
port: number,
|
||||
configPath: string,
|
||||
openBrowser: boolean
|
||||
): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = ['promptfoo', 'view', '--port', String(port)];
|
||||
|
||||
if (openBrowser) {
|
||||
args.push('-y');
|
||||
} else {
|
||||
args.push('-n');
|
||||
}
|
||||
|
||||
const configDir = dirname(configPath);
|
||||
args.push(configDir);
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', (error) => {
|
||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
||||
} else {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
process.on('SIGINT', () => {
|
||||
child.kill('SIGINT');
|
||||
process.exit(0);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = [
|
||||
'promptfoo',
|
||||
'eval',
|
||||
'-c', configPath,
|
||||
'-o', outputFile,
|
||||
'--no-cache',
|
||||
];
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', reject);
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||
# Generated by evaluclaude
|
||||
|
||||
description: "Evaluclaude functional test evaluations"
|
||||
|
||||
providers:
|
||||
- id: file://${providerPath}
|
||||
label: functional-tests
|
||||
config:
|
||||
test_dir: ./tests/generated
|
||||
framework: pytest
|
||||
timeout: 300
|
||||
|
||||
prompts:
|
||||
- "{{scenario_id}}"
|
||||
|
||||
tests:
|
||||
- description: "Example test"
|
||||
vars:
|
||||
scenario_id: "test_example"
|
||||
assert:
|
||||
- type: python
|
||||
value: |
|
||||
import json
|
||||
result = json.loads(output)
|
||||
result.get('passed', 0) > 0
|
||||
|
||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||
`;
|
||||
|
||||
mkdirSync(dirname(configPath), { recursive: true });
|
||||
writeFileSync(configPath, defaultConfig);
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
||||
if (spec.repo.languages.includes('python')) {
|
||||
return 'pytest';
|
||||
}
|
||||
return 'vitest';
|
||||
}
|
||||
90
src/cli/commands/view.ts
Normal file
90
src/cli/commands/view.ts
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import { Command } from 'commander';
|
||||
import {
|
||||
loadTrace,
|
||||
listTraces,
|
||||
getLatestTrace,
|
||||
formatTrace,
|
||||
formatTraceList
|
||||
} from '../../observability/index.js';
|
||||
|
||||
export const viewCommand = new Command('view')
|
||||
.description('View evaluation traces')
|
||||
.argument('[trace-id]', 'Specific trace ID to view')
|
||||
.option('--last', 'View the most recent trace')
|
||||
.option('--list', 'List all traces')
|
||||
.option('--json', 'Output as raw JSON')
|
||||
.option('-v, --verbose', 'Show verbose output including tool calls')
|
||||
.option('--tools', 'Show tool call details')
|
||||
.option('--questions', 'Show questions asked', true)
|
||||
.option('--decisions', 'Show decisions made', true)
|
||||
.option('-n, --limit <count>', 'Limit number of traces listed', '20')
|
||||
.option('--eval <eval-id>', 'Filter traces by eval ID')
|
||||
.action(async (traceId: string | undefined, options) => {
|
||||
try {
|
||||
if (options.list) {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
|
||||
if (traces.length > limited.length) {
|
||||
console.log(`Showing ${limited.length} of ${traces.length} traces.`);
|
||||
console.log(`Use --limit to see more.\n`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let trace;
|
||||
|
||||
if (options.last || !traceId) {
|
||||
trace = await getLatestTrace();
|
||||
if (!trace) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
trace = await loadTrace(traceId);
|
||||
if (!trace) {
|
||||
console.error(`\nTrace not found: ${traceId}`);
|
||||
console.log('Use `evaluclaude view --list` to see available traces.\n');
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
const output = formatTrace(trace, {
|
||||
json: options.json,
|
||||
verbose: options.verbose,
|
||||
showToolCalls: options.tools || options.verbose,
|
||||
showQuestions: options.questions,
|
||||
showDecisions: options.decisions,
|
||||
});
|
||||
|
||||
console.log(output);
|
||||
} catch (error) {
|
||||
console.error('Error viewing trace:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const tracesCommand = new Command('traces')
|
||||
.description('List all evaluation traces (alias for view --list)')
|
||||
.option('-n, --limit <count>', 'Limit number of traces', '20')
|
||||
.option('--eval <eval-id>', 'Filter by eval ID')
|
||||
.action(async (options) => {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
});
|
||||
|
|
@ -5,6 +5,9 @@ import { introCommand } from './commands/intro.js';
|
|||
import { analyzeCommand } from './commands/analyze.js';
|
||||
import { renderCommand } from './commands/render.js';
|
||||
import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/grade.js';
|
||||
import { runCommand } from './commands/run.js';
|
||||
import { viewCommand, tracesCommand } from './commands/view.js';
|
||||
import { uiCommand, evalCommand } from './commands/ui.js';
|
||||
|
||||
const program = new Command();
|
||||
|
||||
|
|
@ -19,5 +22,10 @@ program.addCommand(renderCommand);
|
|||
program.addCommand(gradeCommand);
|
||||
program.addCommand(listRubricsCommand);
|
||||
program.addCommand(calibrateCommand);
|
||||
program.addCommand(runCommand);
|
||||
program.addCommand(viewCommand);
|
||||
program.addCommand(tracesCommand);
|
||||
program.addCommand(uiCommand);
|
||||
program.addCommand(evalCommand);
|
||||
|
||||
program.parse(process.argv);
|
||||
|
|
|
|||
|
|
@ -2,3 +2,6 @@ export * from './introspector/index.js';
|
|||
export * from './analyzer/index.js';
|
||||
export * from './renderers/index.js';
|
||||
export * from './graders/index.js';
|
||||
export * from './runners/index.js';
|
||||
export * from './observability/index.js';
|
||||
export * from './promptfoo/index.js';
|
||||
|
|
|
|||
15
src/observability/index.ts
Normal file
15
src/observability/index.ts
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
export * from './types.js';
|
||||
export { Tracer, createTracer } from './tracer.js';
|
||||
export {
|
||||
TraceStore,
|
||||
traceStore,
|
||||
saveTrace,
|
||||
loadTrace,
|
||||
listTraces,
|
||||
getLatestTrace
|
||||
} from './trace-store.js';
|
||||
export {
|
||||
formatTrace,
|
||||
formatTraceList,
|
||||
type ViewOptions
|
||||
} from './trace-viewer.js';
|
||||
117
src/observability/trace-store.ts
Normal file
117
src/observability/trace-store.ts
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
import { mkdir, readdir, readFile, writeFile } from 'fs/promises';
|
||||
import { existsSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import type { EvalTrace, TraceListItem } from './types.js';
|
||||
|
||||
const DEFAULT_TRACES_DIR = '.evaluclaude/traces';
|
||||
|
||||
export class TraceStore {
|
||||
private tracesDir: string;
|
||||
|
||||
constructor(tracesDir: string = DEFAULT_TRACES_DIR) {
|
||||
this.tracesDir = tracesDir;
|
||||
}
|
||||
|
||||
async save(trace: EvalTrace): Promise<string> {
|
||||
await mkdir(this.tracesDir, { recursive: true });
|
||||
const filePath = join(this.tracesDir, `${trace.id}.json`);
|
||||
await writeFile(filePath, JSON.stringify(trace, null, 2));
|
||||
return filePath;
|
||||
}
|
||||
|
||||
async load(traceId: string): Promise<EvalTrace | null> {
|
||||
const filePath = join(this.tracesDir, `${traceId}.json`);
|
||||
if (!existsSync(filePath)) {
|
||||
return null;
|
||||
}
|
||||
const content = await readFile(filePath, 'utf-8');
|
||||
return JSON.parse(content) as EvalTrace;
|
||||
}
|
||||
|
||||
async list(evalId?: string): Promise<TraceListItem[]> {
|
||||
if (!existsSync(this.tracesDir)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const files = await readdir(this.tracesDir);
|
||||
const jsonFiles = files.filter(f => f.endsWith('.json'));
|
||||
|
||||
const traces: TraceListItem[] = [];
|
||||
|
||||
for (const file of jsonFiles) {
|
||||
try {
|
||||
const content = await readFile(join(this.tracesDir, file), 'utf-8');
|
||||
const trace = JSON.parse(content) as EvalTrace;
|
||||
|
||||
if (evalId && trace.evalId !== evalId) {
|
||||
continue;
|
||||
}
|
||||
|
||||
traces.push({
|
||||
id: trace.id,
|
||||
evalId: trace.evalId,
|
||||
startedAt: trace.startedAt,
|
||||
status: trace.status,
|
||||
duration: trace.duration,
|
||||
testsPassed: trace.execution.testsPassed,
|
||||
testsFailed: trace.execution.testsFailed,
|
||||
});
|
||||
} catch (e) {
|
||||
}
|
||||
}
|
||||
|
||||
return traces.sort((a, b) =>
|
||||
new Date(b.startedAt).getTime() - new Date(a.startedAt).getTime()
|
||||
);
|
||||
}
|
||||
|
||||
async getLatest(): Promise<EvalTrace | null> {
|
||||
const traces = await this.list();
|
||||
if (traces.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return this.load(traces[0].id);
|
||||
}
|
||||
|
||||
async delete(traceId: string): Promise<boolean> {
|
||||
const filePath = join(this.tracesDir, `${traceId}.json`);
|
||||
if (!existsSync(filePath)) {
|
||||
return false;
|
||||
}
|
||||
const { unlink } = await import('fs/promises');
|
||||
await unlink(filePath);
|
||||
return true;
|
||||
}
|
||||
|
||||
async cleanup(keepCount: number = 50): Promise<number> {
|
||||
const traces = await this.list();
|
||||
const toDelete = traces.slice(keepCount);
|
||||
|
||||
let deleted = 0;
|
||||
for (const trace of toDelete) {
|
||||
if (await this.delete(trace.id)) {
|
||||
deleted++;
|
||||
}
|
||||
}
|
||||
|
||||
return deleted;
|
||||
}
|
||||
}
|
||||
|
||||
export const traceStore = new TraceStore();
|
||||
|
||||
export async function saveTrace(trace: EvalTrace): Promise<string> {
|
||||
return traceStore.save(trace);
|
||||
}
|
||||
|
||||
export async function loadTrace(traceId: string): Promise<EvalTrace | null> {
|
||||
return traceStore.load(traceId);
|
||||
}
|
||||
|
||||
export async function listTraces(evalId?: string): Promise<TraceListItem[]> {
|
||||
return traceStore.list(evalId);
|
||||
}
|
||||
|
||||
export async function getLatestTrace(): Promise<EvalTrace | null> {
|
||||
return traceStore.getLatest();
|
||||
}
|
||||
226
src/observability/trace-viewer.ts
Normal file
226
src/observability/trace-viewer.ts
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
import type { EvalTrace, ToolCall, Question, Decision, TestFailure } from './types.js';
|
||||
|
||||
export interface ViewOptions {
|
||||
json: boolean;
|
||||
verbose: boolean;
|
||||
showToolCalls: boolean;
|
||||
showQuestions: boolean;
|
||||
showDecisions: boolean;
|
||||
}
|
||||
|
||||
const DEFAULT_VIEW_OPTIONS: ViewOptions = {
|
||||
json: false,
|
||||
verbose: false,
|
||||
showToolCalls: false,
|
||||
showQuestions: true,
|
||||
showDecisions: true,
|
||||
};
|
||||
|
||||
export function formatTrace(trace: EvalTrace, options: Partial<ViewOptions> = {}): string {
|
||||
const opts = { ...DEFAULT_VIEW_OPTIONS, ...options };
|
||||
|
||||
if (opts.json) {
|
||||
return JSON.stringify(trace, null, 2);
|
||||
}
|
||||
|
||||
const lines: string[] = [];
|
||||
|
||||
lines.push('');
|
||||
lines.push('═'.repeat(60));
|
||||
lines.push(`📊 Trace: ${trace.id}`);
|
||||
lines.push('═'.repeat(60));
|
||||
lines.push('');
|
||||
|
||||
lines.push(` Status: ${formatStatus(trace.status)}`);
|
||||
lines.push(` Started: ${formatDate(trace.startedAt)}`);
|
||||
lines.push(` Duration: ${formatDuration(trace.duration)}`);
|
||||
lines.push(` Eval ID: ${trace.evalId}`);
|
||||
lines.push('');
|
||||
|
||||
lines.push('📂 Introspection');
|
||||
lines.push('─'.repeat(40));
|
||||
lines.push(` Files analyzed: ${trace.introspection.filesAnalyzed.length}`);
|
||||
lines.push(` Functions found: ${trace.introspection.totalFunctions}`);
|
||||
lines.push(` Classes found: ${trace.introspection.totalClasses}`);
|
||||
lines.push(` Duration: ${formatDuration(trace.introspection.duration)}`);
|
||||
lines.push('');
|
||||
|
||||
lines.push('🤖 Analysis');
|
||||
lines.push('─'.repeat(40));
|
||||
lines.push(` Tool calls: ${trace.analysis.toolCalls.length}`);
|
||||
lines.push(` Questions asked: ${trace.analysis.questionsAsked.length}`);
|
||||
lines.push(` Decisions made: ${trace.analysis.decisions.length}`);
|
||||
lines.push(` Prompt tokens: ${trace.analysis.promptTokens.toLocaleString()}`);
|
||||
lines.push(` Completion tokens: ${trace.analysis.completionTokens.toLocaleString()}`);
|
||||
lines.push('');
|
||||
|
||||
lines.push('📝 Generation');
|
||||
lines.push('─'.repeat(40));
|
||||
lines.push(` Scenarios: ${trace.generation.scenariosGenerated}`);
|
||||
lines.push(` Files written: ${trace.generation.filesWritten.length}`);
|
||||
lines.push('');
|
||||
|
||||
lines.push('🧪 Execution');
|
||||
lines.push('─'.repeat(40));
|
||||
lines.push(` ✅ Passed: ${trace.execution.testsPassed}`);
|
||||
lines.push(` ❌ Failed: ${trace.execution.testsFailed}`);
|
||||
lines.push(` ⏭️ Skipped: ${trace.execution.testsSkipped}`);
|
||||
lines.push('');
|
||||
|
||||
if (opts.showQuestions && trace.analysis.questionsAsked.length > 0) {
|
||||
lines.push('❓ Questions Asked');
|
||||
lines.push('─'.repeat(40));
|
||||
for (const q of trace.analysis.questionsAsked) {
|
||||
lines.push(formatQuestion(q));
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
if (opts.showDecisions && trace.analysis.decisions.length > 0) {
|
||||
lines.push('🎯 Key Decisions');
|
||||
lines.push('─'.repeat(40));
|
||||
for (const d of trace.analysis.decisions.slice(0, 10)) {
|
||||
lines.push(formatDecision(d));
|
||||
}
|
||||
if (trace.analysis.decisions.length > 10) {
|
||||
lines.push(` ... and ${trace.analysis.decisions.length - 10} more`);
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
if (opts.showToolCalls && trace.analysis.toolCalls.length > 0) {
|
||||
lines.push('🔧 Tool Calls');
|
||||
lines.push('─'.repeat(40));
|
||||
for (const tc of trace.analysis.toolCalls.slice(0, 20)) {
|
||||
lines.push(formatToolCall(tc, opts.verbose));
|
||||
}
|
||||
if (trace.analysis.toolCalls.length > 20) {
|
||||
lines.push(` ... and ${trace.analysis.toolCalls.length - 20} more`);
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
if (trace.execution.failures.length > 0) {
|
||||
lines.push('❌ Test Failures');
|
||||
lines.push('─'.repeat(40));
|
||||
for (const f of trace.execution.failures) {
|
||||
lines.push(formatFailure(f));
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
if (trace.errors.length > 0) {
|
||||
lines.push('⚠️ Errors');
|
||||
lines.push('─'.repeat(40));
|
||||
for (const e of trace.errors) {
|
||||
lines.push(` [${formatDate(e.timestamp)}]`);
|
||||
lines.push(` ${e.message}`);
|
||||
if (e.context) {
|
||||
lines.push(` Context: ${e.context}`);
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
}
|
||||
|
||||
lines.push('═'.repeat(60));
|
||||
lines.push('');
|
||||
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
function formatStatus(status: EvalTrace['status']): string {
|
||||
switch (status) {
|
||||
case 'success':
|
||||
return '✅ Success';
|
||||
case 'partial':
|
||||
return '⚠️ Partial';
|
||||
case 'failed':
|
||||
return '❌ Failed';
|
||||
default:
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
function formatDate(iso: string): string {
|
||||
return new Date(iso).toLocaleString();
|
||||
}
|
||||
|
||||
function formatDuration(ms: number): string {
|
||||
if (ms < 1000) {
|
||||
return `${ms}ms`;
|
||||
}
|
||||
if (ms < 60000) {
|
||||
return `${(ms / 1000).toFixed(1)}s`;
|
||||
}
|
||||
const minutes = Math.floor(ms / 60000);
|
||||
const seconds = ((ms % 60000) / 1000).toFixed(0);
|
||||
return `${minutes}m ${seconds}s`;
|
||||
}
|
||||
|
||||
function formatQuestion(q: Question): string {
|
||||
const lines: string[] = [];
|
||||
lines.push(` Q: ${q.question}`);
|
||||
if (q.answer) {
|
||||
lines.push(` A: ${q.answer}`);
|
||||
} else {
|
||||
lines.push(` A: (no answer)`);
|
||||
}
|
||||
lines.push('');
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
function formatDecision(d: Decision): string {
|
||||
const icon = d.type === 'include' ? '✓' : d.type === 'exclude' ? '✗' : '→';
|
||||
return ` ${icon} [${d.type}] ${d.subject}\n Reason: ${d.reasoning}\n Confidence: ${(d.confidence * 100).toFixed(0)}%\n`;
|
||||
}
|
||||
|
||||
function formatToolCall(tc: ToolCall, verbose: boolean): string {
|
||||
const duration = formatDuration(tc.duration);
|
||||
if (verbose) {
|
||||
return ` [${tc.tool}] (${duration})\n Input: ${JSON.stringify(tc.input).slice(0, 100)}...\n`;
|
||||
}
|
||||
return ` ${tc.tool} (${duration})`;
|
||||
}
|
||||
|
||||
function formatFailure(f: TestFailure): string {
|
||||
const lines: string[] = [];
|
||||
lines.push(` • ${f.testName}`);
|
||||
lines.push(` Scenario: ${f.scenarioId}`);
|
||||
lines.push(` Error: ${f.error}`);
|
||||
if (f.expected !== undefined && f.actual !== undefined) {
|
||||
lines.push(` Expected: ${JSON.stringify(f.expected)}`);
|
||||
lines.push(` Actual: ${JSON.stringify(f.actual)}`);
|
||||
}
|
||||
lines.push('');
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
export function formatTraceList(traces: Array<{
|
||||
id: string;
|
||||
startedAt: string;
|
||||
status: string;
|
||||
duration: number;
|
||||
testsPassed: number;
|
||||
testsFailed: number;
|
||||
}>): string {
|
||||
const lines: string[] = [];
|
||||
|
||||
lines.push('');
|
||||
lines.push('📋 Recent Traces');
|
||||
lines.push('═'.repeat(80));
|
||||
lines.push('');
|
||||
lines.push('ID Status Passed Failed Duration');
|
||||
lines.push('─'.repeat(80));
|
||||
|
||||
for (const t of traces) {
|
||||
const statusIcon = t.status === 'success' ? '✅' : t.status === 'partial' ? '⚠️ ' : '❌';
|
||||
const id = t.id.slice(0, 36);
|
||||
const passed = String(t.testsPassed).padStart(6);
|
||||
const failed = String(t.testsFailed).padStart(6);
|
||||
const duration = formatDuration(t.duration).padStart(8);
|
||||
lines.push(`${id} ${statusIcon} ${passed} ${failed} ${duration}`);
|
||||
}
|
||||
|
||||
lines.push('');
|
||||
return lines.join('\n');
|
||||
}
|
||||
168
src/observability/tracer.ts
Normal file
168
src/observability/tracer.ts
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
import { randomUUID } from 'crypto';
|
||||
import type {
|
||||
EvalTrace,
|
||||
ToolCall,
|
||||
Question,
|
||||
Decision,
|
||||
TraceError,
|
||||
TestFailure,
|
||||
IntrospectionTrace,
|
||||
GenerationTrace,
|
||||
ExecutionTrace,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Accumulates a structured EvalTrace across the four phases of an eval run
 * (introspection → analysis → generation → execution), plus run-level errors.
 * One Tracer instance corresponds to one eval run; call finalize() once at the end.
 */
export class Tracer {
  // The trace being built; mutated in place by the record* methods.
  private trace: EvalTrace;
  // At most one tool call is tracked at a time (recordToolStart/recordToolEnd
  // pairs are assumed not to nest or interleave).
  private currentToolCall?: { name: string; input: unknown; startTime: number };
  // Wall-clock start, used by finalize() to compute total duration.
  private startTime: number;

  constructor(evalId: string) {
    this.startTime = Date.now();
    // Start from an empty trace with all counters zeroed; status is
    // optimistic ('success') and is downgraded by recordError()/finalize().
    this.trace = {
      id: randomUUID(),
      evalId,
      startedAt: new Date().toISOString(),
      completedAt: '', // filled in by finalize()
      duration: 0,
      status: 'success',
      introspection: {
        filesAnalyzed: [],
        totalFunctions: 0,
        totalClasses: 0,
        duration: 0,
      },
      analysis: {
        promptTokens: 0,
        completionTokens: 0,
        toolCalls: [],
        questionsAsked: [],
        decisions: [],
      },
      generation: {
        scenariosGenerated: 0,
        filesWritten: [],
      },
      execution: {
        testsPassed: 0,
        testsFailed: 0,
        testsSkipped: 0,
        failures: [],
      },
      errors: [],
    };
  }

  /** Unique id of the trace being built. */
  get traceId(): string {
    return this.trace.id;
  }

  /** Marks the start of a tool invocation; pair with recordToolEnd(). */
  recordToolStart(name: string, input: unknown): void {
    this.currentToolCall = { name, input, startTime: Date.now() };
  }

  /**
   * Completes the pending tool invocation started with recordToolStart().
   * Silently ignored when `name` does not match the pending call.
   */
  recordToolEnd(name: string, output: unknown): void {
    if (this.currentToolCall?.name === name) {
      const toolCall: ToolCall = {
        timestamp: new Date().toISOString(),
        tool: name,
        input: this.currentToolCall.input,
        output,
        duration: Date.now() - this.currentToolCall.startTime,
      };
      this.trace.analysis.toolCalls.push(toolCall);
      this.currentToolCall = undefined;
    }
  }

  /** Appends an analysis question; its timestamp is (re)stamped here. */
  recordQuestion(question: Question): void {
    this.trace.analysis.questionsAsked.push({
      ...question,
      timestamp: new Date().toISOString(),
    });
  }

  /** Attaches an answer to a previously recorded question; no-op for unknown ids. */
  recordAnswer(questionId: string, answer: string): void {
    const question = this.trace.analysis.questionsAsked.find(q => q.id === questionId);
    if (question) {
      question.answer = answer;
    }
  }

  /** Records an analysis decision; confidence is clamped into [0, 1]. */
  recordDecision(
    type: Decision['type'],
    subject: string,
    reasoning: string,
    confidence: number
  ): void {
    this.trace.analysis.decisions.push({
      timestamp: new Date().toISOString(),
      type,
      subject,
      reasoning,
      confidence: Math.max(0, Math.min(1, confidence)),
    });
  }

  /** Merges partial introspection-phase data into the trace. */
  recordIntrospection(data: Partial<IntrospectionTrace>): void {
    Object.assign(this.trace.introspection, data);
  }

  /** Merges partial generation-phase data into the trace. */
  recordGeneration(data: Partial<GenerationTrace>): void {
    Object.assign(this.trace.generation, data);
  }

  /** Merges partial execution-phase data into the trace. */
  recordExecution(data: Partial<ExecutionTrace>): void {
    Object.assign(this.trace.execution, data);
  }

  /** Records one failing test and bumps the failed-test counter. */
  recordTestFailure(failure: TestFailure): void {
    this.trace.execution.failures.push(failure);
    this.trace.execution.testsFailed++;
  }

  /** Bumps the passed-test counter. */
  recordTestPass(): void {
    this.trace.execution.testsPassed++;
  }

  /** Adds token usage to the running analysis totals. */
  recordTokenUsage(promptTokens: number, completionTokens: number): void {
    this.trace.analysis.promptTokens += promptTokens;
    this.trace.analysis.completionTokens += completionTokens;
  }

  /**
   * Records an error and downgrades a 'success' status to 'partial'.
   * finalize() may later downgrade further to 'failed'.
   */
  recordError(error: Error, context?: string): void {
    const traceError: TraceError = {
      timestamp: new Date().toISOString(),
      message: error.message,
      stack: error.stack,
      context,
    };
    this.trace.errors.push(traceError);

    if (this.trace.status === 'success') {
      this.trace.status = 'partial';
    }
  }

  /** Explicitly overrides the trace status. */
  setStatus(status: EvalTrace['status']): void {
    this.trace.status = status;
  }

  /**
   * Stamps completion time/duration and returns the finished trace.
   * Any recorded error with zero passing tests forces status 'failed'.
   */
  finalize(): EvalTrace {
    this.trace.completedAt = new Date().toISOString();
    this.trace.duration = Date.now() - this.startTime;

    if (this.trace.errors.length > 0 && this.trace.execution.testsPassed === 0) {
      this.trace.status = 'failed';
    }

    return this.trace;
  }

  /** Shallow copy of the trace as built so far (nested objects are shared). */
  getTrace(): EvalTrace {
    return { ...this.trace };
  }
}
|
||||
|
||||
export function createTracer(evalId: string): Tracer {
|
||||
return new Tracer(evalId);
|
||||
}
|
||||
100
src/observability/types.ts
Normal file
100
src/observability/types.ts
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
/** Complete record of one eval run, assembled by Tracer. */
export interface EvalTrace {
  id: string;          // unique trace id (UUID)
  evalId: string;      // id of the eval this trace belongs to
  startedAt: string;   // ISO-8601 timestamp
  completedAt: string; // ISO-8601; empty string until finalized
  duration: number;    // total wall-clock milliseconds

  // 'partial' = errors were recorded but the run still produced results.
  status: 'success' | 'partial' | 'failed';

  introspection: IntrospectionTrace;
  analysis: AnalysisTrace;
  generation: GenerationTrace;
  execution: ExecutionTrace;

  errors: TraceError[];
}

/** Metrics from the code-introspection phase. */
export interface IntrospectionTrace {
  filesAnalyzed: string[];
  totalFunctions: number;
  totalClasses: number;
  duration: number; // milliseconds
}

/** Activity recorded during the LLM analysis phase. */
export interface AnalysisTrace {
  promptTokens: number;
  completionTokens: number;
  toolCalls: ToolCall[];
  questionsAsked: Question[];
  decisions: Decision[];
}

/** Output of the test-generation phase. */
export interface GenerationTrace {
  scenariosGenerated: number;
  filesWritten: string[];
}

/** Aggregate counts from the test-execution phase. */
export interface ExecutionTrace {
  testsPassed: number;
  testsFailed: number;
  testsSkipped: number;
  failures: TestFailure[];
}

/** One tool invocation made during analysis. */
export interface ToolCall {
  timestamp: string; // ISO-8601
  tool: string;
  input: unknown;
  output: unknown;
  duration: number; // milliseconds
}

/** A question raised during analysis, possibly answered later. */
export interface Question {
  id: string;
  timestamp: string; // ISO-8601
  question: string;
  options?: string[];     // multiple-choice options, if any
  answer?: string;        // set via Tracer.recordAnswer
  defaultAnswer?: string; // fallback when unanswered
}

/** A recorded analysis decision with its rationale. */
export interface Decision {
  timestamp: string; // ISO-8601
  type: 'include' | 'exclude' | 'prioritize' | 'question';
  subject: string;
  reasoning: string;
  confidence: number; // clamped to [0, 1] by Tracer.recordDecision
}

/** Details of one failing generated test. */
export interface TestFailure {
  scenarioId: string;
  testName: string;
  error: string;
  stack?: string;
  expected?: unknown;
  actual?: unknown;
}

/** An error captured during any phase of the run. */
export interface TraceError {
  timestamp: string; // ISO-8601
  message: string;
  stack?: string;
  context?: string; // where in the pipeline the error occurred
}

/** Generic event envelope for streaming trace consumers. */
export interface TraceEvent {
  timestamp: string;
  type: 'tool_start' | 'tool_end' | 'question' | 'decision' | 'error' | 'info';
  data: unknown;
}

/** Summary row used when listing stored traces. */
export interface TraceListItem {
  id: string;
  evalId: string;
  startedAt: string;
  status: EvalTrace['status'];
  duration: number;
  testsPassed: number;
  testsFailed: number;
}
|
||||
271
src/promptfoo/config-generator.ts
Normal file
271
src/promptfoo/config-generator.ts
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
import { writeFile, mkdir } from 'fs/promises';
|
||||
import { dirname, join } from 'path';
|
||||
import * as yaml from 'js-yaml';
|
||||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
||||
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
||||
|
||||
/** Options controlling how the promptfoo config is generated. */
export interface ConfigOptions {
  // Directory containing the generated test files.
  testDir: string;
  // Where the YAML config is written.
  outputPath: string;
  // Test framework the provider script should invoke.
  framework: 'pytest' | 'vitest' | 'jest';
  // When true, defaultTest.metadata links each run to its trace file.
  includeTraceLinks: boolean;
}
|
||||
|
||||
export async function generatePromptfooConfig(
|
||||
spec: EvalSpec,
|
||||
options: ConfigOptions
|
||||
): Promise<string> {
|
||||
const config = buildConfig(spec, options);
|
||||
const yamlContent = yaml.dump(config, {
|
||||
lineWidth: 120,
|
||||
quotingType: '"',
|
||||
});
|
||||
|
||||
await mkdir(dirname(options.outputPath), { recursive: true });
|
||||
await writeFile(options.outputPath, yamlContent);
|
||||
|
||||
return yamlContent;
|
||||
}
|
||||
|
||||
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
||||
|
||||
return {
|
||||
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
||||
providers: [
|
||||
{
|
||||
id: `file://providers/test-runner.py`,
|
||||
label: 'functional-tests',
|
||||
config: {
|
||||
test_dir: options.testDir,
|
||||
framework: options.framework,
|
||||
timeout: 300,
|
||||
},
|
||||
},
|
||||
],
|
||||
prompts: ['{{scenario_id}}'],
|
||||
tests,
|
||||
defaultTest: options.includeTraceLinks
|
||||
? {
|
||||
metadata: {
|
||||
traceFile: '.evaluclaude/traces/{{evalId}}.json',
|
||||
},
|
||||
}
|
||||
: undefined,
|
||||
outputPath: '.evaluclaude/results/promptfoo-results.json',
|
||||
};
|
||||
}
|
||||
|
||||
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
|
||||
const assertions = scenario.assertions
|
||||
.filter(a => a.type !== 'llm-rubric')
|
||||
.map(a => buildAssertion(a));
|
||||
|
||||
const llmRubrics = scenario.assertions
|
||||
.filter(a => a.type === 'llm-rubric')
|
||||
.map(a => ({
|
||||
type: 'llm-rubric' as const,
|
||||
value: (a as any).rubric,
|
||||
threshold: (a as any).passingThreshold ?? 0.7,
|
||||
}));
|
||||
|
||||
return {
|
||||
description: scenario.description,
|
||||
vars: {
|
||||
scenario_id: scenario.id,
|
||||
target_module: scenario.target.module,
|
||||
target_function: scenario.target.function,
|
||||
input_args: scenario.input.args,
|
||||
input_kwargs: scenario.input.kwargs,
|
||||
},
|
||||
assert: [...assertions, ...llmRubrics],
|
||||
metadata: {
|
||||
category: scenario.category,
|
||||
priority: scenario.priority,
|
||||
tags: scenario.tags,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function buildAssertion(assertion: any): PromptfooAssertion {
|
||||
switch (assertion.type) {
|
||||
case 'equals':
|
||||
return {
|
||||
type: 'equals',
|
||||
value: assertion.expected,
|
||||
};
|
||||
|
||||
case 'contains':
|
||||
return {
|
||||
type: 'contains',
|
||||
value: assertion.value,
|
||||
};
|
||||
|
||||
case 'matches':
|
||||
return {
|
||||
type: 'regex',
|
||||
value: assertion.pattern,
|
||||
};
|
||||
|
||||
case 'typeof':
|
||||
return {
|
||||
type: 'python',
|
||||
value: `type(output).__name__ == '${assertion.expected}'`,
|
||||
};
|
||||
|
||||
case 'throws':
|
||||
return {
|
||||
type: 'python',
|
||||
value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))`,
|
||||
};
|
||||
|
||||
case 'truthy':
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'bool(output)',
|
||||
};
|
||||
|
||||
case 'falsy':
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'not bool(output)',
|
||||
};
|
||||
|
||||
case 'custom':
|
||||
return {
|
||||
type: 'python',
|
||||
value: assertion.check,
|
||||
};
|
||||
|
||||
default:
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'True',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes a standalone Python promptfoo provider script to `outputPath`.
 * The provider runs the configured test framework for a single scenario id
 * (passed as the prompt) and returns a JSON summary as the provider output.
 * The script body below is emitted verbatim — do not edit it for style here.
 */
export async function generateTestProvider(outputPath: string): Promise<void> {
  const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results."""

import subprocess
import json
import sys
import os

def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
    """Runs tests and returns structured results."""

    test_dir = options.get('config', {}).get('test_dir', './tests')
    framework = options.get('config', {}).get('framework', 'pytest')
    timeout = options.get('config', {}).get('timeout', 300)

    scenario_id = prompt.strip()

    try:
        if framework == 'pytest':
            result = subprocess.run(
                [
                    'python', '-m', 'pytest',
                    '--json-report',
                    '--json-report-file=/tmp/pytest_results.json',
                    '-k', scenario_id,
                    test_dir
                ],
                capture_output=True,
                text=True,
                timeout=timeout
            )

            try:
                with open('/tmp/pytest_results.json') as f:
                    report = json.load(f)

                output = {
                    'passed': report.get('summary', {}).get('passed', 0),
                    'failed': report.get('summary', {}).get('failed', 0),
                    'skipped': report.get('summary', {}).get('skipped', 0),
                    'tests': report.get('tests', []),
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                    'exit_code': result.returncode,
                }
            except FileNotFoundError:
                output = {
                    'passed': 0,
                    'failed': 1,
                    'error': 'Failed to generate pytest report',
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                }

        elif framework in ('vitest', 'jest'):
            cmd = ['npx', framework, 'run', '--reporter=json']
            if scenario_id:
                cmd.extend(['--testNamePattern', scenario_id])
            cmd.append(test_dir)

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )

            try:
                report = json.loads(result.stdout)
                output = {
                    'passed': report.get('numPassedTests', 0),
                    'failed': report.get('numFailedTests', 0),
                    'skipped': report.get('numSkippedTests', 0),
                    'tests': report.get('testResults', []),
                    'exit_code': result.returncode,
                }
            except json.JSONDecodeError:
                output = {
                    'passed': 0,
                    'failed': 1,
                    'error': 'Failed to parse test output',
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                }
        else:
            output = {'error': f'Unknown framework: {framework}'}

        return {
            'output': json.dumps(output),
            'error': None,
        }

    except subprocess.TimeoutExpired:
        return {
            'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
            'error': None,
        }
    except Exception as e:
        return {
            'output': None,
            'error': str(e),
        }

if __name__ == '__main__':
    # For testing the provider directly
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scenario', default='')
    parser.add_argument('--test-dir', default='./tests')
    parser.add_argument('--framework', default='pytest')
    args = parser.parse_args()

    result = get_provider_response(
        args.scenario,
        {'config': {'test_dir': args.test_dir, 'framework': args.framework}},
        {}
    )
    print(json.dumps(result, indent=2))
`;

  // Ensure the providers/ directory exists before writing the script.
  await mkdir(dirname(outputPath), { recursive: true });
  await writeFile(outputPath, providerCode);
}
|
||||
2
src/promptfoo/index.ts
Normal file
2
src/promptfoo/index.ts
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
export * from './types.js';
|
||||
export { generatePromptfooConfig, generateTestProvider } from './config-generator.js';
|
||||
89
src/promptfoo/types.ts
Normal file
89
src/promptfoo/types.ts
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
/** Root shape of a generated promptfoo YAML config. */
export interface PromptfooConfig {
  description?: string;
  providers: PromptfooProvider[];
  prompts: string[];
  tests: PromptfooTest[];
  defaultTest?: PromptfooDefaultTest; // merged into every test by promptfoo
  outputPath?: string;                // where promptfoo writes its results JSON
}

/** A promptfoo provider entry (here: the local test-runner script). */
export interface PromptfooProvider {
  id: string;
  label?: string;
  config?: Record<string, unknown>;
}

/** One promptfoo test case. */
export interface PromptfooTest {
  description?: string;
  vars?: Record<string, unknown>;
  assert?: PromptfooAssertion[];
  options?: Record<string, unknown>;
  metadata?: Record<string, unknown>;
}

/** One assertion within a promptfoo test. */
export interface PromptfooAssertion {
  type: string;       // e.g. 'equals', 'contains', 'regex', 'python', 'llm-rubric'
  value?: unknown;
  threshold?: number; // minimum passing score (used by llm-rubric)
  weight?: number;
  provider?: string;
}

/** Defaults applied to every test in the config. */
export interface PromptfooDefaultTest {
  assert?: PromptfooAssertion[];
  options?: Record<string, unknown>;
  metadata?: Record<string, unknown>;
}

/** Shape of promptfoo's results JSON file. */
export interface PromptfooResult {
  version: number;
  timestamp: string;
  results: PromptfooTestResult[];
  stats: {
    successes: number;
    failures: number;
    tokenUsage: {
      total: number;
      prompt: number;
      completion: number;
    };
  };
}

/** One test's outcome inside PromptfooResult. */
export interface PromptfooTestResult {
  prompt: {
    raw: string;
    label: string;
  };
  vars: Record<string, unknown>;
  response: {
    output: string;
    tokenUsage?: {
      total: number;
      prompt: number;
      completion: number;
    };
  };
  gradingResult: {
    pass: boolean;
    score: number;
    reason?: string;
    componentResults?: Array<{
      pass: boolean;
      score: number;
      reason: string;
      assertion: PromptfooAssertion;
    }>;
  };
  success: boolean;
  error?: string;
}

/** Top-level configuration for an eval run (spec, output locations, UI). */
export interface EvalConfig {
  specPath: string;
  testDir: string;
  outputDir: string;
  framework: 'pytest' | 'vitest' | 'jest';
  uiPort: number;
  watch: boolean;
}
|
||||
101
src/runners/index.ts
Normal file
101
src/runners/index.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import { existsSync, readdirSync, readFileSync } from 'fs';
import { join } from 'path';

import type { Runner, TestFramework, RunnerConfig, ExecutionResult, ExecutionOptions, SandboxConfig } from './types.js';
import { DEFAULT_SANDBOX_CONFIG } from './types.js';
import { PytestRunner } from './pytest-runner.js';
import { VitestRunner, JestRunner } from './vitest-runner.js';
|
||||
|
||||
export * from './types.js';
|
||||
export { PytestRunner } from './pytest-runner.js';
|
||||
export { VitestRunner, JestRunner } from './vitest-runner.js';
|
||||
export { sandboxedExec } from './sandbox.js';
|
||||
|
||||
// Maps each supported framework to its Runner implementation class.
const runnerRegistry: Record<TestFramework, new () => Runner> = {
  pytest: PytestRunner,
  vitest: VitestRunner,
  jest: JestRunner,
};
|
||||
|
||||
export function createRunner(framework: TestFramework): Runner {
|
||||
const RunnerClass = runnerRegistry[framework];
|
||||
if (!RunnerClass) {
|
||||
throw new Error(`Unknown test framework: ${framework}`);
|
||||
}
|
||||
return new RunnerClass();
|
||||
}
|
||||
|
||||
export async function runTests(
|
||||
testDir: string,
|
||||
options: ExecutionOptions,
|
||||
sandboxConfig: SandboxConfig = DEFAULT_SANDBOX_CONFIG
|
||||
): Promise<ExecutionResult> {
|
||||
const runner = createRunner(options.framework);
|
||||
|
||||
const config: RunnerConfig = {
|
||||
testDir,
|
||||
outputFile: `.evaluclaude/results/${options.framework}-${Date.now()}.json`,
|
||||
options,
|
||||
sandboxConfig: options.sandbox ? sandboxConfig : undefined,
|
||||
};
|
||||
|
||||
return runner.run(config);
|
||||
}
|
||||
|
||||
export function detectTestFramework(testDir: string): TestFramework {
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
const pythonFiles = fs.readdirSync(testDir).filter((f: string) => f.endsWith('.py'));
|
||||
const tsFiles = fs.readdirSync(testDir).filter((f: string) => f.endsWith('.ts') || f.endsWith('.js'));
|
||||
|
||||
if (pythonFiles.length > tsFiles.length) {
|
||||
return 'pytest';
|
||||
}
|
||||
|
||||
const packageJsonPath = path.join(testDir, '..', 'package.json');
|
||||
if (fs.existsSync(packageJsonPath)) {
|
||||
try {
|
||||
const pkg = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
|
||||
if (pkg.devDependencies?.jest || pkg.dependencies?.jest) {
|
||||
return 'jest';
|
||||
}
|
||||
} catch (e) {
|
||||
}
|
||||
}
|
||||
|
||||
return 'vitest';
|
||||
}
|
||||
|
||||
export function formatResults(result: ExecutionResult): string {
|
||||
const lines: string[] = [];
|
||||
|
||||
lines.push('');
|
||||
lines.push('📊 Test Execution Results');
|
||||
lines.push('═'.repeat(40));
|
||||
lines.push(` Total: ${result.summary.total}`);
|
||||
lines.push(` ✅ Passed: ${result.summary.passed}`);
|
||||
lines.push(` ❌ Failed: ${result.summary.failed}`);
|
||||
lines.push(` ⏭️ Skipped: ${result.summary.skipped}`);
|
||||
lines.push(` ⏱️ Duration: ${result.summary.duration}ms`);
|
||||
|
||||
if (result.errors.length > 0) {
|
||||
lines.push('');
|
||||
lines.push('⚠️ Errors:');
|
||||
for (const error of result.errors) {
|
||||
lines.push(` • ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
const failures = result.tests.filter(t => t.status === 'failed' || t.status === 'error');
|
||||
if (failures.length > 0) {
|
||||
lines.push('');
|
||||
lines.push('❌ Failed Tests:');
|
||||
for (const test of failures) {
|
||||
lines.push(` • ${test.name}`);
|
||||
if (test.error) {
|
||||
lines.push(` ${test.error.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lines.push('');
|
||||
return lines.join('\n');
|
||||
}
|
||||
164
src/runners/pytest-runner.ts
Normal file
164
src/runners/pytest-runner.ts
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
import { readFile, writeFile, mkdir } from 'fs/promises';
|
||||
import { existsSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
|
||||
import { sandboxedExec } from './sandbox.js';
|
||||
|
||||
/** Shape of the pytest-json-report output file. */
interface PytestJsonReport {
  created: number;
  duration: number; // seconds (converted to ms by PytestRunner.parseResults)
  exitcode: number;
  root: string;
  environment: Record<string, string>;
  summary: {
    passed: number;
    failed: number;
    error: number;
    skipped: number;
    total: number;
    collected: number;
  };
  tests: PytestTestResult[];
}

/** One test entry in the pytest JSON report. */
interface PytestTestResult {
  nodeid: string; // e.g. "tests/test_foo.py::test_case"
  outcome: 'passed' | 'failed' | 'skipped' | 'error';
  keywords: string[];
  setup?: { duration: number; outcome: string };
  call?: {
    duration: number; // seconds
    outcome: string;
    crash?: { message: string; path: string; lineno: number };
    traceback?: Array<{ path: string; lineno: number; message: string }>;
    longrepr?: string; // full failure representation
  };
  teardown?: { duration: number; outcome: string };
}
|
||||
|
||||
export class PytestRunner implements Runner {
|
||||
name = 'pytest' as const;
|
||||
|
||||
async run(config: RunnerConfig): Promise<ExecutionResult> {
|
||||
const { testDir, outputFile, options, sandboxConfig } = config;
|
||||
|
||||
const reportFile = join(testDir, '.pytest_report.json');
|
||||
|
||||
const args = [
|
||||
'-v',
|
||||
'--tb=short',
|
||||
'--json-report',
|
||||
`--json-report-file=${reportFile}`,
|
||||
];
|
||||
|
||||
if (options.parallel) {
|
||||
args.push('-n', 'auto');
|
||||
}
|
||||
|
||||
if (options.filter && options.filter.length > 0) {
|
||||
args.push('-k', options.filter.join(' or '));
|
||||
}
|
||||
|
||||
args.push(testDir);
|
||||
|
||||
const result = await sandboxedExec('python', ['-m', 'pytest', ...args], {
|
||||
cwd: options.cwd || process.cwd(),
|
||||
timeout: options.timeout,
|
||||
env: options.env,
|
||||
sandboxConfig: sandboxConfig,
|
||||
});
|
||||
|
||||
let report: PytestJsonReport | undefined;
|
||||
if (existsSync(reportFile)) {
|
||||
try {
|
||||
const content = await readFile(reportFile, 'utf-8');
|
||||
report = JSON.parse(content);
|
||||
} catch (e) {
|
||||
}
|
||||
}
|
||||
|
||||
const executionResult = this.parseResults(result.stdout + result.stderr, report);
|
||||
|
||||
if (result.timedOut) {
|
||||
executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
|
||||
}
|
||||
|
||||
if (outputFile) {
|
||||
await mkdir(dirname(outputFile), { recursive: true });
|
||||
await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
|
||||
}
|
||||
|
||||
return executionResult;
|
||||
}
|
||||
|
||||
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
|
||||
const report = jsonReport as PytestJsonReport | undefined;
|
||||
|
||||
if (!report) {
|
||||
return this.parseFromStdout(rawOutput);
|
||||
}
|
||||
|
||||
const summary: ExecutionSummary = {
|
||||
total: report.summary.total,
|
||||
passed: report.summary.passed,
|
||||
failed: report.summary.failed,
|
||||
skipped: report.summary.skipped,
|
||||
duration: report.duration * 1000,
|
||||
};
|
||||
|
||||
const tests: TestResult[] = report.tests.map((t) => ({
|
||||
id: this.extractScenarioId(t.nodeid),
|
||||
name: t.nodeid,
|
||||
status: t.outcome === 'error' ? 'error' : t.outcome,
|
||||
duration: (t.call?.duration || 0) * 1000,
|
||||
assertions: {
|
||||
passed: t.outcome === 'passed' ? 1 : 0,
|
||||
failed: t.outcome === 'failed' ? 1 : 0,
|
||||
details: [],
|
||||
},
|
||||
error: t.call?.crash
|
||||
? { message: t.call.crash.message, stack: t.call.longrepr }
|
||||
: undefined,
|
||||
}));
|
||||
|
||||
return {
|
||||
summary,
|
||||
tests,
|
||||
errors: report.summary.error > 0 ? [`${report.summary.error} tests had errors`] : [],
|
||||
};
|
||||
}
|
||||
|
||||
private parseFromStdout(stdout: string): ExecutionResult {
|
||||
const lines = stdout.split('\n');
|
||||
const summaryMatch = stdout.match(/(\d+) passed|(\d+) failed|(\d+) skipped|(\d+) error/g);
|
||||
|
||||
let passed = 0, failed = 0, skipped = 0;
|
||||
|
||||
if (summaryMatch) {
|
||||
for (const match of summaryMatch) {
|
||||
const [num, type] = match.split(' ');
|
||||
const count = parseInt(num, 10);
|
||||
if (type === 'passed') passed = count;
|
||||
if (type === 'failed') failed = count;
|
||||
if (type === 'skipped') skipped = count;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
summary: {
|
||||
total: passed + failed + skipped,
|
||||
passed,
|
||||
failed,
|
||||
skipped,
|
||||
duration: 0,
|
||||
},
|
||||
tests: [],
|
||||
errors: [],
|
||||
};
|
||||
}
|
||||
|
||||
private extractScenarioId(nodeid: string): string {
|
||||
const match = nodeid.match(/test_([a-zA-Z0-9_-]+)/);
|
||||
return match ? match[1] : nodeid;
|
||||
}
|
||||
}
|
||||
126
src/runners/sandbox.ts
Normal file
126
src/runners/sandbox.ts
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
import { spawn, type ChildProcess, type SpawnOptions } from 'child_process';
|
||||
import type { SandboxConfig, DEFAULT_SANDBOX_CONFIG } from './types.js';
|
||||
|
||||
/** Outcome of a sandboxedExec call (it never rejects; errors fold into these fields). */
export interface SandboxedExecResult {
  exitCode: number; // child exit code; 1 on spawn error or missing exit code
  stdout: string;
  stderr: string;   // spawn errors are appended here
  timedOut: boolean; // true when the overall timeout fired
}
|
||||
|
||||
export async function sandboxedExec(
|
||||
command: string,
|
||||
args: string[],
|
||||
options: {
|
||||
cwd: string;
|
||||
timeout: number;
|
||||
env?: Record<string, string>;
|
||||
sandboxConfig?: SandboxConfig;
|
||||
}
|
||||
): Promise<SandboxedExecResult> {
|
||||
const { cwd, timeout, env = {}, sandboxConfig } = options;
|
||||
|
||||
const spawnEnv: Record<string, string> = {};
|
||||
|
||||
if (sandboxConfig?.enabled) {
|
||||
for (const key of sandboxConfig.env.inherit) {
|
||||
if (process.env[key]) {
|
||||
spawnEnv[key] = process.env[key]!;
|
||||
}
|
||||
}
|
||||
Object.assign(spawnEnv, sandboxConfig.env.set);
|
||||
} else {
|
||||
Object.assign(spawnEnv, process.env);
|
||||
}
|
||||
|
||||
Object.assign(spawnEnv, env);
|
||||
|
||||
const spawnOptions: SpawnOptions = {
|
||||
cwd,
|
||||
env: spawnEnv,
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
};
|
||||
|
||||
return new Promise((resolve) => {
|
||||
let stdout = '';
|
||||
let stderr = '';
|
||||
let timedOut = false;
|
||||
|
||||
const child: ChildProcess = spawn(command, args, spawnOptions);
|
||||
|
||||
const timeoutId = setTimeout(() => {
|
||||
timedOut = true;
|
||||
child.kill('SIGTERM');
|
||||
setTimeout(() => child.kill('SIGKILL'), 1000);
|
||||
}, timeout);
|
||||
|
||||
child.stdout?.on('data', (data: Buffer) => {
|
||||
stdout += data.toString();
|
||||
});
|
||||
|
||||
child.stderr?.on('data', (data: Buffer) => {
|
||||
stderr += data.toString();
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
clearTimeout(timeoutId);
|
||||
resolve({
|
||||
exitCode: code ?? 1,
|
||||
stdout,
|
||||
stderr,
|
||||
timedOut,
|
||||
});
|
||||
});
|
||||
|
||||
child.on('error', (err) => {
|
||||
clearTimeout(timeoutId);
|
||||
resolve({
|
||||
exitCode: 1,
|
||||
stdout,
|
||||
stderr: stderr + '\n' + err.message,
|
||||
timedOut: false,
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
export function buildSandboxCommand(
|
||||
command: string,
|
||||
args: string[],
|
||||
config: SandboxConfig
|
||||
): { command: string; args: string[] } {
|
||||
if (!config.enabled) {
|
||||
return { command, args };
|
||||
}
|
||||
|
||||
if (process.platform === 'darwin') {
|
||||
const sandboxArgs: string[] = [];
|
||||
|
||||
if (!config.network.allowOutbound) {
|
||||
sandboxArgs.push('--deny-network-outbound');
|
||||
}
|
||||
|
||||
return {
|
||||
command: 'sandbox-exec',
|
||||
args: ['-p', buildSandboxProfile(config), command, ...args],
|
||||
};
|
||||
}
|
||||
|
||||
return { command, args };
|
||||
}
|
||||
|
||||
function buildSandboxProfile(config: SandboxConfig): string {
|
||||
const rules: string[] = ['(version 1)', '(allow default)'];
|
||||
|
||||
if (!config.network.allowOutbound) {
|
||||
rules.push('(deny network-outbound (remote ip "*:*"))');
|
||||
}
|
||||
|
||||
for (const path of config.filesystem.readOnly) {
|
||||
if (path !== '/') {
|
||||
rules.push(`(deny file-write* (subpath "${path}"))`);
|
||||
}
|
||||
}
|
||||
|
||||
return rules.join('\n');
|
||||
}
|
||||
95
src/runners/types.ts
Normal file
95
src/runners/types.ts
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
// Test frameworks supported by the runner registry.
export type TestFramework = 'pytest' | 'vitest' | 'jest';

/** How a test run should be executed. */
export interface ExecutionOptions {
  framework: TestFramework;
  sandbox: boolean;  // run the framework under the sandbox wrapper
  timeout: number;   // overall run timeout in milliseconds
  parallel: boolean; // pytest only: enables -n auto (pytest-xdist)
  filter?: string[]; // test-name patterns used to select tests
  cwd?: string;      // defaults to process.cwd()
  env?: Record<string, string>; // extra environment for the child process
}

/** Normalized outcome of a test run, independent of framework. */
export interface ExecutionResult {
  summary: ExecutionSummary;
  tests: TestResult[];
  errors: string[]; // run-level errors (e.g. timeout), not per-test failures
  traceId?: string; // link to the observability trace, when recorded
}

/** Aggregate counts for a run. */
export interface ExecutionSummary {
  total: number;
  passed: number;
  failed: number;
  skipped: number;
  duration: number; // milliseconds
}

/** One test's normalized result. */
export interface TestResult {
  id: string;   // scenario id derived from the framework test name
  name: string; // framework-native test identifier
  status: 'passed' | 'failed' | 'skipped' | 'error';
  duration: number; // milliseconds
  assertions: {
    passed: number;
    failed: number;
    details: AssertionResult[];
  };
  error?: { message: string; stack?: string };
  stdout?: string;
  stderr?: string;
}

/** A single assertion outcome (when the framework exposes them). */
export interface AssertionResult {
  description: string;
  passed: boolean;
  expected?: unknown;
  actual?: unknown;
}

/** Restrictions applied when running tests sandboxed. */
export interface SandboxConfig {
  enabled: boolean;
  autoAllowBashIfSandboxed: boolean;
  network: {
    allowLocalBinding: boolean;
    allowOutbound: boolean;
  };
  filesystem: {
    readOnly: string[]; // paths the tests may not write to
    writable: string[]; // paths explicitly writable
  };
  env: {
    inherit: string[];           // allow-list of parent env vars to copy
    set: Record<string, string>; // vars forced into the child env
  };
}

// Default sandbox: no outbound network, read-only filesystem except
// /tmp and ./test-output, minimal inherited environment.
export const DEFAULT_SANDBOX_CONFIG: SandboxConfig = {
  enabled: true,
  autoAllowBashIfSandboxed: true,
  network: {
    allowLocalBinding: true,
    allowOutbound: false,
  },
  filesystem: {
    readOnly: ['/'],
    writable: ['/tmp', './test-output'],
  },
  env: {
    inherit: ['PATH', 'HOME', 'USER'],
    set: { CI: 'true', NODE_ENV: 'test' },
  },
};

/** Inputs to a single Runner.run invocation. */
export interface RunnerConfig {
  testDir: string;    // directory containing the generated tests
  outputFile: string; // where to persist the normalized result JSON
  options: ExecutionOptions;
  sandboxConfig?: SandboxConfig; // undefined disables sandboxing
}

/** Contract implemented by each framework runner. */
export interface Runner {
  name: TestFramework;
  run(config: RunnerConfig): Promise<ExecutionResult>;
  parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult;
}
|
||||
213
src/runners/vitest-runner.ts
Normal file
213
src/runners/vitest-runner.ts
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
import { readFile, writeFile, mkdir } from 'fs/promises';
|
||||
import { existsSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import type { Runner, RunnerConfig, ExecutionResult, TestResult, ExecutionSummary } from './types.js';
|
||||
import { sandboxedExec } from './sandbox.js';
|
||||
|
||||
/** Shape of the vitest/jest-style JSON reporter output. */
interface VitestJsonReport {
  numTotalTestSuites: number;
  numPassedTestSuites: number;
  numFailedTestSuites: number;
  numTotalTests: number;
  numPassedTests: number;
  numFailedTests: number;
  numSkippedTests: number;
  startTime: number; // epoch milliseconds
  endTime: number;   // epoch milliseconds
  testResults: VitestTestFile[];
}

/** Results for one test file. */
interface VitestTestFile {
  name: string; // file path
  status: 'passed' | 'failed';
  startTime: number;
  endTime: number;
  assertionResults: VitestAssertion[];
}

/** One individual test within a file. */
interface VitestAssertion {
  ancestorTitles: string[]; // enclosing describe() titles
  fullName: string;
  status: 'passed' | 'failed' | 'skipped';
  title: string;
  duration: number; // milliseconds
  failureMessages: string[];
}
|
||||
|
||||
export class VitestRunner implements Runner {
|
||||
name = 'vitest' as const;
|
||||
|
||||
async run(config: RunnerConfig): Promise<ExecutionResult> {
|
||||
const { testDir, outputFile, options, sandboxConfig } = config;
|
||||
|
||||
const reportFile = join(testDir, '.vitest_report.json');
|
||||
|
||||
const args = [
|
||||
'vitest',
|
||||
'run',
|
||||
'--reporter=json',
|
||||
`--outputFile=${reportFile}`,
|
||||
];
|
||||
|
||||
if (options.filter && options.filter.length > 0) {
|
||||
args.push('--testNamePattern', options.filter.join('|'));
|
||||
}
|
||||
|
||||
args.push(testDir);
|
||||
|
||||
const result = await sandboxedExec('npx', args, {
|
||||
cwd: options.cwd || process.cwd(),
|
||||
timeout: options.timeout,
|
||||
env: options.env,
|
||||
sandboxConfig: sandboxConfig,
|
||||
});
|
||||
|
||||
let report: VitestJsonReport | undefined;
|
||||
if (existsSync(reportFile)) {
|
||||
try {
|
||||
const content = await readFile(reportFile, 'utf-8');
|
||||
report = JSON.parse(content);
|
||||
} catch (e) {
|
||||
}
|
||||
}
|
||||
|
||||
const executionResult = this.parseResults(result.stdout + result.stderr, report);
|
||||
|
||||
if (result.timedOut) {
|
||||
executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
|
||||
}
|
||||
|
||||
if (outputFile) {
|
||||
await mkdir(dirname(outputFile), { recursive: true });
|
||||
await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
|
||||
}
|
||||
|
||||
return executionResult;
|
||||
}
|
||||
|
||||
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
|
||||
const report = jsonReport as VitestJsonReport | undefined;
|
||||
|
||||
if (!report) {
|
||||
return this.parseFromStdout(rawOutput);
|
||||
}
|
||||
|
||||
const summary: ExecutionSummary = {
|
||||
total: report.numTotalTests,
|
||||
passed: report.numPassedTests,
|
||||
failed: report.numFailedTests,
|
||||
skipped: report.numSkippedTests,
|
||||
duration: report.endTime - report.startTime,
|
||||
};
|
||||
|
||||
const tests: TestResult[] = [];
|
||||
|
||||
for (const file of report.testResults) {
|
||||
for (const assertion of file.assertionResults) {
|
||||
tests.push({
|
||||
id: this.extractScenarioId(assertion.fullName),
|
||||
name: assertion.fullName,
|
||||
status: assertion.status === 'skipped' ? 'skipped' : assertion.status,
|
||||
duration: assertion.duration,
|
||||
assertions: {
|
||||
passed: assertion.status === 'passed' ? 1 : 0,
|
||||
failed: assertion.status === 'failed' ? 1 : 0,
|
||||
details: [],
|
||||
},
|
||||
error: assertion.failureMessages.length > 0
|
||||
? { message: assertion.failureMessages.join('\n') }
|
||||
: undefined,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
summary,
|
||||
tests,
|
||||
errors: [],
|
||||
};
|
||||
}
|
||||
|
||||
private parseFromStdout(stdout: string): ExecutionResult {
|
||||
const passMatch = stdout.match(/(\d+) passed/);
|
||||
const failMatch = stdout.match(/(\d+) failed/);
|
||||
const skipMatch = stdout.match(/(\d+) skipped/);
|
||||
|
||||
const passed = passMatch ? parseInt(passMatch[1], 10) : 0;
|
||||
const failed = failMatch ? parseInt(failMatch[1], 10) : 0;
|
||||
const skipped = skipMatch ? parseInt(skipMatch[1], 10) : 0;
|
||||
|
||||
return {
|
||||
summary: {
|
||||
total: passed + failed + skipped,
|
||||
passed,
|
||||
failed,
|
||||
skipped,
|
||||
duration: 0,
|
||||
},
|
||||
tests: [],
|
||||
errors: [],
|
||||
};
|
||||
}
|
||||
|
||||
private extractScenarioId(fullName: string): string {
|
||||
const match = fullName.match(/test[_\s]([a-zA-Z0-9_-]+)/i);
|
||||
return match ? match[1] : fullName.replace(/\s+/g, '_');
|
||||
}
|
||||
}
|
||||
|
||||
export class JestRunner implements Runner {
|
||||
name = 'jest' as const;
|
||||
|
||||
async run(config: RunnerConfig): Promise<ExecutionResult> {
|
||||
const { testDir, outputFile, options, sandboxConfig } = config;
|
||||
|
||||
const reportFile = join(testDir, '.jest_report.json');
|
||||
|
||||
const args = [
|
||||
'jest',
|
||||
'--json',
|
||||
`--outputFile=${reportFile}`,
|
||||
];
|
||||
|
||||
if (options.filter && options.filter.length > 0) {
|
||||
args.push('--testNamePattern', options.filter.join('|'));
|
||||
}
|
||||
|
||||
args.push(testDir);
|
||||
|
||||
const result = await sandboxedExec('npx', args, {
|
||||
cwd: options.cwd || process.cwd(),
|
||||
timeout: options.timeout,
|
||||
env: options.env,
|
||||
sandboxConfig: sandboxConfig,
|
||||
});
|
||||
|
||||
let report: VitestJsonReport | undefined;
|
||||
if (existsSync(reportFile)) {
|
||||
try {
|
||||
const content = await readFile(reportFile, 'utf-8');
|
||||
report = JSON.parse(content);
|
||||
} catch (e) {
|
||||
}
|
||||
}
|
||||
|
||||
const executionResult = this.parseResults(result.stdout + result.stderr, report);
|
||||
|
||||
if (result.timedOut) {
|
||||
executionResult.errors.push(`Test execution timed out after ${options.timeout}ms`);
|
||||
}
|
||||
|
||||
if (outputFile) {
|
||||
await mkdir(dirname(outputFile), { recursive: true });
|
||||
await writeFile(outputFile, JSON.stringify(executionResult, null, 2));
|
||||
}
|
||||
|
||||
return executionResult;
|
||||
}
|
||||
|
||||
parseResults(rawOutput: string, jsonReport?: unknown): ExecutionResult {
|
||||
const vitestRunner = new VitestRunner();
|
||||
return vitestRunner.parseResults(rawOutput, jsonReport);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue