mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-18 00:02:46 +00:00
promptfoo ui and testcon
This commit is contained in:
parent
e0c36241b0
commit
6698c12e5b
18 changed files with 2169 additions and 0 deletions
145
src/cli/commands/run.ts
Normal file
145
src/cli/commands/run.ts
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
import { Command } from 'commander';
|
||||
import { existsSync, readFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import {
|
||||
runTests,
|
||||
formatResults,
|
||||
detectTestFramework,
|
||||
type TestFramework,
|
||||
type ExecutionOptions,
|
||||
DEFAULT_SANDBOX_CONFIG
|
||||
} from '../../runners/index.js';
|
||||
import { createTracer, saveTrace } from '../../observability/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
export const runCommand = new Command('run')
|
||||
.description('Run generated tests and collect results')
|
||||
.argument('[test-dir]', 'Directory containing test files', './tests/generated')
|
||||
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON for result mapping')
|
||||
.option('--sandbox', 'Run tests in sandbox mode', true)
|
||||
.option('--no-sandbox', 'Disable sandbox mode')
|
||||
.option('-t, --timeout <ms>', 'Test timeout in milliseconds', '300000')
|
||||
.option('-p, --parallel', 'Run tests in parallel', false)
|
||||
.option('--filter <patterns...>', 'Run only tests matching patterns')
|
||||
.option('-o, --output <file>', 'Output results to JSON file')
|
||||
.option('--trace', 'Record execution trace', true)
|
||||
.option('--no-trace', 'Disable execution tracing')
|
||||
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
|
||||
.action(async (testDir: string, options) => {
|
||||
try {
|
||||
console.log(`\n🧪 Running tests from ${testDir}...\n`);
|
||||
|
||||
if (!existsSync(testDir)) {
|
||||
console.error(`Error: Test directory not found: ${testDir}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const framework: TestFramework = options.framework || detectTestFramework(testDir);
|
||||
console.log(` Framework: ${framework}`);
|
||||
console.log(` Sandbox: ${options.sandbox ? 'enabled' : 'disabled'}`);
|
||||
console.log(` Timeout: ${options.timeout}ms`);
|
||||
|
||||
let spec: EvalSpec | undefined;
|
||||
if (options.spec && existsSync(options.spec)) {
|
||||
spec = JSON.parse(readFileSync(options.spec, 'utf-8')) as EvalSpec;
|
||||
console.log(` Spec: ${options.spec} (${spec.scenarios.length} scenarios)`);
|
||||
}
|
||||
|
||||
const tracer = options.trace ? createTracer(spec?.repo.name || 'unknown') : null;
|
||||
|
||||
const execOptions: ExecutionOptions = {
|
||||
framework,
|
||||
sandbox: options.sandbox,
|
||||
timeout: parseInt(options.timeout, 10),
|
||||
parallel: options.parallel,
|
||||
filter: options.filter,
|
||||
cwd: process.cwd(),
|
||||
};
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordIntrospection({
|
||||
filesAnalyzed: [testDir],
|
||||
duration: 0,
|
||||
});
|
||||
}
|
||||
|
||||
console.log('\n Running tests...\n');
|
||||
const startTime = Date.now();
|
||||
|
||||
const result = await runTests(
|
||||
testDir,
|
||||
execOptions,
|
||||
options.sandbox ? DEFAULT_SANDBOX_CONFIG : undefined
|
||||
);
|
||||
|
||||
if (tracer) {
|
||||
tracer.recordExecution({
|
||||
testsPassed: result.summary.passed,
|
||||
testsFailed: result.summary.failed,
|
||||
testsSkipped: result.summary.skipped,
|
||||
});
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (test.status === 'failed' || test.status === 'error') {
|
||||
tracer.recordTestFailure({
|
||||
scenarioId: test.id,
|
||||
testName: test.name,
|
||||
error: test.error?.message || 'Unknown error',
|
||||
stack: test.error?.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(formatResults(result));
|
||||
|
||||
if (spec) {
|
||||
const mappedResults = mapResultsToScenarios(result, spec);
|
||||
console.log(`\n📊 Scenario Coverage:`);
|
||||
console.log(` Covered: ${mappedResults.covered}/${spec.scenarios.length}`);
|
||||
console.log(` Unmapped: ${mappedResults.unmapped}`);
|
||||
}
|
||||
|
||||
if (options.output) {
|
||||
const { writeFileSync, mkdirSync } = await import('fs');
|
||||
const { dirname } = await import('path');
|
||||
mkdirSync(dirname(options.output), { recursive: true });
|
||||
writeFileSync(options.output, JSON.stringify(result, null, 2));
|
||||
console.log(`\n📁 Results saved to: ${options.output}`);
|
||||
}
|
||||
|
||||
if (tracer) {
|
||||
const trace = tracer.finalize();
|
||||
const tracePath = await saveTrace(trace);
|
||||
console.log(`\n📊 Trace saved: ${tracePath}`);
|
||||
console.log(` View with: evaluclaude view ${trace.id}`);
|
||||
}
|
||||
|
||||
if (result.summary.failed > 0) {
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running tests:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
function mapResultsToScenarios(
|
||||
result: Awaited<ReturnType<typeof runTests>>,
|
||||
spec: EvalSpec
|
||||
): { covered: number; unmapped: number } {
|
||||
const scenarioIds = new Set(spec.scenarios.map(s => s.id));
|
||||
let covered = 0;
|
||||
let unmapped = 0;
|
||||
|
||||
for (const test of result.tests) {
|
||||
if (scenarioIds.has(test.id)) {
|
||||
covered++;
|
||||
} else {
|
||||
unmapped++;
|
||||
}
|
||||
}
|
||||
|
||||
return { covered, unmapped };
|
||||
}
|
||||
236
src/cli/commands/ui.ts
Normal file
236
src/cli/commands/ui.ts
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
import { Command } from 'commander';
|
||||
import { spawn, type ChildProcess } from 'child_process';
|
||||
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
|
||||
|
||||
// Directory where evaluclaude stores its generated artifacts
// (Promptfoo config, provider scripts, results).
const EVALUCLAUDE_DIR = '.evaluclaude';
// Promptfoo configuration file name, created inside EVALUCLAUDE_DIR.
const CONFIG_FILE = 'promptfooconfig.yaml';
// Subdirectory of EVALUCLAUDE_DIR holding generated provider scripts.
const PROVIDERS_DIR = 'providers';
|
||||
|
||||
export const uiCommand = new Command('ui')
|
||||
.description('Launch the evaluation dashboard UI')
|
||||
.option('-p, --port <port>', 'Port to run the UI on', '3000')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('--generate', 'Regenerate Promptfoo config from spec')
|
||||
.option('--no-open', 'Do not auto-open browser')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const port = parseInt(options.port, 10);
|
||||
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
|
||||
if (options.spec && options.generate) {
|
||||
console.log('\n📄 Generating Promptfoo configuration...');
|
||||
|
||||
if (!existsSync(options.spec)) {
|
||||
console.error(`Error: Spec file not found: ${options.spec}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
|
||||
console.log(` Config: ${configPath}`);
|
||||
console.log(` Provider: ${providerPath}`);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.log('\n⚠️ No Promptfoo config found.');
|
||||
console.log(' Run with --spec <file> --generate to create one.\n');
|
||||
console.log(' Or create one manually:');
|
||||
console.log(` ${configPath}\n`);
|
||||
|
||||
await createDefaultConfig(configPath, providerPath);
|
||||
console.log(` Created default config at ${configPath}`);
|
||||
}
|
||||
|
||||
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
|
||||
console.log(` Config: ${configPath}\n`);
|
||||
|
||||
await launchPromptfooUI(port, configPath, options.open);
|
||||
} catch (error) {
|
||||
console.error('Error launching UI:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const evalCommand = new Command('eval')
|
||||
.description('Run evaluations with Promptfoo and optionally launch UI')
|
||||
.option('-s, --spec <spec>', 'Path to EvalSpec JSON file')
|
||||
.option('-c, --config <config>', 'Path to promptfooconfig.yaml')
|
||||
.option('-o, --output <output>', 'Output path for results', '.evaluclaude/results')
|
||||
.option('--view', 'Launch UI after evaluation', false)
|
||||
.option('-p, --port <port>', 'Port for UI', '3000')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const configPath = options.config || join(EVALUCLAUDE_DIR, CONFIG_FILE);
|
||||
|
||||
if (options.spec) {
|
||||
console.log('\n📄 Generating Promptfoo configuration from spec...');
|
||||
const spec: EvalSpec = JSON.parse(readFileSync(options.spec, 'utf-8'));
|
||||
|
||||
await generatePromptfooConfig(spec, {
|
||||
testDir: './tests/generated',
|
||||
outputPath: configPath,
|
||||
framework: detectFramework(spec),
|
||||
includeTraceLinks: true,
|
||||
});
|
||||
|
||||
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
if (!existsSync(configPath)) {
|
||||
console.error(`Error: Config not found: ${configPath}`);
|
||||
console.log('Run with --spec <file> to generate from EvalSpec.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log('\n🧪 Running Promptfoo evaluations...\n');
|
||||
|
||||
const outputFile = join(options.output, `eval-${Date.now()}.json`);
|
||||
mkdirSync(dirname(outputFile), { recursive: true });
|
||||
|
||||
await runPromptfooEval(configPath, outputFile);
|
||||
|
||||
console.log(`\n📁 Results saved: ${outputFile}`);
|
||||
|
||||
if (options.view) {
|
||||
console.log(`\n🚀 Launching UI on port ${options.port}...`);
|
||||
await launchPromptfooUI(parseInt(options.port, 10), configPath, true);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error running eval:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
async function launchPromptfooUI(
|
||||
port: number,
|
||||
configPath: string,
|
||||
openBrowser: boolean
|
||||
): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = ['promptfoo', 'view', '--port', String(port)];
|
||||
|
||||
if (openBrowser) {
|
||||
args.push('-y');
|
||||
} else {
|
||||
args.push('-n');
|
||||
}
|
||||
|
||||
const configDir = dirname(configPath);
|
||||
args.push(configDir);
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', (error) => {
|
||||
if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||
console.error('\n❌ Promptfoo not found. Install with: npm install -g promptfoo');
|
||||
} else {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
process.on('SIGINT', () => {
|
||||
child.kill('SIGINT');
|
||||
process.exit(0);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function runPromptfooEval(configPath: string, outputFile: string): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const args = [
|
||||
'promptfoo',
|
||||
'eval',
|
||||
'-c', configPath,
|
||||
'-o', outputFile,
|
||||
'--no-cache',
|
||||
];
|
||||
|
||||
console.log(` Running: npx ${args.join(' ')}\n`);
|
||||
|
||||
const child = spawn('npx', args, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env },
|
||||
});
|
||||
|
||||
child.on('error', reject);
|
||||
|
||||
child.on('close', (code) => {
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
} else {
|
||||
reject(new Error(`Promptfoo eval exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function createDefaultConfig(configPath: string, providerPath: string): Promise<void> {
|
||||
const defaultConfig = `# Evaluclaude Promptfoo Configuration
|
||||
# Generated by evaluclaude
|
||||
|
||||
description: "Evaluclaude functional test evaluations"
|
||||
|
||||
providers:
|
||||
- id: file://${providerPath}
|
||||
label: functional-tests
|
||||
config:
|
||||
test_dir: ./tests/generated
|
||||
framework: pytest
|
||||
timeout: 300
|
||||
|
||||
prompts:
|
||||
- "{{scenario_id}}"
|
||||
|
||||
tests:
|
||||
- description: "Example test"
|
||||
vars:
|
||||
scenario_id: "test_example"
|
||||
assert:
|
||||
- type: python
|
||||
value: |
|
||||
import json
|
||||
result = json.loads(output)
|
||||
result.get('passed', 0) > 0
|
||||
|
||||
outputPath: .evaluclaude/results/promptfoo-results.json
|
||||
`;
|
||||
|
||||
mkdirSync(dirname(configPath), { recursive: true });
|
||||
writeFileSync(configPath, defaultConfig);
|
||||
|
||||
await generateTestProvider(providerPath);
|
||||
}
|
||||
|
||||
function detectFramework(spec: EvalSpec): 'pytest' | 'vitest' | 'jest' {
|
||||
if (spec.repo.languages.includes('python')) {
|
||||
return 'pytest';
|
||||
}
|
||||
return 'vitest';
|
||||
}
|
||||
90
src/cli/commands/view.ts
Normal file
90
src/cli/commands/view.ts
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import { Command } from 'commander';
|
||||
import {
|
||||
loadTrace,
|
||||
listTraces,
|
||||
getLatestTrace,
|
||||
formatTrace,
|
||||
formatTraceList
|
||||
} from '../../observability/index.js';
|
||||
|
||||
export const viewCommand = new Command('view')
|
||||
.description('View evaluation traces')
|
||||
.argument('[trace-id]', 'Specific trace ID to view')
|
||||
.option('--last', 'View the most recent trace')
|
||||
.option('--list', 'List all traces')
|
||||
.option('--json', 'Output as raw JSON')
|
||||
.option('-v, --verbose', 'Show verbose output including tool calls')
|
||||
.option('--tools', 'Show tool call details')
|
||||
.option('--questions', 'Show questions asked', true)
|
||||
.option('--decisions', 'Show decisions made', true)
|
||||
.option('-n, --limit <count>', 'Limit number of traces listed', '20')
|
||||
.option('--eval <eval-id>', 'Filter traces by eval ID')
|
||||
.action(async (traceId: string | undefined, options) => {
|
||||
try {
|
||||
if (options.list) {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
|
||||
if (traces.length > limited.length) {
|
||||
console.log(`Showing ${limited.length} of ${traces.length} traces.`);
|
||||
console.log(`Use --limit to see more.\n`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
let trace;
|
||||
|
||||
if (options.last || !traceId) {
|
||||
trace = await getLatestTrace();
|
||||
if (!trace) {
|
||||
console.log('\nNo traces found.');
|
||||
console.log('Run `evaluclaude run` to generate traces.\n');
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
trace = await loadTrace(traceId);
|
||||
if (!trace) {
|
||||
console.error(`\nTrace not found: ${traceId}`);
|
||||
console.log('Use `evaluclaude view --list` to see available traces.\n');
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
const output = formatTrace(trace, {
|
||||
json: options.json,
|
||||
verbose: options.verbose,
|
||||
showToolCalls: options.tools || options.verbose,
|
||||
showQuestions: options.questions,
|
||||
showDecisions: options.decisions,
|
||||
});
|
||||
|
||||
console.log(output);
|
||||
} catch (error) {
|
||||
console.error('Error viewing trace:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const tracesCommand = new Command('traces')
|
||||
.description('List all evaluation traces (alias for view --list)')
|
||||
.option('-n, --limit <count>', 'Limit number of traces', '20')
|
||||
.option('--eval <eval-id>', 'Filter by eval ID')
|
||||
.action(async (options) => {
|
||||
const traces = await listTraces(options.eval);
|
||||
const limited = traces.slice(0, parseInt(options.limit, 10));
|
||||
|
||||
if (traces.length === 0) {
|
||||
console.log('\nNo traces found.');
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(formatTraceList(limited));
|
||||
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue