// mirror of https://github.com/harivansh-afk/evaluclaude-harness.git
// synced 2026-04-15 13:03:44 +00:00
import { Command } from 'commander';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
import { join, resolve } from 'path';
import { analyze } from '../../introspector/index.js';
import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
import type { EvalSpec } from '../../analyzer/types.js';

const EVALUCLAUDE_DIR = '.evaluclaude';
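// Options as parsed by Commander for the `pipeline` command. Numeric-looking
// values (maxScenarios) arrive as strings because the flags are declared with
// string defaults and no coercion function.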
interface PipelineOptions {
  output?: string;
  interactive?: boolean;
  focus?: string;
  maxScenarios: string;
  testDir: string;
  framework?: string;
  skipAnalyze?: boolean;
  skipRender?: boolean;
  skipRun?: boolean;
  promptfoo?: boolean;
  quiet?: boolean;
}
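// Illustrative invocations (flags per the options declared below):
//   evaluclaude pipeline ./my-repo --interactive --promptfoo
//   evaluclaude pipeline --skip-analyze --skip-run -f vitest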
export const pipelineCommand = new Command('pipeline')
  .description('Run the full eval generation pipeline: introspect → analyze → render → run')
  .argument('[path]', 'Path to the repository to analyze', '.')
  .option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
  .option('-i, --interactive', 'Enable interactive mode with clarifying questions')
  .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
  .option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
  .option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('--skip-analyze', 'Skip analysis, use existing spec')
  .option('--skip-render', 'Skip rendering, use existing tests')
  .option('--skip-run', 'Skip test execution')
  .option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
  .option('--quiet', 'Suppress progress messages')
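  // The action below is the entire pipeline. `log` honors --quiet, while the
  // banner and final summary always print via console.log.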
  .action(async (repoPath: string, options: PipelineOptions) => {
    const absolutePath = resolve(repoPath);
    const log = options.quiet ? () => {} : console.log;
    const outputDir = options.output || EVALUCLAUDE_DIR;

    console.log('\n🚀 Evaluclaude Pipeline');
    console.log('═'.repeat(50));
    console.log(` Repository: ${absolutePath}`);
    console.log(` Output: ${outputDir}`);
    console.log('═'.repeat(50) + '\n');
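    // Workspace layout: <output>/spec.json, <output>/traces/, <output>/results/,
    // plus the generated tests under --test-dir.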
    // Ensure output directories exist
    mkdirSync(outputDir, { recursive: true });
    mkdirSync(options.testDir, { recursive: true });

    const specPath = join(outputDir, 'spec.json');
    const tracesDir = join(outputDir, 'traces');
    const resultsDir = join(outputDir, 'results');

    mkdirSync(tracesDir, { recursive: true });
    mkdirSync(resultsDir, { recursive: true });

    let spec: EvalSpec;
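    // Note: --skip-analyze only takes effect when spec.json already exists;
    // otherwise the flag is silently ignored and a fresh analysis runs.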
    // Steps 1-2: introspection and analysis (matching the step numbers logged below)
    if (options.skipAnalyze && existsSync(specPath)) {
      log('📋 Using existing EvalSpec...');
      spec = JSON.parse(readFileSync(specPath, 'utf-8'));
      log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
    } else {
      log('🔬 Step 1: Introspecting codebase...');

      try {
        const repoSummary = await analyze({
          root: absolutePath,
          onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
        });

        log(` Files: ${repoSummary.files.length}`);
        log(` Languages: ${repoSummary.languages.join(', ')}`);
        log('');

        log('🤖 Step 2: Generating EvalSpec with Claude...\n');

        const focus = options.focus?.split(',').map(s => s.trim());
        // No NaN guard here: Commander hands the raw string through.
        const maxScenarios = parseInt(options.maxScenarios, 10);
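        // inquirer is loaded lazily via dynamic import, presumably so
        // non-interactive runs never pay for it.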
        let result;
        if (options.interactive) {
          const { default: inquirer } = await import('inquirer');

          result = await generateEvalSpecInteractive(
            repoSummary,
            async (question: string) => {
              const { answer } = await inquirer.prompt([{
                type: 'input',
                name: 'answer',
                message: `🤖 Claude asks: ${question}`,
              }]);
              return answer;
            },
            { focus, maxScenarios }
          );
        } else {
          result = await generateEvalSpec(repoSummary, {
            interactive: false,
            focus,
            maxScenarios,
          });
        }

        spec = result.spec;

        // Save the spec
        writeFileSync(specPath, JSON.stringify(spec, null, 2));

        log(`\n✅ EvalSpec generated!`);
        log(` Scenarios: ${spec.scenarios.length}`);
        log(` Tokens: ${result.tokensUsed}`);
        log(` Saved: ${specPath}\n`);
      } catch (error) {
        console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }
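    // The target framework is resolved the same way at each stage: the -f flag
    // wins, otherwise detectFramework infers it from the spec.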
    // Step 3: render tests
    if (!options.skipRender) {
      log('📝 Step 3: Rendering test files...');

      try {
        const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);

        const renderResult = await renderSpec(spec, {
          outputDir: options.testDir,
          framework,
          includeFixtures: true,
          generateMocks: true,
          dryRun: false,
        });

        log(` Framework: ${framework}`);
        log(` Files: ${renderResult.stats.fileCount}`);
        log(` Scenarios: ${renderResult.stats.scenarioCount}`);
        log(` Assertions: ${renderResult.stats.assertionCount}`);
        log(` Output: ${options.testDir}\n`);
      } catch (error) {
        console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }
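    // Step 4 also assembles the observability trace. The introspection and
    // generation phases are recorded retroactively here; duration: 0 is a
    // placeholder, since the earlier stages aren't timed.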
    // Step 4: run tests
    if (!options.skipRun) {
      log('🧪 Step 4: Running tests...\n');

      try {
        const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
        const tracer = createTracer(spec.repo.name);

        tracer.recordIntrospection({
          filesAnalyzed: spec.scenarios.map(s => s.target.module),
          totalFunctions: spec.scenarios.length,
          duration: 0,
        });

        tracer.recordGeneration({
          scenariosGenerated: spec.scenarios.length,
          filesWritten: [options.testDir],
        });

        const result = await runTests(
          options.testDir,
          {
            framework,
            sandbox: true,
            timeout: 300000, // 5 minutes
            parallel: false,
            cwd: process.cwd(),
          },
          DEFAULT_SANDBOX_CONFIG
        );

        tracer.recordExecution({
          testsPassed: result.summary.passed,
          testsFailed: result.summary.failed,
          testsSkipped: result.summary.skipped,
        });

        for (const test of result.tests) {
          if (test.status === 'failed' || test.status === 'error') {
            tracer.recordTestFailure({
              scenarioId: test.id,
              testName: test.name,
              error: test.error?.message || 'Unknown error',
              stack: test.error?.stack,
            });
          }
        }

        const trace = tracer.finalize();
        const tracePath = await saveTrace(trace);

        log(formatResults(result));
        log(`📊 Trace saved: ${tracePath}`);
        log(` View with: evaluclaude view ${trace.id}\n`);

        // Save results
        const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
        writeFileSync(resultsPath, JSON.stringify(result, null, 2));
      } catch (error) {
        console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
        process.exit(1);
      }
    }
|
|
|
|
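    // Step 5 is best-effort: unlike the earlier stages, a Promptfoo config
    // failure is logged but does not abort the pipeline (no process.exit).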
    // Step 5: generate Promptfoo config
    if (options.promptfoo) {
      log('📦 Step 5: Generating Promptfoo configuration...');

      try {
        const configPath = join(outputDir, 'promptfooconfig.yaml');
        const providerPath = join(outputDir, 'providers', 'test-runner.py');
        const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);

        await generatePromptfooConfig(spec, {
          testDir: options.testDir,
          outputPath: configPath,
          framework,
          includeTraceLinks: true,
          providerPath,
        });

        await generateTestProvider(providerPath);

        log(` Config: ${configPath}`);
        log(` Provider: ${providerPath}`);
        log(`\n Launch UI with: evaluclaude ui\n`);
      } catch (error) {
        console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
      }
    }
    console.log('═'.repeat(50));
    console.log('✅ Pipeline complete!');
    console.log('═'.repeat(50));
    console.log(`\nNext steps:`);
    console.log(` View traces: evaluclaude view --last`);
    console.log(` List all traces: evaluclaude traces`);
    if (options.promptfoo) {
      console.log(` Launch UI: evaluclaude ui`);
      console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
    }
    console.log('');
  });