mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-17 21:03:04 +00:00
grader, test renderer
This commit is contained in:
parent
9297f0b1ee
commit
e0c36241b0
22 changed files with 1914 additions and 5 deletions
101
src/cli/commands/grade.ts
Normal file
101
src/cli/commands/grade.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import { Command } from 'commander';
|
||||
import { readFileSync, existsSync } from 'fs';
|
||||
import { gradeWithRubric, loadAllRubrics, analyzeCalibration, calibrate } from '../../graders/index.js';
|
||||
import type { CalibrationExample } from '../../graders/types.js';
|
||||
|
||||
/** Number of cells in the text bar printed for each criterion score. */
const SCORE_BAR_WIDTH = 10;

/**
 * Builds a fixed-width text bar for a criterion score.
 *
 * The score is scaled by the bar width and rounded; the filled-cell count is
 * clamped to the bar width so an out-of-range or non-finite score cannot make
 * `String.prototype.repeat` throw a RangeError on a negative count.
 *
 * @param score - Criterion score; displayed elsewhere as `score * 100` percent.
 * @returns A string of exactly `SCORE_BAR_WIDTH` characters ('█' filled, '░' empty).
 */
function renderScoreBar(score: number): string {
  const rounded = Math.round(score * SCORE_BAR_WIDTH);
  const filled = Number.isFinite(rounded)
    ? Math.min(SCORE_BAR_WIDTH, Math.max(0, rounded))
    : 0;
  return '█'.repeat(filled) + '░'.repeat(SCORE_BAR_WIDTH - filled);
}

/**
 * `grade` CLI command.
 *
 * Grades the given input with `gradeWithRubric`. If `<input>` names an existing
 * path, its contents are read as UTF-8 and graded; otherwise the argument itself
 * is graded as a literal string.
 *
 * Output:
 * - With `--json`, stdout carries only the pretty-printed result object; the
 *   status line is sent to stderr so the output stays machine-parseable.
 * - Otherwise, a pass/fail line, the overall score, the summary reason and a
 *   per-criterion bar with feedback are printed to stdout.
 *
 * Any thrown error is reported on stderr and the process exits with code 1.
 */
export const gradeCommand = new Command('grade')
  .description('Grade output using LLM rubric')
  .argument('<input>', 'Path to input file or string to grade')
  .option('-r, --rubric <name>', 'Rubric name or path', 'code-quality')
  .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
  .option('--json', 'Output result as JSON', false)
  .action(async (input: string, options) => {
    try {
      // An existing path is read from disk; anything else is graded verbatim.
      const content = existsSync(input) ? readFileSync(input, 'utf-8') : input;

      // In JSON mode stdout must contain nothing but the JSON document, so the
      // status line goes to stderr instead.
      const logStatus = options.json ? console.error : console.log;
      logStatus(`Grading with rubric: ${options.rubric}`);

      const result = await gradeWithRubric(content, options.rubric, {
        rubricsDir: options.rubricsDir,
      });

      if (options.json) {
        console.log(JSON.stringify(result, null, 2));
        return;
      }

      console.log(`\n${result.pass ? '✅ PASS' : '❌ FAIL'}`);
      console.log(`Score: ${(result.score * 100).toFixed(1)}%`);
      console.log(`\nSummary: ${result.reason}`);

      console.log('\nCriterion Scores:');
      for (const cs of result.criterionScores) {
        const bar = renderScoreBar(cs.score);
        console.log(` ${cs.name}: ${bar} ${(cs.score * 100).toFixed(0)}%`);
        console.log(` ${cs.feedback}`);
      }
    } catch (error) {
      console.error('Error grading:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
|
||||
/**
 * `rubrics` CLI command.
 *
 * Loads every rubric from `--rubrics-dir` via `loadAllRubrics` and prints, for
 * each one, its name, description, passing threshold (as a whole percent) and
 * the comma-separated names of its criteria. Prints a "No rubrics found"
 * notice when the collection is empty. Errors are reported on stderr and the
 * process exits with code 1.
 */
export const listRubricsCommand = new Command('rubrics')
  .description('List available rubrics')
  .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
  .action(async (options) => {
    try {
      const { rubricsDir } = options;
      const available = loadAllRubrics(rubricsDir);
      const total = available.size;

      // Nothing to list: say so and stop.
      if (total === 0) {
        console.log(`No rubrics found in ${rubricsDir}`);
        return;
      }

      console.log(`Available rubrics (${total}):\n`);

      for (const [rubricName, details] of available) {
        const thresholdPercent = (details.passingThreshold * 100).toFixed(0);
        const criteriaNames = details.criteria.map(criterion => criterion.name).join(', ');

        // One entry per rubric; the trailing empty element yields the blank
        // separator line between entries.
        const entry = [
          `📋 ${rubricName}`,
          ` ${details.description}`,
          ` Threshold: ${thresholdPercent}%`,
          ` Criteria: ${criteriaNames}`,
          '',
        ];
        console.log(entry.join('\n'));
      }
    } catch (error) {
      const detail = error instanceof Error ? error.message : error;
      console.error('Error listing rubrics:', detail);
      process.exit(1);
    }
  });
|
||||
|
||||
/**
 * `calibrate` CLI command.
 *
 * Reads calibration examples from a JSON file, runs `calibrate` for the named
 * rubric against them, and prints the report produced by `analyzeCalibration`.
 *
 * The examples file must exist and contain a JSON array. A missing file,
 * malformed JSON, or a non-array top-level value is reported on stderr and
 * the process exits with code 1.
 */
export const calibrateCommand = new Command('calibrate')
  .description('Calibrate a rubric against known examples')
  .argument('<rubric>', 'Rubric name or path')
  .argument('<examples>', 'Path to calibration examples JSON')
  .option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
  .action(async (rubricName: string, examplesPath: string, options) => {
    try {
      if (!existsSync(examplesPath)) {
        console.error(`Examples file not found: ${examplesPath}`);
        process.exit(1);
      }

      // JSON.parse returns arbitrary data; confirm it is an array before
      // treating it as a list of examples. Otherwise `.length` would be
      // undefined and a non-list value would be handed to `calibrate`.
      const parsed: unknown = JSON.parse(readFileSync(examplesPath, 'utf-8'));
      if (!Array.isArray(parsed)) {
        throw new Error(`Calibration examples must be a JSON array: ${examplesPath}`);
      }
      // NOTE(review): only the top-level shape is checked here; individual
      // example fields are not validated — confirm whether `calibrate` does so.
      const examples = parsed as CalibrationExample[];

      console.log(`Calibrating rubric '${rubricName}' with ${examples.length} examples...`);

      const result = await calibrate(rubricName, examples, {
        rubricsDir: options.rubricsDir,
      });

      console.log('\n' + analyzeCalibration(result));
    } catch (error) {
      console.error('Error calibrating:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
61
src/cli/commands/render.ts
Normal file
61
src/cli/commands/render.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import { Command } from 'commander';
|
||||
import { readFileSync, existsSync } from 'fs';
|
||||
import { renderSpec, detectFramework, type Framework } from '../../renderers/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
/**
 * `render` CLI command.
 *
 * Reads an EvalSpec JSON file and passes it to `renderSpec` together with the
 * output directory, test framework and the fixtures/mocks/dry-run flags. The
 * framework comes from `--framework` when given, otherwise from
 * `detectFramework(spec)`.
 *
 * With `--dry-run`, each rendered file's path and content is printed instead
 * of the output directory. A summary of scenario, file, assertion and skipped
 * counts is printed in both modes.
 *
 * A missing spec file, malformed JSON, or a spec without a `scenarios` array
 * is reported on stderr and the process exits with code 1.
 */
export const renderCommand = new Command('render')
  .description('Render EvalSpec JSON into runnable test files')
  .argument('<spec>', 'Path to EvalSpec JSON file')
  .option('-o, --output <dir>', 'Output directory for test files', './tests/generated')
  .option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
  .option('--fixtures', 'Generate fixture stubs', false)
  .option('--mocks', 'Generate mock stubs', false)
  .option('--dry-run', 'Preview without writing files', false)
  .action(async (specPath: string, options) => {
    try {
      if (!existsSync(specPath)) {
        console.error(`Error: Spec file not found: ${specPath}`);
        process.exit(1);
      }

      const specContent = readFileSync(specPath, 'utf-8');

      // JSON.parse returns arbitrary data; check the one property this command
      // reads directly (`scenarios`) so a wrong file yields a clear message
      // rather than "Cannot read properties of undefined".
      const parsed: unknown = JSON.parse(specContent);
      if (
        typeof parsed !== 'object' ||
        parsed === null ||
        !Array.isArray((parsed as { scenarios?: unknown }).scenarios)
      ) {
        throw new Error(
          `Invalid EvalSpec: expected an object with a "scenarios" array in ${specPath}`,
        );
      }
      const spec = parsed as EvalSpec;

      // NOTE(review): a user-supplied --framework value is not checked against
      // the supported frameworks here — confirm that `renderSpec` rejects
      // unknown values.
      const framework = (options.framework as Framework) || detectFramework(spec);

      console.log(`Rendering ${spec.scenarios.length} scenarios with ${framework}...`);

      const result = await renderSpec(spec, {
        outputDir: options.output,
        framework,
        includeFixtures: options.fixtures,
        generateMocks: options.mocks,
        dryRun: options.dryRun,
      });

      if (options.dryRun) {
        console.log('\n--- DRY RUN ---\n');
        for (const file of result.files) {
          console.log(`📄 ${file.path}`);
          console.log('---');
          console.log(file.content);
          console.log('---\n');
        }
      }

      console.log(`\n✅ Rendered ${result.stats.scenarioCount} scenarios`);
      console.log(` 📁 ${result.stats.fileCount} test files`);
      console.log(` 🔍 ${result.stats.assertionCount} assertions`);

      if (result.stats.skippedCount > 0) {
        console.log(` ⏭️ ${result.stats.skippedCount} scenarios skipped (LLM rubric assertions)`);
      }

      if (!options.dryRun) {
        console.log(`\n📂 Output: ${options.output}`);
      }
    } catch (error) {
      console.error('Error rendering spec:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
Loading…
Add table
Add a link
Reference in a new issue