mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-16 16:01:03 +00:00
grader, test renderer
This commit is contained in:
parent
9297f0b1ee
commit
e0c36241b0
22 changed files with 1914 additions and 5 deletions
101
src/cli/commands/grade.ts
Normal file
101
src/cli/commands/grade.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import { Command } from 'commander';
|
||||
import { readFileSync, existsSync } from 'fs';
|
||||
import { gradeWithRubric, loadAllRubrics, analyzeCalibration, calibrate } from '../../graders/index.js';
|
||||
import type { CalibrationExample } from '../../graders/types.js';
|
||||
|
||||
export const gradeCommand = new Command('grade')
|
||||
.description('Grade output using LLM rubric')
|
||||
.argument('<input>', 'Path to input file or string to grade')
|
||||
.option('-r, --rubric <name>', 'Rubric name or path', 'code-quality')
|
||||
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
|
||||
.option('--json', 'Output result as JSON', false)
|
||||
.action(async (input: string, options) => {
|
||||
try {
|
||||
let content: string;
|
||||
|
||||
if (existsSync(input)) {
|
||||
content = readFileSync(input, 'utf-8');
|
||||
} else {
|
||||
content = input;
|
||||
}
|
||||
|
||||
console.log(`Grading with rubric: ${options.rubric}`);
|
||||
|
||||
const result = await gradeWithRubric(content, options.rubric, {
|
||||
rubricsDir: options.rubricsDir,
|
||||
});
|
||||
|
||||
if (options.json) {
|
||||
console.log(JSON.stringify(result, null, 2));
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`\n${result.pass ? '✅ PASS' : '❌ FAIL'}`);
|
||||
console.log(`Score: ${(result.score * 100).toFixed(1)}%`);
|
||||
console.log(`\nSummary: ${result.reason}`);
|
||||
|
||||
console.log('\nCriterion Scores:');
|
||||
for (const cs of result.criterionScores) {
|
||||
const bar = '█'.repeat(Math.round(cs.score * 10)) + '░'.repeat(10 - Math.round(cs.score * 10));
|
||||
console.log(` ${cs.name}: ${bar} ${(cs.score * 100).toFixed(0)}%`);
|
||||
console.log(` ${cs.feedback}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error grading:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const listRubricsCommand = new Command('rubrics')
|
||||
.description('List available rubrics')
|
||||
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
|
||||
.action(async (options) => {
|
||||
try {
|
||||
const rubrics = loadAllRubrics(options.rubricsDir);
|
||||
|
||||
if (rubrics.size === 0) {
|
||||
console.log(`No rubrics found in ${options.rubricsDir}`);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`Available rubrics (${rubrics.size}):\n`);
|
||||
|
||||
for (const [name, rubric] of rubrics) {
|
||||
console.log(`📋 ${name}`);
|
||||
console.log(` ${rubric.description}`);
|
||||
console.log(` Threshold: ${(rubric.passingThreshold * 100).toFixed(0)}%`);
|
||||
console.log(` Criteria: ${rubric.criteria.map(c => c.name).join(', ')}`);
|
||||
console.log('');
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error listing rubrics:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
export const calibrateCommand = new Command('calibrate')
|
||||
.description('Calibrate a rubric against known examples')
|
||||
.argument('<rubric>', 'Rubric name or path')
|
||||
.argument('<examples>', 'Path to calibration examples JSON')
|
||||
.option('--rubrics-dir <dir>', 'Directory containing rubric YAML files', 'rubrics')
|
||||
.action(async (rubricName: string, examplesPath: string, options) => {
|
||||
try {
|
||||
if (!existsSync(examplesPath)) {
|
||||
console.error(`Examples file not found: ${examplesPath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const examples: CalibrationExample[] = JSON.parse(readFileSync(examplesPath, 'utf-8'));
|
||||
|
||||
console.log(`Calibrating rubric '${rubricName}' with ${examples.length} examples...`);
|
||||
|
||||
const result = await calibrate(rubricName, examples, {
|
||||
rubricsDir: options.rubricsDir,
|
||||
});
|
||||
|
||||
console.log('\n' + analyzeCalibration(result));
|
||||
} catch (error) {
|
||||
console.error('Error calibrating:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
61
src/cli/commands/render.ts
Normal file
61
src/cli/commands/render.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import { Command } from 'commander';
|
||||
import { readFileSync, existsSync } from 'fs';
|
||||
import { renderSpec, detectFramework, type Framework } from '../../renderers/index.js';
|
||||
import type { EvalSpec } from '../../analyzer/types.js';
|
||||
|
||||
export const renderCommand = new Command('render')
|
||||
.description('Render EvalSpec JSON into runnable test files')
|
||||
.argument('<spec>', 'Path to EvalSpec JSON file')
|
||||
.option('-o, --output <dir>', 'Output directory for test files', './tests/generated')
|
||||
.option('-f, --framework <framework>', 'Test framework (pytest, vitest, jest)')
|
||||
.option('--fixtures', 'Generate fixture stubs', false)
|
||||
.option('--mocks', 'Generate mock stubs', false)
|
||||
.option('--dry-run', 'Preview without writing files', false)
|
||||
.action(async (specPath: string, options) => {
|
||||
try {
|
||||
if (!existsSync(specPath)) {
|
||||
console.error(`Error: Spec file not found: ${specPath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const specContent = readFileSync(specPath, 'utf-8');
|
||||
const spec: EvalSpec = JSON.parse(specContent);
|
||||
|
||||
const framework = (options.framework as Framework) || detectFramework(spec);
|
||||
|
||||
console.log(`Rendering ${spec.scenarios.length} scenarios with ${framework}...`);
|
||||
|
||||
const result = await renderSpec(spec, {
|
||||
outputDir: options.output,
|
||||
framework,
|
||||
includeFixtures: options.fixtures,
|
||||
generateMocks: options.mocks,
|
||||
dryRun: options.dryRun,
|
||||
});
|
||||
|
||||
if (options.dryRun) {
|
||||
console.log('\n--- DRY RUN ---\n');
|
||||
for (const file of result.files) {
|
||||
console.log(`📄 ${file.path}`);
|
||||
console.log('---');
|
||||
console.log(file.content);
|
||||
console.log('---\n');
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n✅ Rendered ${result.stats.scenarioCount} scenarios`);
|
||||
console.log(` 📁 ${result.stats.fileCount} test files`);
|
||||
console.log(` 🔍 ${result.stats.assertionCount} assertions`);
|
||||
|
||||
if (result.stats.skippedCount > 0) {
|
||||
console.log(` ⏭️ ${result.stats.skippedCount} scenarios skipped (LLM rubric assertions)`);
|
||||
}
|
||||
|
||||
if (!options.dryRun) {
|
||||
console.log(`\n📂 Output: ${options.output}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error rendering spec:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
|
|
@ -3,6 +3,8 @@
|
|||
import { Command } from 'commander';
|
||||
import { introCommand } from './commands/intro.js';
|
||||
import { analyzeCommand } from './commands/analyze.js';
|
||||
import { renderCommand } from './commands/render.js';
|
||||
import { gradeCommand, listRubricsCommand, calibrateCommand } from './commands/grade.js';
|
||||
|
||||
const program = new Command();
|
||||
|
||||
|
|
@ -13,5 +15,9 @@ program
|
|||
|
||||
program.addCommand(introCommand);
|
||||
program.addCommand(analyzeCommand);
|
||||
program.addCommand(renderCommand);
|
||||
program.addCommand(gradeCommand);
|
||||
program.addCommand(listRubricsCommand);
|
||||
program.addCommand(calibrateCommand);
|
||||
|
||||
program.parse(process.argv);
|
||||
|
|
|
|||
22
src/graders/index.ts
Normal file
22
src/graders/index.ts
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
export type {
|
||||
Rubric,
|
||||
RubricCriterion,
|
||||
RubricGradingResult,
|
||||
CriterionScore,
|
||||
GradeRequest,
|
||||
GraderOptions,
|
||||
CalibrationExample,
|
||||
CalibrationSet,
|
||||
CalibrationResult,
|
||||
} from './types.js';
|
||||
|
||||
export {
|
||||
LLMGrader,
|
||||
gradeWithRubric,
|
||||
loadRubric,
|
||||
loadAllRubrics,
|
||||
clearRubricCache,
|
||||
formatRubricForPrompt,
|
||||
calibrate,
|
||||
analyzeCalibration,
|
||||
} from './llm/index.js';
|
||||
68
src/graders/llm/calibrator.ts
Normal file
68
src/graders/llm/calibrator.ts
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
import type { Rubric, CalibrationExample, CalibrationResult, GraderOptions } from '../types.js';
|
||||
import { gradeWithRubric } from './grader.js';
|
||||
import { loadRubric } from './rubric-loader.js';
|
||||
|
||||
const AGREEMENT_THRESHOLD = 0.1;
|
||||
const MIN_AGREEMENT_RATE = 0.8;
|
||||
|
||||
export async function calibrate(
|
||||
rubricNameOrDef: string | Rubric,
|
||||
examples: CalibrationExample[],
|
||||
options?: GraderOptions
|
||||
): Promise<CalibrationResult> {
|
||||
const rubric = typeof rubricNameOrDef === 'string'
|
||||
? loadRubric(rubricNameOrDef, options?.rubricsDir)
|
||||
: rubricNameOrDef;
|
||||
|
||||
const results = await Promise.all(
|
||||
examples.map(async (ex) => {
|
||||
const result = await gradeWithRubric(ex.input, rubric, options);
|
||||
return {
|
||||
example: ex,
|
||||
actualScore: result.score,
|
||||
difference: result.score - ex.expectedScore,
|
||||
};
|
||||
})
|
||||
);
|
||||
|
||||
const withinThreshold = results.filter(r =>
|
||||
Math.abs(r.difference) < AGREEMENT_THRESHOLD
|
||||
);
|
||||
|
||||
const agreement = withinThreshold.length / results.length;
|
||||
const drift = results.map(r => r.difference);
|
||||
|
||||
return {
|
||||
agreement,
|
||||
drift,
|
||||
needsAdjustment: agreement < MIN_AGREEMENT_RATE,
|
||||
details: results,
|
||||
};
|
||||
}
|
||||
|
||||
export function analyzeCalibration(result: CalibrationResult): string {
|
||||
const lines: string[] = [];
|
||||
|
||||
lines.push(`Calibration Results`);
|
||||
lines.push(`==================`);
|
||||
lines.push(`Agreement Rate: ${(result.agreement * 100).toFixed(1)}%`);
|
||||
lines.push(`Status: ${result.needsAdjustment ? '⚠️ Needs Adjustment' : '✅ Calibrated'}`);
|
||||
lines.push('');
|
||||
|
||||
if (result.drift.length > 0) {
|
||||
const avgDrift = result.drift.reduce((a, b) => a + b, 0) / result.drift.length;
|
||||
const maxDrift = Math.max(...result.drift.map(Math.abs));
|
||||
|
||||
lines.push(`Average Drift: ${avgDrift > 0 ? '+' : ''}${avgDrift.toFixed(3)}`);
|
||||
lines.push(`Max Absolute Drift: ${maxDrift.toFixed(3)}`);
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
lines.push(`Individual Results:`);
|
||||
for (const detail of result.details) {
|
||||
const status = Math.abs(detail.difference) < AGREEMENT_THRESHOLD ? '✓' : '✗';
|
||||
lines.push(` ${status} Expected: ${detail.example.expectedScore.toFixed(2)}, Actual: ${detail.actualScore.toFixed(2)}, Diff: ${detail.difference > 0 ? '+' : ''}${detail.difference.toFixed(3)}`);
|
||||
}
|
||||
|
||||
return lines.join('\n');
|
||||
}
|
||||
98
src/graders/llm/grader.ts
Normal file
98
src/graders/llm/grader.ts
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import type { Rubric, RubricGradingResult, GraderOptions, CriterionScore } from '../types.js';
|
||||
import { loadRubric } from './rubric-loader.js';
|
||||
import { buildGraderSystemPrompt, buildGraderUserPrompt } from './prompt-builder.js';
|
||||
|
||||
const DEFAULT_MODEL = 'claude-sonnet-4-20250514';
|
||||
const DEFAULT_MAX_TOKENS = 1024;
|
||||
|
||||
interface GradingResponse {
|
||||
scores: Record<string, { score: number; feedback: string }>;
|
||||
overall: number;
|
||||
summary: string;
|
||||
}
|
||||
|
||||
export class LLMGrader {
|
||||
private client: Anthropic;
|
||||
private options: Required<GraderOptions>;
|
||||
|
||||
constructor(options: GraderOptions = {}) {
|
||||
this.client = new Anthropic();
|
||||
this.options = {
|
||||
model: options.model || DEFAULT_MODEL,
|
||||
maxTokens: options.maxTokens || DEFAULT_MAX_TOKENS,
|
||||
rubricsDir: options.rubricsDir || 'rubrics',
|
||||
};
|
||||
}
|
||||
|
||||
async grade(output: string, rubricNameOrDef: string | Rubric): Promise<RubricGradingResult> {
|
||||
const rubric = typeof rubricNameOrDef === 'string'
|
||||
? loadRubric(rubricNameOrDef, this.options.rubricsDir)
|
||||
: rubricNameOrDef;
|
||||
|
||||
const systemPrompt = buildGraderSystemPrompt();
|
||||
const userPrompt = buildGraderUserPrompt(output, rubric);
|
||||
|
||||
const response = await this.client.messages.create({
|
||||
model: this.options.model,
|
||||
max_tokens: this.options.maxTokens,
|
||||
system: systemPrompt,
|
||||
messages: [{ role: 'user', content: userPrompt }],
|
||||
});
|
||||
|
||||
const responseText = response.content[0].type === 'text'
|
||||
? response.content[0].text
|
||||
: '';
|
||||
|
||||
const parsed = this.parseResponse(responseText);
|
||||
|
||||
return this.buildResult(parsed, rubric);
|
||||
}
|
||||
|
||||
private parseResponse(text: string): GradingResponse {
|
||||
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
||||
if (!jsonMatch) {
|
||||
throw new Error('Failed to extract JSON from grader response');
|
||||
}
|
||||
|
||||
try {
|
||||
return JSON.parse(jsonMatch[0]) as GradingResponse;
|
||||
} catch (e) {
|
||||
throw new Error(`Failed to parse grader response as JSON: ${e}`);
|
||||
}
|
||||
}
|
||||
|
||||
private buildResult(parsed: GradingResponse, rubric: Rubric): RubricGradingResult {
|
||||
const criterionScores: CriterionScore[] = rubric.criteria.map(c => {
|
||||
const score = parsed.scores[c.name];
|
||||
return {
|
||||
name: c.name,
|
||||
score: score?.score ?? 0,
|
||||
feedback: score?.feedback ?? 'No feedback provided',
|
||||
};
|
||||
});
|
||||
|
||||
const weightedScore = rubric.criteria.reduce((sum, c) => {
|
||||
const criterionScore = parsed.scores[c.name]?.score ?? 0;
|
||||
return sum + criterionScore * c.weight;
|
||||
}, 0);
|
||||
|
||||
const finalScore = parsed.overall ?? weightedScore;
|
||||
|
||||
return {
|
||||
pass: finalScore >= rubric.passingThreshold,
|
||||
score: finalScore,
|
||||
reason: parsed.summary || 'No summary provided',
|
||||
criterionScores,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function gradeWithRubric(
|
||||
output: string,
|
||||
rubricNameOrDef: string | Rubric,
|
||||
options?: GraderOptions
|
||||
): Promise<RubricGradingResult> {
|
||||
const grader = new LLMGrader(options);
|
||||
return grader.grade(output, rubricNameOrDef);
|
||||
}
|
||||
4
src/graders/llm/index.ts
Normal file
4
src/graders/llm/index.ts
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
export { LLMGrader, gradeWithRubric } from './grader.js';
|
||||
export { loadRubric, loadAllRubrics, clearRubricCache, formatRubricForPrompt } from './rubric-loader.js';
|
||||
export { calibrate, analyzeCalibration } from './calibrator.js';
|
||||
export { buildGraderSystemPrompt, buildGraderUserPrompt, clearPromptCache } from './prompt-builder.js';
|
||||
50
src/graders/llm/prompt-builder.ts
Normal file
50
src/graders/llm/prompt-builder.ts
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import { readFileSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import type { Rubric } from '../types.js';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const PROMPTS_DIR = join(__dirname, '../../../prompts');
|
||||
|
||||
let promptCache: Map<string, string> = new Map();
|
||||
|
||||
function loadPrompt(name: string): string {
|
||||
if (promptCache.has(name)) {
|
||||
return promptCache.get(name)!;
|
||||
}
|
||||
|
||||
const filePath = join(PROMPTS_DIR, `${name}.md`);
|
||||
const content = readFileSync(filePath, 'utf-8');
|
||||
promptCache.set(name, content);
|
||||
return content;
|
||||
}
|
||||
|
||||
/** System prompt for the grader model, loaded from prompts/grader-system.md. */
export function buildGraderSystemPrompt(): string {
  return loadPrompt('grader-system');
}
|
||||
|
||||
export function buildGraderUserPrompt(output: string, rubric: Rubric): string {
|
||||
const template = loadPrompt('grader-user');
|
||||
|
||||
const criteriaList = rubric.criteria.map(c => {
|
||||
let entry = `#### ${c.name} (weight: ${(c.weight * 100).toFixed(0)}%)\n\n${c.description}`;
|
||||
|
||||
if (c.examples) {
|
||||
entry += `\n\n**Good example:** ${c.examples.good}`;
|
||||
entry += `\n**Bad example:** ${c.examples.bad}`;
|
||||
}
|
||||
|
||||
return entry;
|
||||
}).join('\n\n');
|
||||
|
||||
return template
|
||||
.replace('{{RUBRIC_NAME}}', rubric.name)
|
||||
.replace('{{RUBRIC_DESCRIPTION}}', rubric.description)
|
||||
.replace('{{PASSING_THRESHOLD}}', String(Math.round(rubric.passingThreshold * 100)))
|
||||
.replace('{{CRITERIA_LIST}}', criteriaList)
|
||||
.replace('{{OUTPUT}}', output);
|
||||
}
|
||||
|
||||
/** Drop all memoised prompt templates (useful in tests after editing prompt files). */
export function clearPromptCache(): void {
  promptCache.clear();
}
|
||||
127
src/graders/llm/rubric-loader.ts
Normal file
127
src/graders/llm/rubric-loader.ts
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
import { readFileSync, existsSync, readdirSync } from 'fs';
|
||||
import { join, basename } from 'path';
|
||||
import yaml from 'js-yaml';
|
||||
import type { Rubric } from '../types.js';
|
||||
|
||||
const DEFAULT_RUBRICS_DIR = 'rubrics';
|
||||
|
||||
let rubricCache: Map<string, Rubric> = new Map();
|
||||
|
||||
export function loadRubric(nameOrPath: string, rubricsDir: string = DEFAULT_RUBRICS_DIR): Rubric {
|
||||
if (rubricCache.has(nameOrPath)) {
|
||||
return rubricCache.get(nameOrPath)!;
|
||||
}
|
||||
|
||||
let rubricPath: string;
|
||||
|
||||
if (existsSync(nameOrPath)) {
|
||||
rubricPath = nameOrPath;
|
||||
} else {
|
||||
rubricPath = join(rubricsDir, `${nameOrPath}.yaml`);
|
||||
if (!existsSync(rubricPath)) {
|
||||
rubricPath = join(rubricsDir, `${nameOrPath}.yml`);
|
||||
}
|
||||
}
|
||||
|
||||
if (!existsSync(rubricPath)) {
|
||||
throw new Error(`Rubric not found: ${nameOrPath} (searched in ${rubricsDir})`);
|
||||
}
|
||||
|
||||
const content = readFileSync(rubricPath, 'utf-8');
|
||||
const rubric = yaml.load(content) as Rubric;
|
||||
|
||||
validateRubric(rubric);
|
||||
rubricCache.set(nameOrPath, rubric);
|
||||
|
||||
return rubric;
|
||||
}
|
||||
|
||||
export function loadAllRubrics(rubricsDir: string = DEFAULT_RUBRICS_DIR): Map<string, Rubric> {
|
||||
if (!existsSync(rubricsDir)) {
|
||||
return new Map();
|
||||
}
|
||||
|
||||
const files = readdirSync(rubricsDir).filter(f => f.endsWith('.yaml') || f.endsWith('.yml'));
|
||||
const rubrics = new Map<string, Rubric>();
|
||||
|
||||
for (const file of files) {
|
||||
const name = basename(file).replace(/\.(yaml|yml)$/, '');
|
||||
try {
|
||||
const rubric = loadRubric(join(rubricsDir, file));
|
||||
rubrics.set(name, rubric);
|
||||
} catch (e) {
|
||||
console.warn(`Failed to load rubric ${file}:`, e);
|
||||
}
|
||||
}
|
||||
|
||||
return rubrics;
|
||||
}
|
||||
|
||||
function validateRubric(rubric: unknown): asserts rubric is Rubric {
|
||||
if (!rubric || typeof rubric !== 'object') {
|
||||
throw new Error('Rubric must be an object');
|
||||
}
|
||||
|
||||
const r = rubric as Record<string, unknown>;
|
||||
|
||||
if (typeof r.name !== 'string') {
|
||||
throw new Error('Rubric must have a name (string)');
|
||||
}
|
||||
if (typeof r.description !== 'string') {
|
||||
throw new Error('Rubric must have a description (string)');
|
||||
}
|
||||
if (typeof r.passingThreshold !== 'number' || r.passingThreshold < 0 || r.passingThreshold > 1) {
|
||||
throw new Error('Rubric must have a passingThreshold between 0 and 1');
|
||||
}
|
||||
if (!Array.isArray(r.criteria) || r.criteria.length === 0) {
|
||||
throw new Error('Rubric must have at least one criterion');
|
||||
}
|
||||
|
||||
for (const criterion of r.criteria) {
|
||||
validateCriterion(criterion);
|
||||
}
|
||||
|
||||
const totalWeight = (r.criteria as Array<{ weight: number }>).reduce((sum, c) => sum + c.weight, 0);
|
||||
if (Math.abs(totalWeight - 1) > 0.01) {
|
||||
console.warn(`Rubric '${r.name}' weights sum to ${totalWeight}, not 1.0`);
|
||||
}
|
||||
}
|
||||
|
||||
function validateCriterion(criterion: unknown): void {
|
||||
if (!criterion || typeof criterion !== 'object') {
|
||||
throw new Error('Criterion must be an object');
|
||||
}
|
||||
|
||||
const c = criterion as Record<string, unknown>;
|
||||
|
||||
if (typeof c.name !== 'string') {
|
||||
throw new Error('Criterion must have a name');
|
||||
}
|
||||
if (typeof c.description !== 'string') {
|
||||
throw new Error('Criterion must have a description');
|
||||
}
|
||||
if (typeof c.weight !== 'number' || c.weight < 0 || c.weight > 1) {
|
||||
throw new Error('Criterion must have a weight between 0 and 1');
|
||||
}
|
||||
}
|
||||
|
||||
/** Empty the rubric cache so subsequent loadRubric calls re-read from disk. */
export function clearRubricCache(): void {
  rubricCache.clear();
}
|
||||
|
||||
export function formatRubricForPrompt(rubric: Rubric): string {
|
||||
let prompt = `# ${rubric.name}\n\n${rubric.description}\n\nPassing threshold: ${rubric.passingThreshold * 100}%\n\n## Criteria\n\n`;
|
||||
|
||||
for (const criterion of rubric.criteria) {
|
||||
prompt += `### ${criterion.name} (weight: ${criterion.weight * 100}%)\n`;
|
||||
prompt += `${criterion.description}\n`;
|
||||
|
||||
if (criterion.examples) {
|
||||
prompt += `\n**Good example:** ${criterion.examples.good}\n`;
|
||||
prompt += `**Bad example:** ${criterion.examples.bad}\n`;
|
||||
}
|
||||
prompt += '\n';
|
||||
}
|
||||
|
||||
return prompt;
|
||||
}
|
||||
63
src/graders/types.ts
Normal file
63
src/graders/types.ts
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
export interface Rubric {
|
||||
name: string;
|
||||
description: string;
|
||||
criteria: RubricCriterion[];
|
||||
passingThreshold: number;
|
||||
}
|
||||
|
||||
export interface RubricCriterion {
|
||||
name: string;
|
||||
description: string;
|
||||
weight: number;
|
||||
examples?: {
|
||||
good: string;
|
||||
bad: string;
|
||||
};
|
||||
}
|
||||
|
||||
export interface RubricGradingResult {
|
||||
pass: boolean;
|
||||
score: number;
|
||||
reason: string;
|
||||
criterionScores: CriterionScore[];
|
||||
}
|
||||
|
||||
export interface CriterionScore {
|
||||
name: string;
|
||||
score: number;
|
||||
feedback: string;
|
||||
}
|
||||
|
||||
export interface GradeRequest {
|
||||
output: string;
|
||||
rubric: string | Rubric;
|
||||
context?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
export interface GraderOptions {
|
||||
model?: string;
|
||||
maxTokens?: number;
|
||||
rubricsDir?: string;
|
||||
}
|
||||
|
||||
export interface CalibrationExample {
|
||||
input: string;
|
||||
expectedScore: number;
|
||||
expectedFeedback?: string[];
|
||||
}
|
||||
|
||||
export interface CalibrationSet {
|
||||
rubric: string;
|
||||
examples: CalibrationExample[];
|
||||
}
|
||||
|
||||
export interface CalibrationResult {
|
||||
agreement: number;
|
||||
drift: number[];
|
||||
needsAdjustment: boolean;
|
||||
details: {
|
||||
example: CalibrationExample;
|
||||
actualScore: number;
|
||||
difference: number;
|
||||
}[];
|
||||
}
|
||||
|
|
@ -1,2 +1,4 @@
|
|||
export * from './introspector/index.js';
|
||||
export * from './analyzer/index.js';
|
||||
export * from './renderers/index.js';
|
||||
export * from './graders/index.js';
|
||||
|
|
|
|||
96
src/renderers/base.ts
Normal file
96
src/renderers/base.ts
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
||||
import type { RenderOptions, RenderResult, GeneratedFile, ScenarioGroup, RenderStats } from './types.js';
|
||||
|
||||
/**
 * Shared test-file rendering pipeline. Subclasses provide the target
 * language, file extension, and the per-file / per-assertion rendering.
 */
export abstract class BaseRenderer {
  protected options: RenderOptions;

  constructor(options: RenderOptions) {
    this.options = options;
  }

  /** Target language of the generated files. */
  abstract get language(): 'python' | 'typescript';
  /** Extension (including dot) for generated test files. */
  abstract get fileExtension(): string;

  /**
   * Render every renderable scenario in `spec`, one generated file per
   * target module. Scenarios rejected by canRender() (those containing an
   * 'llm-rubric' assertion) are counted in stats.skippedCount.
   */
  async render(spec: EvalSpec): Promise<RenderResult> {
    const groups = this.groupByModule(spec.scenarios);
    const files: GeneratedFile[] = [];
    let assertionCount = 0;
    let skippedCount = 0;

    for (const group of groups) {
      const validScenarios = group.scenarios.filter(s => this.canRender(s));
      skippedCount += group.scenarios.length - validScenarios.length;

      // A module whose scenarios were all skipped produces no file.
      if (validScenarios.length === 0) continue;

      const content = this.renderTestFile(group.module, validScenarios);
      const path = this.getOutputPath(group.module);

      assertionCount += validScenarios.reduce((sum, s) => sum + s.assertions.length, 0);

      files.push({
        path,
        content,
        scenarios: validScenarios.map(s => s.id),
        language: this.language,
      });
    }

    const stats: RenderStats = {
      scenarioCount: spec.scenarios.length - skippedCount,
      fileCount: files.length,
      assertionCount,
      skippedCount,
    };

    return { files, stats };
  }

  /** Bucket scenarios by their target module, preserving encounter order. */
  protected groupByModule(scenarios: EvalScenario[]): ScenarioGroup[] {
    const groups = new Map<string, EvalScenario[]>();

    for (const scenario of scenarios) {
      const module = scenario.target.module;
      if (!groups.has(module)) {
        groups.set(module, []);
      }
      groups.get(module)!.push(scenario);
    }

    return Array.from(groups.entries()).map(([module, scenarios]) => ({
      module,
      scenarios,
    }));
  }

  /** A scenario is renderable unless any assertion requires the LLM grader. */
  protected canRender(scenario: EvalScenario): boolean {
    return scenario.assertions.every(a => a.type !== 'llm-rubric');
  }

  /** Output path: strip the source extension, flatten '/' to '_', prefix test_. */
  protected getOutputPath(modulePath: string): string {
    const baseName = modulePath
      .replace(/\.(py|ts|tsx|js|jsx)$/, '')
      .replace(/\//g, '_');
    return `${this.options.outputDir}/test_${baseName}${this.fileExtension}`;
  }

  /** Render one complete test file for `module` from its scenarios. */
  protected abstract renderTestFile(module: string, scenarios: EvalScenario[]): string;

  /** Render one assertion against the variable holding the call result. */
  protected abstract renderAssertion(assertion: EvalScenario['assertions'][0], resultVar: string): string;

  // NOTE(review): a PascalCase input ("MyModule") yields a leading underscore
  // ("_my_module") — confirm whether generated test names rely on this.
  protected toSnakeCase(str: string): string {
    return str.replace(/-/g, '_').replace(/([A-Z])/g, '_$1').toLowerCase();
  }

  /** kebab-case → camelCase. */
  protected toCamelCase(str: string): string {
    return str.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
  }

  /** Format a value as a source literal for the generated test file. */
  protected formatValue(value: unknown): string {
    if (value === null) return 'null';
    if (value === undefined) return 'undefined';
    if (typeof value === 'string') return JSON.stringify(value);
    if (typeof value === 'number' || typeof value === 'boolean') return String(value);
    return JSON.stringify(value);
  }
}
|
||||
67
src/renderers/index.ts
Normal file
67
src/renderers/index.ts
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import type { EvalSpec } from '../analyzer/types.js';
|
||||
import type { RenderOptions, RenderResult, Framework, GeneratedFile } from './types.js';
|
||||
import { BaseRenderer } from './base.js';
|
||||
import { PytestRenderer } from './python/pytest-renderer.js';
|
||||
import { VitestRenderer, JestRenderer } from './typescript/vitest-renderer.js';
|
||||
import { writeFileSync, mkdirSync } from 'fs';
|
||||
import { dirname } from 'path';
|
||||
|
||||
export type { RenderOptions, RenderResult, GeneratedFile, Framework } from './types.js';
|
||||
export { BaseRenderer } from './base.js';
|
||||
export { PytestRenderer } from './python/pytest-renderer.js';
|
||||
export { VitestRenderer, JestRenderer } from './typescript/vitest-renderer.js';
|
||||
|
||||
const rendererRegistry: Record<Framework, new (options: RenderOptions) => BaseRenderer> = {
|
||||
pytest: PytestRenderer,
|
||||
vitest: VitestRenderer,
|
||||
jest: JestRenderer,
|
||||
};
|
||||
|
||||
export function createRenderer(options: RenderOptions): BaseRenderer {
|
||||
const RendererClass = rendererRegistry[options.framework];
|
||||
if (!RendererClass) {
|
||||
throw new Error(`Unknown framework: ${options.framework}`);
|
||||
}
|
||||
return new RendererClass(options);
|
||||
}
|
||||
|
||||
export async function renderSpec(spec: EvalSpec, options: RenderOptions): Promise<RenderResult> {
|
||||
const renderer = createRenderer(options);
|
||||
const result = await renderer.render(spec);
|
||||
|
||||
if (!options.dryRun) {
|
||||
for (const file of result.files) {
|
||||
mkdirSync(dirname(file.path), { recursive: true });
|
||||
writeFileSync(file.path, file.content, 'utf-8');
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
export async function renderIncremental(
|
||||
spec: EvalSpec,
|
||||
options: RenderOptions,
|
||||
changedFiles: string[]
|
||||
): Promise<RenderResult> {
|
||||
const filteredSpec: EvalSpec = {
|
||||
...spec,
|
||||
scenarios: spec.scenarios.filter(s =>
|
||||
changedFiles.some(f => s.target.module.includes(f))
|
||||
),
|
||||
};
|
||||
return renderSpec(filteredSpec, options);
|
||||
}
|
||||
|
||||
export function detectFramework(spec: EvalSpec): Framework {
|
||||
const languages = spec.repo.languages;
|
||||
|
||||
if (languages.includes('python')) {
|
||||
return 'pytest';
|
||||
}
|
||||
if (languages.includes('typescript') || languages.includes('javascript')) {
|
||||
return 'vitest';
|
||||
}
|
||||
|
||||
return 'vitest';
|
||||
}
|
||||
104
src/renderers/python/assertions.ts
Normal file
104
src/renderers/python/assertions.ts
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
import type { Assertion } from '../../analyzer/types.js';
|
||||
|
||||
export function renderPytestAssertion(assertion: Assertion, resultVar: string): string {
|
||||
const target = getTargetExpression(assertion, resultVar);
|
||||
|
||||
switch (assertion.type) {
|
||||
case 'equals':
|
||||
return `assert ${target} == ${formatPythonValue(assertion.expected)}`;
|
||||
|
||||
case 'contains':
|
||||
return `assert ${formatPythonValue(assertion.value)} in ${target}`;
|
||||
|
||||
case 'typeof':
|
||||
return `assert isinstance(${target}, ${mapPythonType(assertion.expected)})`;
|
||||
|
||||
case 'matches':
|
||||
return `assert re.match(${formatPythonValue(assertion.pattern)}, ${target})`;
|
||||
|
||||
case 'throws':
|
||||
return ''; // Handled specially in test structure
|
||||
|
||||
case 'truthy':
|
||||
return `assert ${target}`;
|
||||
|
||||
case 'falsy':
|
||||
return `assert not ${target}`;
|
||||
|
||||
case 'custom':
|
||||
return `assert ${assertion.check} # ${assertion.description}`;
|
||||
|
||||
case 'llm-rubric':
|
||||
return `# LLM Rubric: ${assertion.rubric} - skipped (requires grader)`;
|
||||
|
||||
default:
|
||||
return `# Unknown assertion type: ${(assertion as Assertion).type}`;
|
||||
}
|
||||
}
|
||||
|
||||
function getTargetExpression(assertion: Assertion, resultVar: string): string {
|
||||
if ('path' in assertion && assertion.path) {
|
||||
const path = assertion.path;
|
||||
if (path.startsWith('[')) {
|
||||
return `${resultVar}${path}`;
|
||||
}
|
||||
return `${resultVar}["${path}"]`;
|
||||
}
|
||||
return resultVar;
|
||||
}
|
||||
|
||||
function formatPythonValue(value: unknown): string {
|
||||
if (value === null) return 'None';
|
||||
if (value === undefined) return 'None';
|
||||
if (typeof value === 'boolean') return value ? 'True' : 'False';
|
||||
if (typeof value === 'string') return JSON.stringify(value);
|
||||
if (typeof value === 'number') return String(value);
|
||||
if (Array.isArray(value)) {
|
||||
return `[${value.map(formatPythonValue).join(', ')}]`;
|
||||
}
|
||||
if (typeof value === 'object') {
|
||||
const entries = Object.entries(value)
|
||||
.map(([k, v]) => `"${k}": ${formatPythonValue(v)}`)
|
||||
.join(', ');
|
||||
return `{${entries}}`;
|
||||
}
|
||||
return String(value);
|
||||
}
|
||||
|
||||
function mapPythonType(tsType: string): string {
|
||||
const typeMap: Record<string, string> = {
|
||||
'string': 'str',
|
||||
'number': '(int, float)',
|
||||
'boolean': 'bool',
|
||||
'object': 'dict',
|
||||
'array': 'list',
|
||||
'null': 'type(None)',
|
||||
'undefined': 'type(None)',
|
||||
};
|
||||
return typeMap[tsType] || tsType;
|
||||
}
|
||||
|
||||
export function renderThrowsContext(assertion: Assertion): { contextManager: string; exceptionType: string } | null {
|
||||
if (assertion.type !== 'throws') return null;
|
||||
|
||||
const exceptionType = assertion.errorType || 'Exception';
|
||||
let contextManager = `pytest.raises(${exceptionType})`;
|
||||
|
||||
if (assertion.messageContains) {
|
||||
contextManager = `pytest.raises(${exceptionType}, match=${formatPythonValue(assertion.messageContains)})`;
|
||||
}
|
||||
|
||||
return { contextManager, exceptionType };
|
||||
}
|
||||
|
||||
export function formatPythonArgs(args: Record<string, unknown>): string {
|
||||
return Object.entries(args)
|
||||
.map(([_, value]) => formatPythonValue(value))
|
||||
.join(', ');
|
||||
}
|
||||
|
||||
export function formatPythonKwargs(kwargs: Record<string, unknown>): string {
|
||||
return Object.entries(kwargs)
|
||||
.map(([key, value]) => `${key}=${formatPythonValue(value)}`)
|
||||
.join(', ');
|
||||
}
|
||||
160
src/renderers/python/pytest-renderer.ts
Normal file
160
src/renderers/python/pytest-renderer.ts
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
import type { EvalScenario, Assertion } from '../../analyzer/types.js';
|
||||
import { BaseRenderer } from '../base.js';
|
||||
import { renderPytestAssertion, renderThrowsContext, formatPythonArgs, formatPythonKwargs } from './assertions.js';
|
||||
|
||||
export class PytestRenderer extends BaseRenderer {
|
||||
get language(): 'python' {
|
||||
return 'python';
|
||||
}
|
||||
|
||||
get fileExtension(): string {
|
||||
return '.py';
|
||||
}
|
||||
|
||||
protected renderTestFile(module: string, scenarios: EvalScenario[]): string {
|
||||
const imports = this.generateImports(module, scenarios);
|
||||
const fixtures = this.options.includeFixtures ? this.generateFixtures(scenarios) : '';
|
||||
const tests = scenarios.map(s => this.renderTest(s)).join('\n\n');
|
||||
|
||||
return `${imports}\n\n${fixtures}${tests}\n`;
|
||||
}
|
||||
|
||||
private generateImports(module: string, scenarios: EvalScenario[]): string {
|
||||
const imports: string[] = [
|
||||
'import pytest',
|
||||
];
|
||||
|
||||
const hasRegex = scenarios.some(s =>
|
||||
s.assertions.some(a => a.type === 'matches')
|
||||
);
|
||||
if (hasRegex) {
|
||||
imports.push('import re');
|
||||
}
|
||||
|
||||
const hasMocks = scenarios.some(s => s.setup?.mocks?.length);
|
||||
if (hasMocks || this.options.generateMocks) {
|
||||
imports.push('from unittest.mock import patch, MagicMock');
|
||||
}
|
||||
|
||||
const functions = [...new Set(scenarios.map(s => s.target.function))];
|
||||
const modulePath = module.replace(/\.(py|ts|tsx|js|jsx)$/, '').replace(/\//g, '.');
|
||||
imports.push(`from ${modulePath} import ${functions.join(', ')}`);
|
||||
|
||||
return imports.join('\n');
|
||||
}
|
||||
|
||||
private generateFixtures(scenarios: EvalScenario[]): string {
|
||||
const fixtureNames = new Set<string>();
|
||||
for (const scenario of scenarios) {
|
||||
if (scenario.setup?.fixtures) {
|
||||
for (const fixture of scenario.setup.fixtures) {
|
||||
fixtureNames.add(fixture);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fixtureNames.size === 0) return '';
|
||||
|
||||
const fixtures = Array.from(fixtureNames).map(name => `
|
||||
@pytest.fixture
|
||||
def ${name}():
|
||||
# TODO: Implement fixture
|
||||
pass
|
||||
`).join('\n');
|
||||
|
||||
return fixtures + '\n';
|
||||
}
|
||||
|
||||
private renderTest(scenario: EvalScenario): string {
|
||||
const testName = `test_${this.toSnakeCase(scenario.id)}`;
|
||||
const docstring = scenario.description ? ` """${scenario.description}"""\n` : '';
|
||||
|
||||
const throwsAssertion = scenario.assertions.find(a => a.type === 'throws');
|
||||
const regularAssertions = scenario.assertions.filter(a => a.type !== 'throws');
|
||||
|
||||
const fixtureParams = scenario.setup?.fixtures?.join(', ') || '';
|
||||
const funcParams = fixtureParams ? `(${fixtureParams})` : '()';
|
||||
|
||||
let body: string;
|
||||
if (throwsAssertion) {
|
||||
body = this.renderThrowsTest(scenario, throwsAssertion);
|
||||
} else {
|
||||
body = this.renderRegularTest(scenario, regularAssertions);
|
||||
}
|
||||
|
||||
const mocks = this.renderMocks(scenario);
|
||||
if (mocks) {
|
||||
body = mocks.decorators + `def ${testName}${funcParams}:\n${docstring}${mocks.setup}${body}`;
|
||||
} else {
|
||||
body = `def ${testName}${funcParams}:\n${docstring}${body}`;
|
||||
}
|
||||
|
||||
return body;
|
||||
}
|
||||
|
||||
private renderRegularTest(scenario: EvalScenario, assertions: Assertion[]): string {
|
||||
const funcCall = this.renderFunctionCall(scenario);
|
||||
const assertionLines = assertions
|
||||
.map(a => this.renderAssertion(a, 'result'))
|
||||
.filter(Boolean)
|
||||
.map(a => ` ${a}`)
|
||||
.join('\n');
|
||||
|
||||
return ` result = ${funcCall}\n${assertionLines}`;
|
||||
}
|
||||
|
||||
private renderThrowsTest(scenario: EvalScenario, throwsAssertion: Assertion): string {
|
||||
const ctx = renderThrowsContext(throwsAssertion);
|
||||
if (!ctx) return this.renderRegularTest(scenario, scenario.assertions);
|
||||
|
||||
const funcCall = this.renderFunctionCall(scenario);
|
||||
return ` with ${ctx.contextManager}:\n ${funcCall}`;
|
||||
}
|
||||
|
||||
private renderFunctionCall(scenario: EvalScenario): string {
|
||||
const func = scenario.target.function;
|
||||
const args = formatPythonArgs(scenario.input.args);
|
||||
const kwargs = scenario.input.kwargs ? formatPythonKwargs(scenario.input.kwargs) : '';
|
||||
|
||||
const allArgs = [args, kwargs].filter(Boolean).join(', ');
|
||||
return `${func}(${allArgs})`;
|
||||
}
|
||||
|
||||
private renderMocks(scenario: EvalScenario): { decorators: string; setup: string } | null {
|
||||
if (!scenario.setup?.mocks?.length && !this.options.generateMocks) return null;
|
||||
|
||||
const mocks = scenario.setup?.mocks || [];
|
||||
if (mocks.length === 0) return null;
|
||||
|
||||
const decorators = mocks
|
||||
.map((m, i) => `@patch("${m.target}")\n`)
|
||||
.join('');
|
||||
|
||||
const setup = mocks
|
||||
.map((m, i) => {
|
||||
const mockName = `mock_${i}`;
|
||||
if (m.returnValue !== undefined) {
|
||||
return ` ${mockName}.return_value = ${this.formatValue(m.returnValue)}\n`;
|
||||
}
|
||||
if (m.sideEffect) {
|
||||
return ` ${mockName}.side_effect = ${m.sideEffect}\n`;
|
||||
}
|
||||
return '';
|
||||
})
|
||||
.join('');
|
||||
|
||||
return { decorators, setup };
|
||||
}
|
||||
|
||||
protected renderAssertion(assertion: Assertion, resultVar: string): string {
|
||||
return renderPytestAssertion(assertion, resultVar);
|
||||
}
|
||||
|
||||
protected formatValue(value: unknown): string {
|
||||
if (value === null) return 'None';
|
||||
if (value === undefined) return 'None';
|
||||
if (typeof value === 'boolean') return value ? 'True' : 'False';
|
||||
if (typeof value === 'string') return JSON.stringify(value);
|
||||
return String(value);
|
||||
}
|
||||
}
|
||||
40
src/renderers/types.ts
Normal file
40
src/renderers/types.ts
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';

/** Test frameworks a renderer can emit tests for. */
export type Framework = 'pytest' | 'vitest' | 'jest';

/** Options controlling how scenarios are rendered into test files. */
export interface RenderOptions {
  /** Directory the generated files are written to — presumably by the caller; not used by renderers in this view. */
  outputDir: string;
  /** Target test framework. */
  framework: Framework;
  /** When true, stub fixture definitions are emitted for fixtures referenced by scenarios. */
  includeFixtures: boolean;
  /** When true, mock-support imports are emitted even if no scenario declares mocks. */
  generateMocks: boolean;
  /** NOTE(review): presumably "render without writing files" — confirm against the command implementation. */
  dryRun?: boolean;
}

/** The outcome of a render: generated files plus summary statistics. */
export interface RenderResult {
  files: GeneratedFile[];
  stats: RenderStats;
}

/** Summary counters for one render run. */
export interface RenderStats {
  scenarioCount: number;
  fileCount: number;
  assertionCount: number;
  skippedCount: number;
}

/** One generated test file and the scenarios it covers. */
export interface GeneratedFile {
  path: string;
  content: string;
  /** Scenario identifiers rendered into this file — presumably EvalScenario.id; verify at call sites. */
  scenarios: string[];
  language: 'python' | 'typescript';
}

/** Everything a renderer needs: the spec to render and the options to render with. */
export interface RendererContext {
  spec: EvalSpec;
  options: RenderOptions;
}

/** Scenarios grouped by the source module they target. */
export interface ScenarioGroup {
  module: string;
  scenarios: EvalScenario[];
}
|
||||
114
src/renderers/typescript/assertions.ts
Normal file
114
src/renderers/typescript/assertions.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import type { Assertion } from '../../analyzer/types.js';

/** TypeScript test frameworks supported by this assertion renderer. */
export type TSFramework = 'vitest' | 'jest';
|
||||
|
||||
export function renderTSAssertion(assertion: Assertion, resultVar: string, framework: TSFramework): string {
|
||||
const target = getTargetExpression(assertion, resultVar);
|
||||
|
||||
switch (assertion.type) {
|
||||
case 'equals':
|
||||
if (typeof assertion.expected === 'object' && assertion.expected !== null) {
|
||||
return `expect(${target}).toEqual(${formatTSValue(assertion.expected)});`;
|
||||
}
|
||||
return `expect(${target}).toBe(${formatTSValue(assertion.expected)});`;
|
||||
|
||||
case 'contains':
|
||||
return `expect(${target}).toContain(${formatTSValue(assertion.value)});`;
|
||||
|
||||
case 'typeof':
|
||||
return renderTypeofAssertion(target, assertion.expected, framework);
|
||||
|
||||
case 'matches':
|
||||
return `expect(${target}).toMatch(${formatTSValue(assertion.pattern)});`;
|
||||
|
||||
case 'throws':
|
||||
return ''; // Handled specially in test structure
|
||||
|
||||
case 'truthy':
|
||||
return `expect(${target}).toBeTruthy();`;
|
||||
|
||||
case 'falsy':
|
||||
return `expect(${target}).toBeFalsy();`;
|
||||
|
||||
case 'custom':
|
||||
return `expect(${assertion.check}).toBe(true); // ${assertion.description}`;
|
||||
|
||||
case 'llm-rubric':
|
||||
return `// LLM Rubric: ${assertion.rubric} - skipped (requires grader)`;
|
||||
|
||||
default:
|
||||
return `// Unknown assertion type: ${(assertion as Assertion).type}`;
|
||||
}
|
||||
}
|
||||
|
||||
function renderTypeofAssertion(target: string, expected: string, _framework: TSFramework): string {
|
||||
switch (expected) {
|
||||
case 'array':
|
||||
return `expect(Array.isArray(${target})).toBe(true);`;
|
||||
case 'null':
|
||||
return `expect(${target}).toBeNull();`;
|
||||
case 'undefined':
|
||||
return `expect(${target}).toBeUndefined();`;
|
||||
case 'object':
|
||||
return `expect(typeof ${target}).toBe('object');`;
|
||||
default:
|
||||
return `expect(typeof ${target}).toBe('${expected}');`;
|
||||
}
|
||||
}
|
||||
|
||||
function getTargetExpression(assertion: Assertion, resultVar: string): string {
|
||||
if ('path' in assertion && assertion.path) {
|
||||
const path = assertion.path;
|
||||
if (path.startsWith('[')) {
|
||||
return `${resultVar}${path}`;
|
||||
}
|
||||
if (path.includes('.')) {
|
||||
return `${resultVar}.${path}`;
|
||||
}
|
||||
return `${resultVar}['${path}']`;
|
||||
}
|
||||
return resultVar;
|
||||
}
|
||||
|
||||
function formatTSValue(value: unknown): string {
|
||||
if (value === null) return 'null';
|
||||
if (value === undefined) return 'undefined';
|
||||
if (typeof value === 'string') return JSON.stringify(value);
|
||||
if (typeof value === 'number' || typeof value === 'boolean') return String(value);
|
||||
if (Array.isArray(value)) {
|
||||
return `[${value.map(formatTSValue).join(', ')}]`;
|
||||
}
|
||||
if (typeof value === 'object') {
|
||||
const entries = Object.entries(value)
|
||||
.map(([k, v]) => `${k}: ${formatTSValue(v)}`)
|
||||
.join(', ');
|
||||
return `{ ${entries} }`;
|
||||
}
|
||||
return String(value);
|
||||
}
|
||||
|
||||
export function renderThrowsExpectation(
|
||||
funcCall: string,
|
||||
assertion: Assertion,
|
||||
isAsync: boolean
|
||||
): string {
|
||||
if (assertion.type !== 'throws') return '';
|
||||
|
||||
const expectFn = isAsync
|
||||
? `await expect(async () => ${funcCall})`
|
||||
: `expect(() => ${funcCall})`;
|
||||
|
||||
const throwMatcher = isAsync ? 'rejects.toThrow' : 'toThrow';
|
||||
|
||||
if (assertion.errorType) {
|
||||
return `${expectFn}.${throwMatcher}(${assertion.errorType});`;
|
||||
}
|
||||
if (assertion.messageContains) {
|
||||
return `${expectFn}.${throwMatcher}(${formatTSValue(assertion.messageContains)});`;
|
||||
}
|
||||
return `${expectFn}.${throwMatcher}();`;
|
||||
}
|
||||
|
||||
export function formatTSArgs(args: Record<string, unknown>): string {
|
||||
return Object.values(args).map(formatTSValue).join(', ');
|
||||
}
|
||||
152
src/renderers/typescript/vitest-renderer.ts
Normal file
152
src/renderers/typescript/vitest-renderer.ts
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
import type { EvalScenario, Assertion } from '../../analyzer/types.js';
|
||||
import { BaseRenderer } from '../base.js';
|
||||
import { renderTSAssertion, renderThrowsExpectation, formatTSArgs } from './assertions.js';
|
||||
|
||||
/**
 * Renders EvalScenarios as a vitest test file: an import block followed by
 * one describe() per module containing one it() per scenario.
 */
export class VitestRenderer extends BaseRenderer {
  get language(): 'typescript' {
    return 'typescript';
  }

  get fileExtension(): string {
    return '.test.ts';
  }

  /** Assemble the import block and describe() suite into one file. */
  protected renderTestFile(module: string, scenarios: EvalScenario[]): string {
    const imports = this.generateImports(module, scenarios);
    const describes = this.generateDescribe(module, scenarios);
    return `${imports}\n\n${describes}\n`;
  }

  /** Build imports: vitest globals, conditional `vi`, then the module under test. */
  protected generateImports(module: string, scenarios: EvalScenario[]): string {
    const imports: string[] = [
      `import { describe, it, expect } from 'vitest';`,
    ];

    const hasMocks = scenarios.some(s => s.setup?.mocks?.length);
    if (hasMocks || this.options.generateMocks) {
      imports.push(`import { vi } from 'vitest';`);
    }

    const functions = [...new Set(scenarios.map(s => s.target.function))];
    const modulePath = module.replace(/\.(ts|tsx|js|jsx)$/, '');
    imports.push(`import { ${functions.join(', ')} } from '${modulePath}';`);

    return imports.join('\n');
  }

  /** Wrap all scenario tests in a describe() named after the module's base name. */
  protected generateDescribe(module: string, scenarios: EvalScenario[]): string {
    const moduleName = module.split('/').pop()?.replace(/\.(ts|tsx|js|jsx)$/, '') || module;
    const tests = scenarios.map(s => this.renderTest(s)).join('\n\n');
    return `describe('${moduleName}', () => {\n${tests}\n});`;
  }

  /**
   * Render one it() block. 'throws' scenarios wrap the call in a throw
   * expectation; otherwise the result is bound and asserted. Mock setup, if
   * any, is prepended inside the test body.
   */
  protected renderTest(scenario: EvalScenario): string {
    const testName = scenario.name || scenario.id;
    const isAsync = this.hasAsyncTarget(scenario);
    const asyncPrefix = isAsync ? 'async ' : '';

    const throwsAssertion = scenario.assertions.find(a => a.type === 'throws');
    const regularAssertions = scenario.assertions.filter(a => a.type !== 'throws');

    let body: string;
    if (throwsAssertion) {
      body = this.renderThrowsTest(scenario, throwsAssertion, isAsync);
    } else {
      body = this.renderRegularTest(scenario, regularAssertions, isAsync);
    }

    const mocks = this.renderMocks(scenario);
    const mockSetup = mocks ? `\n${mocks}` : '';

    return `  it('${testName}', ${asyncPrefix}() => {${mockSetup}
${body}
  });`;
  }

  /**
   * Heuristic: treat the target as async when its name starts with 'async'
   * or the scenario carries an 'async' tag.
   * NOTE(review): when `tags` is undefined this expression evaluates to
   * undefined rather than false — harmless at runtime (falsy) but it does
   * not strictly satisfy the declared boolean return type; confirm tsconfig.
   */
  protected hasAsyncTarget(scenario: EvalScenario): boolean {
    return scenario.target.type === 'function' &&
      (scenario.target.function.startsWith('async') ||
        scenario.tags?.includes('async'));
  }

  /** Bind `result` (awaited when async) and emit one expect line per assertion. */
  protected renderRegularTest(scenario: EvalScenario, assertions: Assertion[], isAsync: boolean): string {
    const funcCall = this.renderFunctionCall(scenario);
    const awaitPrefix = isAsync ? 'await ' : '';

    const assertionLines = assertions
      .map(a => this.renderAssertion(a, 'result'))
      .filter(Boolean)
      .map(a => `    ${a}`)
      .join('\n');

    return `    const result = ${awaitPrefix}${funcCall};\n${assertionLines}`;
  }

  /** Delegate to the shared throw-expectation helper, indented for the test body. */
  protected renderThrowsTest(scenario: EvalScenario, throwsAssertion: Assertion, isAsync: boolean): string {
    const funcCall = this.renderFunctionCall(scenario);
    return `    ${renderThrowsExpectation(funcCall, throwsAssertion, isAsync)}`;
  }

  /** Render `func(args...)` for the scenario's target. */
  protected renderFunctionCall(scenario: EvalScenario): string {
    const func = scenario.target.function;
    const args = formatTSArgs(scenario.input.args);
    return `${func}(${args})`;
  }

  /**
   * Emit vi.mock(...) lines for the scenario's mocks, or null when none.
   * NOTE(review): Vitest hoists vi.mock calls to module scope, so emitting
   * them inside an it() body likely does not scope the mock per-test, and
   * the returnValue factory `() => (<json>)` may not return a valid module
   * exports object — verify against Vitest's vi.mock docs; vi.doMock or
   * top-level mocks may be needed.
   */
  protected renderMocks(scenario: EvalScenario): string | null {
    if (!scenario.setup?.mocks?.length) return null;

    return scenario.setup.mocks
      .map(m => {
        if (m.returnValue !== undefined) {
          return `    vi.mock('${m.target}', () => (${JSON.stringify(m.returnValue)}));`;
        }
        if (m.sideEffect) {
          return `    vi.mock('${m.target}', () => { throw new Error('${m.sideEffect}'); });`;
        }
        return `    vi.mock('${m.target}');`;
      })
      .join('\n');
  }

  /** Delegate assertion rendering to the shared TS assertion helper (vitest flavor). */
  protected renderAssertion(assertion: Assertion, resultVar: string): string {
    return renderTSAssertion(assertion, resultVar, 'vitest');
  }
}
|
||||
|
||||
/**
 * Jest variant of the renderer: inherits all test-structure logic from
 * VitestRenderer and overrides only the import block, mock syntax, and
 * assertion flavor. Jest's describe/it/expect are globals, so no test
 * framework import is emitted.
 */
export class JestRenderer extends VitestRenderer {
  /** Build imports: conditional `jest` import plus the module under test. */
  protected generateImports(module: string, scenarios: EvalScenario[]): string {
    const imports: string[] = [];

    const hasMocks = scenarios.some(s => s.setup?.mocks?.length);
    if (hasMocks || this.options.generateMocks) {
      imports.push(`import { jest } from '@jest/globals';`);
    }

    const functions = [...new Set(scenarios.map(s => s.target.function))];
    const modulePath = module.replace(/\.(ts|tsx|js|jsx)$/, '');
    imports.push(`import { ${functions.join(', ')} } from '${modulePath}';`);

    return imports.join('\n');
  }

  /**
   * Emit jest.mock(...) lines for the scenario's mocks, or null when none.
   * NOTE(review): babel-jest only hoists top-level jest.mock calls; calling
   * jest.mock inside a test body after the module import likely has no
   * effect on already-imported bindings — verify against Jest's module
   * mocking docs.
   */
  protected renderMocks(scenario: EvalScenario): string | null {
    if (!scenario.setup?.mocks?.length) return null;

    return scenario.setup.mocks
      .map(m => {
        if (m.returnValue !== undefined) {
          return `    jest.mock('${m.target}', () => (${JSON.stringify(m.returnValue)}));`;
        }
        if (m.sideEffect) {
          return `    jest.mock('${m.target}', () => { throw new Error('${m.sideEffect}'); });`;
        }
        return `    jest.mock('${m.target}');`;
      })
      .join('\n');
  }

  /** Delegate assertion rendering to the shared TS assertion helper (jest flavor). */
  protected renderAssertion(assertion: Assertion, resultVar: string): string {
    return renderTSAssertion(assertion, resultVar, 'jest');
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue