evaluclaude-harness/src/promptfoo/config-generator.ts

import { writeFile, mkdir } from 'fs/promises';
import { dirname } from 'path';
import * as yaml from 'js-yaml';
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
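
/** Options controlling where generated tests live and how they are executed. */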
export interface ConfigOptions {
  testDir: string;
  outputPath: string;
  framework: 'pytest' | 'vitest' | 'jest';
  includeTraceLinks: boolean;
}
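
/**
 * Builds the promptfoo config for an eval spec, writes it to
 * options.outputPath as YAML, and returns the serialized content.
 */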
export async function generatePromptfooConfig(
  spec: EvalSpec,
  options: ConfigOptions
): Promise<string> {
  const config = buildConfig(spec, options);
  const yamlContent = yaml.dump(config, {
    lineWidth: 120,
    quotingType: '"',
  });
  await mkdir(dirname(options.outputPath), { recursive: true });
  await writeFile(options.outputPath, yamlContent);
  return yamlContent;
}
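
/**
 * Assembles the top-level promptfoo config. The "prompt" is just the
 * scenario id, which the Python test-runner provider uses to select which
 * tests to run. Trace-file metadata is attached via defaultTest when
 * includeTraceLinks is enabled.
 */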
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
  const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
  return {
    description: `Evaluclaude functional tests for ${spec.repo.name}`,
    providers: [
      {
        id: 'file://providers/test-runner.py',
        label: 'functional-tests',
        config: {
          test_dir: options.testDir,
          framework: options.framework,
          timeout: 300,
        },
      },
    ],
    prompts: ['{{scenario_id}}'],
    tests,
    // Spread conditionally instead of assigning undefined, so yaml.dump
    // never sees an undefined defaultTest value.
    ...(options.includeTraceLinks
      ? {
          defaultTest: {
            metadata: {
              traceFile: '.evaluclaude/traces/{{evalId}}.json',
            },
          },
        }
      : {}),
    outputPath: '.evaluclaude/results/promptfoo-results.json',
  };
}
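
/**
 * Converts one EvalScenario into a promptfoo test case. llm-rubric
 * assertions are split out from the structural ones because they carry a
 * rubric string and a passing threshold (defaulting to 0.7).
 */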
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
  const assertions = scenario.assertions
    .filter(a => a.type !== 'llm-rubric')
    .map(a => buildAssertion(a));
  const llmRubrics = scenario.assertions
    .filter(a => a.type === 'llm-rubric')
    .map(a => ({
      type: 'llm-rubric' as const,
      value: (a as any).rubric,
      threshold: (a as any).passingThreshold ?? 0.7,
    }));
  return {
    description: scenario.description,
    vars: {
      scenario_id: scenario.id,
      target_module: scenario.target.module,
      target_function: scenario.target.function,
      input_args: scenario.input.args,
      input_kwargs: scenario.input.kwargs,
    },
    assert: [...assertions, ...llmRubrics],
    metadata: {
      category: scenario.category,
      priority: scenario.priority,
      tags: scenario.tags,
    },
  };
}
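
/**
 * Maps an analyzer assertion onto a promptfoo assertion. 'equals',
 * 'contains', and 'matches' use promptfoo's built-in assertion types; the
 * rest compile to inline Python expressions evaluated against the provider
 * output. Unknown types fall back to an always-passing check.
 */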
function buildAssertion(assertion: any): PromptfooAssertion {
  switch (assertion.type) {
    case 'equals':
      return {
        type: 'equals',
        value: assertion.expected,
      };
    case 'contains':
      return {
        type: 'contains',
        value: assertion.value,
      };
    case 'matches':
      return {
        type: 'regex',
        value: assertion.pattern,
      };
    case 'typeof':
      return {
        type: 'python',
        value: `type(output).__name__ == '${assertion.expected}'`,
      };
    case 'throws':
      return {
        type: 'python',
        value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))`,
      };
    case 'truthy':
      return {
        type: 'python',
        value: 'bool(output)',
      };
    case 'falsy':
      return {
        type: 'python',
        value: 'not bool(output)',
      };
    case 'custom':
      return {
        type: 'python',
        value: assertion.check,
      };
    default:
      return {
        type: 'python',
        value: 'True',
      };
  }
}
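
/**
 * Writes the Python provider script that promptfoo invokes to execute the
 * generated test suite and return results as structured JSON.
 */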
export async function generateTestProvider(outputPath: string): Promise<void> {
  const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results."""
import subprocess
import json


def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
    """Runs tests and returns structured results."""
    test_dir = options.get('config', {}).get('test_dir', './tests')
    framework = options.get('config', {}).get('framework', 'pytest')
    timeout = options.get('config', {}).get('timeout', 300)
    scenario_id = prompt.strip()
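
    # Dispatch on the configured framework. The pytest branch relies on the
    # pytest-json-report plugin for the --json-report flags used below.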
    try:
        if framework == 'pytest':
            result = subprocess.run(
                [
                    'python', '-m', 'pytest',
                    '--json-report',
                    '--json-report-file=/tmp/pytest_results.json',
                    '-k', scenario_id,
                    test_dir
                ],
                capture_output=True,
                text=True,
                timeout=timeout
            )
            try:
                with open('/tmp/pytest_results.json') as f:
                    report = json.load(f)
                output = {
                    'passed': report.get('summary', {}).get('passed', 0),
                    'failed': report.get('summary', {}).get('failed', 0),
                    'skipped': report.get('summary', {}).get('skipped', 0),
                    'tests': report.get('tests', []),
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                    'exit_code': result.returncode,
                }
            except FileNotFoundError:
                output = {
                    'passed': 0,
                    'failed': 1,
                    'error': 'Failed to generate pytest report',
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                }
        elif framework in ('vitest', 'jest'):
            # The two runners take different flags: vitest has a 'run'
            # subcommand and a JSON reporter, while jest emits JSON via --json.
            if framework == 'vitest':
                cmd = ['npx', 'vitest', 'run', '--reporter=json']
            else:
                cmd = ['npx', 'jest', '--json']
            if scenario_id:
                cmd.extend(['--testNamePattern', scenario_id])
            cmd.append(test_dir)
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            try:
                report = json.loads(result.stdout)
                output = {
                    'passed': report.get('numPassedTests', 0),
                    'failed': report.get('numFailedTests', 0),
                    # jest-style JSON reports count skipped tests as "pending"
                    'skipped': report.get('numPendingTests', 0),
                    'tests': report.get('testResults', []),
                    'exit_code': result.returncode,
                }
            except json.JSONDecodeError:
                output = {
                    'passed': 0,
                    'failed': 1,
                    'error': 'Failed to parse test output',
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                }
        else:
            output = {'error': f'Unknown framework: {framework}'}
        return {
            'output': json.dumps(output),
            'error': None,
        }
    except subprocess.TimeoutExpired:
        return {
            'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
            'error': None,
        }
    except Exception as e:
        return {
            'output': None,
            'error': str(e),
        }


if __name__ == '__main__':
    # For testing the provider directly
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scenario', default='')
    parser.add_argument('--test-dir', default='./tests')
    parser.add_argument('--framework', default='pytest')
    args = parser.parse_args()
    result = get_provider_response(
        args.scenario,
        {'config': {'test_dir': args.test_dir, 'framework': args.framework}},
        {}
    )
    print(json.dumps(result, indent=2))
`;
  await mkdir(dirname(outputPath), { recursive: true });
  await writeFile(outputPath, providerCode);
}