mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-15 09:01:15 +00:00
271 lines
7.7 KiB
TypeScript
271 lines
7.7 KiB
TypeScript
import { writeFile, mkdir } from 'fs/promises';
|
|
import { dirname, join } from 'path';
|
|
import * as yaml from 'js-yaml';
|
|
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
|
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
|
|
|
/**
 * Settings that control how the promptfoo configuration is generated.
 */
export interface ConfigOptions {
  /** Directory containing the tests; forwarded to the provider as `test_dir`. */
  testDir: string;

  /** File path the generated YAML configuration is written to. */
  outputPath: string;

  /** Test framework name forwarded to the provider's `framework` setting. */
  framework: 'pytest' | 'vitest' | 'jest';

  /** When true, a `defaultTest` entry carrying a trace-file metadata template is emitted. */
  includeTraceLinks: boolean;
}
|
|
|
|
/**
 * Builds the promptfoo configuration for `spec`, serializes it to YAML and
 * writes it to `options.outputPath`.
 *
 * The parent directory of the output path is created (recursively) before the
 * file is written.
 *
 * @param spec - Evaluation spec whose scenarios become promptfoo tests.
 * @param options - Generation settings, including the destination path.
 * @returns The YAML text that was written to disk.
 */
export async function generatePromptfooConfig(
  spec: EvalSpec,
  options: ConfigOptions
): Promise<string> {
  const serialized = yaml.dump(buildConfig(spec, options), {
    lineWidth: 120,
    quotingType: '"',
  });

  const destination = options.outputPath;
  await mkdir(dirname(destination), { recursive: true });
  await writeFile(destination, serialized);

  return serialized;
}
|
|
|
|
/**
 * Assembles the in-memory promptfoo configuration for an eval spec.
 *
 * A single Python test-runner provider is registered, every scenario is
 * converted into one promptfoo test, and the prompt is just the scenario id
 * template.
 *
 * @param spec - Source of the repo name and the scenarios.
 * @param options - Supplies the provider settings and the trace-link switch.
 * @returns The configuration object, with keys in the order they are serialized.
 */
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
  const runnerSettings = {
    test_dir: options.testDir,
    framework: options.framework,
    timeout: 300,
  };

  // The key is always present; it stays `undefined` when trace links are disabled.
  let defaultTest: PromptfooConfig['defaultTest'] = undefined;
  if (options.includeTraceLinks) {
    defaultTest = {
      metadata: {
        traceFile: '.evaluclaude/traces/{{evalId}}.json',
      },
    };
  }

  return {
    description: `Evaluclaude functional tests for ${spec.repo.name}`,
    providers: [
      {
        id: 'file://providers/test-runner.py',
        label: 'functional-tests',
        config: runnerSettings,
      },
    ],
    prompts: ['{{scenario_id}}'],
    tests: spec.scenarios.map(scenario => buildTest(scenario, options)),
    defaultTest,
    outputPath: '.evaluclaude/results/promptfoo-results.json',
  };
}
|
|
|
|
/**
 * Converts one eval scenario into a promptfoo test case.
 *
 * Assertions of type `llm-rubric` are emitted as promptfoo `llm-rubric`
 * assertions (threshold defaults to 0.7 when `passingThreshold` is null or
 * undefined); every other assertion goes through `buildAssertion`. In the
 * resulting `assert` list the non-rubric assertions come first, followed by
 * the rubric ones.
 *
 * @param scenario - Scenario to translate.
 * @param options - Generation settings (not read by this function).
 * @returns The promptfoo test with its vars, assertions and metadata.
 */
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
  const declared = scenario.assertions;

  const rubricChecks = declared
    .filter(a => a.type === 'llm-rubric')
    .map(a => {
      const { rubric, passingThreshold } = a as any;
      return {
        type: 'llm-rubric' as const,
        value: rubric,
        threshold: passingThreshold ?? 0.7,
      };
    });

  const codeChecks = declared
    .filter(a => a.type !== 'llm-rubric')
    .map(a => buildAssertion(a));

  const { target, input } = scenario;
  const vars = {
    scenario_id: scenario.id,
    target_module: target.module,
    target_function: target.function,
    input_args: input.args,
    input_kwargs: input.kwargs,
  };

  const metadata = {
    category: scenario.category,
    priority: scenario.priority,
    tags: scenario.tags,
  };

  return {
    description: scenario.description,
    vars,
    assert: [...codeChecks, ...rubricChecks],
    metadata,
  };
}
|
|
|
|
/**
 * Renders a value as a single-quoted Python string literal.
 *
 * Backslashes, single quotes and line breaks are escaped so the value can be
 * embedded in generated Python source without terminating the literal early.
 */
function toPythonStringLiteral(value: unknown): string {
  const escaped = String(value)
    .replace(/\\/g, '\\\\')
    .replace(/'/g, "\\'")
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r');
  return `'${escaped}'`;
}

/**
 * Translates a single scenario assertion into its promptfoo equivalent.
 *
 * `equals`, `contains` and `matches` map to the promptfoo `equals`,
 * `contains` and `regex` assertion types; every other recognised type is
 * emitted as a `python` assertion whose value is a Python expression.
 *
 * NOTE(review): the `throws` expression calls `output.get(...)`, i.e. it
 * treats `output` as a dict, while the provider script generated in this file
 * returns `json.dumps(output)` (a string) — confirm what promptfoo passes to
 * Python assertions.
 *
 * @param assertion - Assertion object; only `type` plus the field relevant to
 *   that type (`expected`, `value`, `pattern`, `errorType`, `check`) is read.
 * @returns The promptfoo assertion. Unrecognised types yield an always-true
 *   Python assertion rather than an error.
 */
function buildAssertion(assertion: any): PromptfooAssertion {
  switch (assertion.type) {
    case 'equals':
      return {
        type: 'equals',
        value: assertion.expected,
      };

    case 'contains':
      return {
        type: 'contains',
        value: assertion.value,
      };

    case 'matches':
      return {
        type: 'regex',
        value: assertion.pattern,
      };

    case 'typeof':
      return {
        type: 'python',
        // Escaped so a quote or backslash in `expected` cannot break out of the literal.
        value: `type(output).__name__ == ${toPythonStringLiteral(assertion.expected)}`,
      };

    case 'throws':
      return {
        type: 'python',
        // `||` is deliberate: an empty errorType also falls back to 'Error',
        // because an empty needle would match every error string.
        value: `${toPythonStringLiteral(assertion.errorType || 'Error')} in str(output.get('error', ''))`,
      };

    case 'truthy':
      return {
        type: 'python',
        value: 'bool(output)',
      };

    case 'falsy':
      return {
        type: 'python',
        value: 'not bool(output)',
      };

    case 'custom':
      return {
        type: 'python',
        value: assertion.check,
      };

    default:
      // Unknown assertion types degrade to a check that always passes.
      return {
        type: 'python',
        value: 'True',
      };
  }
}
|
|
|
|
/**
 * Writes the Python promptfoo provider script to `outputPath`.
 *
 * The generated script exposes `get_provider_response`, which runs the test
 * suite for the configured framework (`pytest`, `vitest` or `jest`) filtered
 * by the scenario id received as the prompt, and returns the pass/fail counts
 * as a JSON string in the `output` field.
 *
 * Differences from a naive runner that matter to maintainers:
 * - The pytest JSON report is written to a per-invocation temporary directory
 *   so a stale or concurrently written report is never read.
 * - jest is invoked with `--json`; the `run` subcommand and `--reporter=json`
 *   flag are only used for vitest.
 *
 * The parent directory of `outputPath` is created (recursively) if needed.
 *
 * @param outputPath - Destination file for the provider script.
 */
export async function generateTestProvider(outputPath: string): Promise<void> {
  const providerCode = `#!/usr/bin/env python3
"""Promptfoo provider that executes tests and returns structured results."""

import subprocess
import json
import sys
import os
import tempfile

def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
    """Runs tests and returns structured results."""

    test_dir = options.get('config', {}).get('test_dir', './tests')
    framework = options.get('config', {}).get('framework', 'pytest')
    timeout = options.get('config', {}).get('timeout', 300)

    scenario_id = prompt.strip()

    try:
        if framework == 'pytest':
            # A fresh directory per run: never read a report left by another run.
            with tempfile.TemporaryDirectory() as report_dir:
                report_path = os.path.join(report_dir, 'pytest_results.json')
                result = subprocess.run(
                    [
                        'python', '-m', 'pytest',
                        '--json-report',
                        '--json-report-file=' + report_path,
                        '-k', scenario_id,
                        test_dir
                    ],
                    capture_output=True,
                    text=True,
                    timeout=timeout
                )

                try:
                    with open(report_path) as f:
                        report = json.load(f)

                    output = {
                        'passed': report.get('summary', {}).get('passed', 0),
                        'failed': report.get('summary', {}).get('failed', 0),
                        'skipped': report.get('summary', {}).get('skipped', 0),
                        'tests': report.get('tests', []),
                        'stdout': result.stdout,
                        'stderr': result.stderr,
                        'exit_code': result.returncode,
                    }
                except FileNotFoundError:
                    output = {
                        'passed': 0,
                        'failed': 1,
                        'error': 'Failed to generate pytest report',
                        'stdout': result.stdout,
                        'stderr': result.stderr,
                    }

        elif framework in ('vitest', 'jest'):
            if framework == 'jest':
                # jest has no 'run' subcommand or '--reporter' flag; '--json' prints the JSON report.
                cmd = ['npx', 'jest', '--json']
            else:
                cmd = ['npx', 'vitest', 'run', '--reporter=json']
            if scenario_id:
                cmd.extend(['--testNamePattern', scenario_id])
            cmd.append(test_dir)

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )

            try:
                report = json.loads(result.stdout)
                output = {
                    'passed': report.get('numPassedTests', 0),
                    'failed': report.get('numFailedTests', 0),
                    'skipped': report.get('numSkippedTests', 0),
                    'tests': report.get('testResults', []),
                    'exit_code': result.returncode,
                }
            except json.JSONDecodeError:
                output = {
                    'passed': 0,
                    'failed': 1,
                    'error': 'Failed to parse test output',
                    'stdout': result.stdout,
                    'stderr': result.stderr,
                }
        else:
            output = {'error': f'Unknown framework: {framework}'}

        return {
            'output': json.dumps(output),
            'error': None,
        }

    except subprocess.TimeoutExpired:
        return {
            'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
            'error': None,
        }
    except Exception as e:
        return {
            'output': None,
            'error': str(e),
        }

if __name__ == '__main__':
    # For testing the provider directly
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scenario', default='')
    parser.add_argument('--test-dir', default='./tests')
    parser.add_argument('--framework', default='pytest')
    args = parser.parse_args()

    result = get_provider_response(
        args.scenario,
        {'config': {'test_dir': args.test_dir, 'framework': args.framework}},
        {}
    )
    print(json.dumps(result, indent=2))
`;

  await mkdir(dirname(outputPath), { recursive: true });
  await writeFile(outputPath, providerCode);
}
|