mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-17 19:03:55 +00:00
promptfoo ui and testcon
This commit is contained in:
parent
e0c36241b0
commit
6698c12e5b
18 changed files with 2169 additions and 0 deletions
271
src/promptfoo/config-generator.ts
Normal file
271
src/promptfoo/config-generator.ts
Normal file
|
|
@ -0,0 +1,271 @@
|
|||
import { writeFile, mkdir } from 'fs/promises';
|
||||
import { dirname, join } from 'path';
|
||||
import * as yaml from 'js-yaml';
|
||||
import type { EvalSpec, EvalScenario } from '../analyzer/types.js';
|
||||
import type { PromptfooConfig, PromptfooTest, PromptfooAssertion } from './types.js';
|
||||
|
||||
export interface ConfigOptions {
|
||||
testDir: string;
|
||||
outputPath: string;
|
||||
framework: 'pytest' | 'vitest' | 'jest';
|
||||
includeTraceLinks: boolean;
|
||||
}
|
||||
|
||||
export async function generatePromptfooConfig(
|
||||
spec: EvalSpec,
|
||||
options: ConfigOptions
|
||||
): Promise<string> {
|
||||
const config = buildConfig(spec, options);
|
||||
const yamlContent = yaml.dump(config, {
|
||||
lineWidth: 120,
|
||||
quotingType: '"',
|
||||
});
|
||||
|
||||
await mkdir(dirname(options.outputPath), { recursive: true });
|
||||
await writeFile(options.outputPath, yamlContent);
|
||||
|
||||
return yamlContent;
|
||||
}
|
||||
|
||||
function buildConfig(spec: EvalSpec, options: ConfigOptions): PromptfooConfig {
|
||||
const tests = spec.scenarios.map(scenario => buildTest(scenario, options));
|
||||
|
||||
return {
|
||||
description: `Evaluclaude functional tests for ${spec.repo.name}`,
|
||||
providers: [
|
||||
{
|
||||
id: `file://providers/test-runner.py`,
|
||||
label: 'functional-tests',
|
||||
config: {
|
||||
test_dir: options.testDir,
|
||||
framework: options.framework,
|
||||
timeout: 300,
|
||||
},
|
||||
},
|
||||
],
|
||||
prompts: ['{{scenario_id}}'],
|
||||
tests,
|
||||
defaultTest: options.includeTraceLinks
|
||||
? {
|
||||
metadata: {
|
||||
traceFile: '.evaluclaude/traces/{{evalId}}.json',
|
||||
},
|
||||
}
|
||||
: undefined,
|
||||
outputPath: '.evaluclaude/results/promptfoo-results.json',
|
||||
};
|
||||
}
|
||||
|
||||
function buildTest(scenario: EvalScenario, options: ConfigOptions): PromptfooTest {
|
||||
const assertions = scenario.assertions
|
||||
.filter(a => a.type !== 'llm-rubric')
|
||||
.map(a => buildAssertion(a));
|
||||
|
||||
const llmRubrics = scenario.assertions
|
||||
.filter(a => a.type === 'llm-rubric')
|
||||
.map(a => ({
|
||||
type: 'llm-rubric' as const,
|
||||
value: (a as any).rubric,
|
||||
threshold: (a as any).passingThreshold ?? 0.7,
|
||||
}));
|
||||
|
||||
return {
|
||||
description: scenario.description,
|
||||
vars: {
|
||||
scenario_id: scenario.id,
|
||||
target_module: scenario.target.module,
|
||||
target_function: scenario.target.function,
|
||||
input_args: scenario.input.args,
|
||||
input_kwargs: scenario.input.kwargs,
|
||||
},
|
||||
assert: [...assertions, ...llmRubrics],
|
||||
metadata: {
|
||||
category: scenario.category,
|
||||
priority: scenario.priority,
|
||||
tags: scenario.tags,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
function buildAssertion(assertion: any): PromptfooAssertion {
|
||||
switch (assertion.type) {
|
||||
case 'equals':
|
||||
return {
|
||||
type: 'equals',
|
||||
value: assertion.expected,
|
||||
};
|
||||
|
||||
case 'contains':
|
||||
return {
|
||||
type: 'contains',
|
||||
value: assertion.value,
|
||||
};
|
||||
|
||||
case 'matches':
|
||||
return {
|
||||
type: 'regex',
|
||||
value: assertion.pattern,
|
||||
};
|
||||
|
||||
case 'typeof':
|
||||
return {
|
||||
type: 'python',
|
||||
value: `type(output).__name__ == '${assertion.expected}'`,
|
||||
};
|
||||
|
||||
case 'throws':
|
||||
return {
|
||||
type: 'python',
|
||||
value: `'${assertion.errorType || 'Error'}' in str(output.get('error', ''))`,
|
||||
};
|
||||
|
||||
case 'truthy':
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'bool(output)',
|
||||
};
|
||||
|
||||
case 'falsy':
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'not bool(output)',
|
||||
};
|
||||
|
||||
case 'custom':
|
||||
return {
|
||||
type: 'python',
|
||||
value: assertion.check,
|
||||
};
|
||||
|
||||
default:
|
||||
return {
|
||||
type: 'python',
|
||||
value: 'True',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
export async function generateTestProvider(outputPath: string): Promise<void> {
|
||||
const providerCode = `#!/usr/bin/env python3
|
||||
"""Promptfoo provider that executes tests and returns structured results."""
|
||||
|
||||
import subprocess
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
|
||||
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
|
||||
"""Runs tests and returns structured results."""
|
||||
|
||||
test_dir = options.get('config', {}).get('test_dir', './tests')
|
||||
framework = options.get('config', {}).get('framework', 'pytest')
|
||||
timeout = options.get('config', {}).get('timeout', 300)
|
||||
|
||||
scenario_id = prompt.strip()
|
||||
|
||||
try:
|
||||
if framework == 'pytest':
|
||||
result = subprocess.run(
|
||||
[
|
||||
'python', '-m', 'pytest',
|
||||
'--json-report',
|
||||
'--json-report-file=/tmp/pytest_results.json',
|
||||
'-k', scenario_id,
|
||||
test_dir
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
try:
|
||||
with open('/tmp/pytest_results.json') as f:
|
||||
report = json.load(f)
|
||||
|
||||
output = {
|
||||
'passed': report.get('summary', {}).get('passed', 0),
|
||||
'failed': report.get('summary', {}).get('failed', 0),
|
||||
'skipped': report.get('summary', {}).get('skipped', 0),
|
||||
'tests': report.get('tests', []),
|
||||
'stdout': result.stdout,
|
||||
'stderr': result.stderr,
|
||||
'exit_code': result.returncode,
|
||||
}
|
||||
except FileNotFoundError:
|
||||
output = {
|
||||
'passed': 0,
|
||||
'failed': 1,
|
||||
'error': 'Failed to generate pytest report',
|
||||
'stdout': result.stdout,
|
||||
'stderr': result.stderr,
|
||||
}
|
||||
|
||||
elif framework in ('vitest', 'jest'):
|
||||
cmd = ['npx', framework, 'run', '--reporter=json']
|
||||
if scenario_id:
|
||||
cmd.extend(['--testNamePattern', scenario_id])
|
||||
cmd.append(test_dir)
|
||||
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
try:
|
||||
report = json.loads(result.stdout)
|
||||
output = {
|
||||
'passed': report.get('numPassedTests', 0),
|
||||
'failed': report.get('numFailedTests', 0),
|
||||
'skipped': report.get('numSkippedTests', 0),
|
||||
'tests': report.get('testResults', []),
|
||||
'exit_code': result.returncode,
|
||||
}
|
||||
except json.JSONDecodeError:
|
||||
output = {
|
||||
'passed': 0,
|
||||
'failed': 1,
|
||||
'error': 'Failed to parse test output',
|
||||
'stdout': result.stdout,
|
||||
'stderr': result.stderr,
|
||||
}
|
||||
else:
|
||||
output = {'error': f'Unknown framework: {framework}'}
|
||||
|
||||
return {
|
||||
'output': json.dumps(output),
|
||||
'error': None,
|
||||
}
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return {
|
||||
'output': json.dumps({'error': 'Test execution timed out', 'passed': 0, 'failed': 1}),
|
||||
'error': None,
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'output': None,
|
||||
'error': str(e),
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
# For testing the provider directly
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--scenario', default='')
|
||||
parser.add_argument('--test-dir', default='./tests')
|
||||
parser.add_argument('--framework', default='pytest')
|
||||
args = parser.parse_args()
|
||||
|
||||
result = get_provider_response(
|
||||
args.scenario,
|
||||
{'config': {'test_dir': args.test_dir, 'framework': args.framework}},
|
||||
{}
|
||||
)
|
||||
print(json.dumps(result, indent=2))
|
||||
`;
|
||||
|
||||
await mkdir(dirname(outputPath), { recursive: true });
|
||||
await writeFile(outputPath, providerCode);
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue