/**
 * Export test execution results to Promptfoo format for viewing in the UI.
 *
 * Instead of using Promptfoo to run tests (which requires a provider that
 * responds quickly), we run tests ourselves and export the results in
 * Promptfoo's result format. This lets us use Promptfoo's excellent
 * visualization UI.
 */

import { writeFile, mkdir } from 'fs/promises';
import { join } from 'path';
import type { ExecutionResult } from '../runners/types.js';
import type { EvalSpec } from '../analyzer/types.js';
import type { PromptfooResult, PromptfooTestResult } from './types.js';

export interface ExportOptions {
  /** Directory that result JSON files are written into. */
  outputDir: string;
  /** Identifier for this run; defaults to `eval-<timestamp>`. */
  evalId?: string;
  /** Embed the originating EvalSpec in the export (not read by this module). */
  includeSpec?: boolean;
}

/**
 * Export an ExecutionResult to Promptfoo's result format.
 *
 * Writes `<evalId>.json` into `outputDir` along with a `latest.json` copy,
 * and returns the path of the `<evalId>.json` file.
 */
export async function exportToPromptfooFormat(
  result: ExecutionResult,
  spec: EvalSpec | undefined,
  options: ExportOptions
): Promise<string> {
  const { outputDir, evalId = `eval-${Date.now()}` } = options;

  const promptfooResult = buildPromptfooResult(result, spec, evalId);

  await mkdir(outputDir, { recursive: true });
  const outputPath = join(outputDir, `${evalId}.json`);
  await writeFile(outputPath, JSON.stringify(promptfooResult, null, 2));

  // Also write a latest.json copy (symlink equivalent) so the viewer can
  // always find the most recent run.
  const latestPath = join(outputDir, 'latest.json');
  await writeFile(latestPath, JSON.stringify(promptfooResult, null, 2));

  return outputPath;
}
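
// Example usage (a sketch — `result` and `spec` are produced elsewhere in the
// harness; `.evaluclaude/results` matches the outputPath used by the
// view-only config below):
//
//   const path = await exportToPromptfooFormat(result, spec, {
//     outputDir: '.evaluclaude/results',
//   });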

/**
 * Map an ExecutionResult (plus optional spec metadata) to a PromptfooResult.
 */
function buildPromptfooResult(
  result: ExecutionResult,
  spec: EvalSpec | undefined,
  evalId: string // not embedded in the payload; evalId keys the output filename
): PromptfooResult {
  const testResults: PromptfooTestResult[] = result.tests.map(test => {
    // Try to find the matching scenario from the spec, by exact id or by the
    // scenario id appearing in the test name.
    const scenario = spec?.scenarios.find(s =>
      s.id === test.id || test.name.includes(s.id)
    );

    return {
      prompt: {
        raw: scenario?.id || test.id,
        label: scenario?.name || test.name,
      },
      vars: {
        scenario_id: scenario?.id || test.id,
        target_module: scenario?.target.module || '',
        target_function: scenario?.target.function || '',
        description: scenario?.description || test.name,
      },
      response: {
        output: test.status === 'passed'
          ? 'Test passed successfully'
          : test.error?.message || 'Test failed',
      },
      gradingResult: {
        pass: test.status === 'passed',
        score: test.status === 'passed' ? 1 : 0,
        reason: test.status === 'passed'
          ? 'All assertions passed'
          : test.error?.message || 'Test failed',
        // Surface each assertion as a component result so the UI can show
        // per-assertion pass/fail detail.
        componentResults: test.assertions.details.map(a => ({
          pass: a.passed,
          score: a.passed ? 1 : 0,
          reason: a.description,
          assertion: {
            type: 'custom',
            value: a.description,
          },
        })),
      },
      success: test.status === 'passed',
      error: test.error?.message,
    };
  });

  return {
    version: 1,
    timestamp: new Date().toISOString(),
    results: testResults,
    stats: {
      successes: result.summary.passed,
      failures: result.summary.failed,
      // Tests run locally rather than through an LLM provider, so there is
      // no token usage to report.
      tokenUsage: {
        total: 0,
        prompt: 0,
        completion: 0,
      },
    },
  };
}
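
// Shape of the exported file (a trimmed sketch of a single passing test, with
// hypothetical values; assumes PromptfooResult/PromptfooTestResult in
// ./types.js mirror the fields built above):
//
//   {
//     "version": 1,
//     "timestamp": "2026-01-01T00:00:00.000Z",
//     "results": [{
//       "prompt": { "raw": "scenario-1", "label": "Scenario 1" },
//       "vars": { "scenario_id": "scenario-1", ... },
//       "response": { "output": "Test passed successfully" },
//       "gradingResult": { "pass": true, "score": 1, "reason": "All assertions passed", ... },
//       "success": true
//     }],
//     "stats": { "successes": 1, "failures": 0, "tokenUsage": { ... } }
//   }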

/**
 * Generate a minimal Promptfoo config that just views results (no provider).
 */
export function generateViewOnlyConfig(spec: EvalSpec): string {
  return `# Evaluclaude Results Config
# This config is for viewing results only - tests are run via evaluclaude run

description: "Test results for ${spec.repo.name}"

# No providers needed - we pre-run tests and import results
providers: []

prompts: []

tests: []

# Results are stored here by evaluclaude run --export-promptfoo
outputPath: .evaluclaude/results/latest.json
`;
}
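
// Example: persist the config so results can be browsed — a sketch;
// `promptfooconfig.yaml` is Promptfoo's conventional config filename, and
// `promptfoo view` launches its local viewer UI.
//
//   await writeFile('promptfooconfig.yaml', generateViewOnlyConfig(spec));
//   // then run: npx promptfoo view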