diff --git a/rubrics/code-quality.yaml b/rubrics/code-quality.yaml
new file mode 100644
index 0000000..5af8b08
--- /dev/null
+++ b/rubrics/code-quality.yaml
@@ -0,0 +1,32 @@
+name: code-quality
+description: Evaluates generated code for quality and maintainability
+passingThreshold: 0.7
+
+criteria:
+ - name: readability
+ weight: 0.3
+ description: Code is easy to read and understand
+ examples:
+ good: "Clear variable names, logical flow, proper indentation"
+ bad: "Single-letter variables, deeply nested logic, inconsistent style"
+
+ - name: correctness
+ weight: 0.4
+ description: Code correctly implements the intended behavior
+ examples:
+ good: "Handles edge cases, correct algorithm, proper error handling"
+ bad: "Missing edge cases, off-by-one errors, swallowed exceptions"
+
+ - name: efficiency
+ weight: 0.2
+ description: Code uses appropriate data structures and algorithms
+ examples:
+ good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
+ bad: "O(n²) when O(n) is possible, creates objects in tight loops"
+
+ - name: maintainability
+ weight: 0.1
+ description: Code is easy to modify and extend
+ examples:
+ good: "Single responsibility, low coupling, clear interfaces"
+ bad: "God functions, tight coupling, magic numbers"
diff --git a/rubrics/documentation.yaml b/rubrics/documentation.yaml
new file mode 100644
index 0000000..7089c96
--- /dev/null
+++ b/rubrics/documentation.yaml
@@ -0,0 +1,32 @@
+name: documentation
+description: Evaluates quality of code documentation and docstrings
+passingThreshold: 0.65
+
+criteria:
+ - name: completeness
+ weight: 0.35
+ description: Documentation covers all parameters, return values, and exceptions
+ examples:
+ good: "Fully documents args, returns, raises, and includes usage example"
+ bad: "Missing parameter descriptions or return type"
+
+ - name: accuracy
+ weight: 0.35
+ description: Documentation accurately describes the function's behavior
+ examples:
+ good: "Description matches implementation, types are correct"
+ bad: "Outdated docs that don't match current behavior"
+
+ - name: examples
+ weight: 0.2
+ description: Includes helpful usage examples
+ examples:
+ good: "Shows common use cases with expected outputs"
+ bad: "No examples or only trivial ones"
+
+ - name: style
+ weight: 0.1
+ description: Follows project/language documentation conventions
+ examples:
+ good: "Uses standard docstring format (Google, NumPy, or reStructuredText)"
+ bad: "Inconsistent or non-standard format"
diff --git a/rubrics/error-messages.yaml b/rubrics/error-messages.yaml
new file mode 100644
index 0000000..3e8dace
--- /dev/null
+++ b/rubrics/error-messages.yaml
@@ -0,0 +1,25 @@
+name: error-messages
+description: Evaluates quality of error messages
+passingThreshold: 0.6
+
+criteria:
+ - name: clarity
+ weight: 0.4
+ description: Error message clearly explains what went wrong
+ examples:
+ good: "Invalid email format: 'not-an-email' is missing '@' symbol"
+ bad: "Error: validation failed"
+
+ - name: actionability
+ weight: 0.4
+ description: Error message suggests how to fix the problem
+ examples:
+ good: "File not found. Create the file or check the path spelling."
+ bad: "ENOENT"
+
+ - name: context
+ weight: 0.2
+ description: Error message includes relevant context (file, line, values)
+ examples:
+ good: "TypeError at line 42 in auth.py: expected str, got int (value=123)"
+ bad: "type error"
diff --git a/src/cli/commands/pipeline.ts b/src/cli/commands/pipeline.ts
new file mode 100644
index 0000000..d1e7fa4
--- /dev/null
+++ b/src/cli/commands/pipeline.ts
@@ -0,0 +1,257 @@
+import { Command } from 'commander';
+import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
+import { join, resolve } from 'path';
+import { analyze } from '../../introspector/index.js';
+import { generateEvalSpec, generateEvalSpecInteractive } from '../../analyzer/index.js';
+import { renderSpec, detectFramework as detectRenderFramework } from '../../renderers/index.js';
+import { runTests, formatResults, DEFAULT_SANDBOX_CONFIG } from '../../runners/index.js';
+import { createTracer, saveTrace } from '../../observability/index.js';
+import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
+import type { EvalSpec } from '../../analyzer/types.js';
+
+const EVALUCLAUDE_DIR = '.evaluclaude';
+
+interface PipelineOptions {
+ output?: string;
+ interactive?: boolean;
+ focus?: string;
+ maxScenarios: string;
+ testDir: string;
+ framework?: string;
+ skipAnalyze?: boolean;
+ skipRender?: boolean;
+ skipRun?: boolean;
+ promptfoo?: boolean;
+ quiet?: boolean;
+}
+
+export const pipelineCommand = new Command('pipeline')
+ .description('Run the full eval generation pipeline: introspect → analyze → render → run')
+ .argument('[path]', 'Path to the repository to analyze', '.')
+ .option('-o, --output <dir>', 'Output directory for all artifacts', '.evaluclaude')
+ .option('-i, --interactive', 'Enable interactive mode with clarifying questions')
+ .option('--focus <modules>', 'Comma-separated list of modules/functions to focus on')
+ .option('--max-scenarios <n>', 'Maximum number of test scenarios to generate', '10')
+ .option('--test-dir <dir>', 'Directory for generated tests', './tests/generated')
+ .option('-f, --framework <name>', 'Test framework (pytest, vitest, jest)')
+ .option('--skip-analyze', 'Skip analysis, use existing spec')
+ .option('--skip-render', 'Skip rendering, use existing tests')
+ .option('--skip-run', 'Skip test execution')
+ .option('--promptfoo', 'Generate Promptfoo configuration for UI viewing')
+ .option('--quiet', 'Suppress progress messages')
+ .action(async (repoPath: string, options: PipelineOptions) => {
+ const absolutePath = resolve(repoPath);
+ const log = options.quiet ? () => {} : console.log;
+ const outputDir = options.output || EVALUCLAUDE_DIR;
+
+ console.log('\n🚀 Evaluclaude Pipeline');
+ console.log('─'.repeat(50));
+ console.log(` Repository: ${absolutePath}`);
+ console.log(` Output: ${outputDir}`);
+ console.log('─'.repeat(50) + '\n');
+
+ // Ensure output directories exist
+ mkdirSync(outputDir, { recursive: true });
+ mkdirSync(options.testDir, { recursive: true });
+
+ const specPath = join(outputDir, 'spec.json');
+ const tracesDir = join(outputDir, 'traces');
+ const resultsDir = join(outputDir, 'results');
+
+ mkdirSync(tracesDir, { recursive: true });
+ mkdirSync(resultsDir, { recursive: true });
+
+ let spec: EvalSpec;
+
+ // Step 1: Introspection + Analysis
+ if (options.skipAnalyze && existsSync(specPath)) {
+ log('📋 Using existing EvalSpec...');
+ spec = JSON.parse(readFileSync(specPath, 'utf-8'));
+ log(` Loaded: ${specPath} (${spec.scenarios.length} scenarios)\n`);
+ } else {
+ log('🔬 Step 1: Introspecting codebase...');
+
+ try {
+ const repoSummary = await analyze({
+ root: absolutePath,
+ onProgress: options.quiet ? undefined : (msg) => log(` ${msg}`),
+ });
+
+ log(` Files: ${repoSummary.files.length}`);
+ log(` Languages: ${repoSummary.languages.join(', ')}`);
+ log('');
+
+ log('🤖 Step 2: Generating EvalSpec with Claude...\n');
+
+ const focus = options.focus?.split(',').map(s => s.trim());
+ const maxScenarios = parseInt(options.maxScenarios, 10);
+
+ let result;
+ if (options.interactive) {
+ const { default: inquirer } = await import('inquirer');
+
+ result = await generateEvalSpecInteractive(
+ repoSummary,
+ async (question: string) => {
+ const { answer } = await inquirer.prompt([{
+ type: 'input',
+ name: 'answer',
+ message: `🤔 Claude asks: ${question}`,
+ }]);
+ return answer;
+ },
+ { focus, maxScenarios }
+ );
+ } else {
+ result = await generateEvalSpec(repoSummary, {
+ interactive: false,
+ focus,
+ maxScenarios,
+ });
+ }
+
+ spec = result.spec;
+
+ // Save the spec
+ writeFileSync(specPath, JSON.stringify(spec, null, 2));
+
+ log(`\n✅ EvalSpec generated!`);
+ log(` Scenarios: ${spec.scenarios.length}`);
+ log(` Tokens: ${result.tokensUsed}`);
+ log(` Saved: ${specPath}\n`);
+ } catch (error) {
+ console.error('\n❌ Analysis failed:', error instanceof Error ? error.message : error);
+ process.exit(1);
+ }
+ }
+
+ // Step 2: Render tests
+ if (!options.skipRender) {
+ log('📝 Step 3: Rendering test files...');
+
+ try {
+ const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
+
+ const renderResult = await renderSpec(spec, {
+ outputDir: options.testDir,
+ framework,
+ includeFixtures: true,
+ generateMocks: true,
+ dryRun: false,
+ });
+
+ log(` Framework: ${framework}`);
+ log(` Files: ${renderResult.stats.fileCount}`);
+ log(` Scenarios: ${renderResult.stats.scenarioCount}`);
+ log(` Assertions: ${renderResult.stats.assertionCount}`);
+ log(` Output: ${options.testDir}\n`);
+ } catch (error) {
+ console.error('\n❌ Rendering failed:', error instanceof Error ? error.message : error);
+ process.exit(1);
+ }
+ }
+
+ // Step 3: Run tests
+ if (!options.skipRun) {
+ log('🧪 Step 4: Running tests...\n');
+
+ try {
+ const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
+ const tracer = createTracer(spec.repo.name);
+
+ tracer.recordIntrospection({
+ filesAnalyzed: spec.scenarios.map(s => s.target.module),
+ totalFunctions: spec.scenarios.length,
+ duration: 0,
+ });
+
+ tracer.recordGeneration({
+ scenariosGenerated: spec.scenarios.length,
+ filesWritten: [options.testDir],
+ });
+
+ const result = await runTests(
+ options.testDir,
+ {
+ framework,
+ sandbox: true,
+ timeout: 300000,
+ parallel: false,
+ cwd: process.cwd(),
+ },
+ DEFAULT_SANDBOX_CONFIG
+ );
+
+ tracer.recordExecution({
+ testsPassed: result.summary.passed,
+ testsFailed: result.summary.failed,
+ testsSkipped: result.summary.skipped,
+ });
+
+ for (const test of result.tests) {
+ if (test.status === 'failed' || test.status === 'error') {
+ tracer.recordTestFailure({
+ scenarioId: test.id,
+ testName: test.name,
+ error: test.error?.message || 'Unknown error',
+ stack: test.error?.stack,
+ });
+ }
+ }
+
+ const trace = tracer.finalize();
+ const tracePath = await saveTrace(trace);
+
+ log(formatResults(result));
+ log(`📊 Trace saved: ${tracePath}`);
+ log(` View with: evaluclaude view ${trace.id}\n`);
+
+ // Save results
+ const resultsPath = join(resultsDir, `run-${Date.now()}.json`);
+ writeFileSync(resultsPath, JSON.stringify(result, null, 2));
+
+ } catch (error) {
+ console.error('\n❌ Test execution failed:', error instanceof Error ? error.message : error);
+ process.exit(1);
+ }
+ }
+
+ // Step 4: Generate Promptfoo config
+ if (options.promptfoo) {
+ log('📦 Step 5: Generating Promptfoo configuration...');
+
+ try {
+ const configPath = join(outputDir, 'promptfooconfig.yaml');
+ const providerPath = join(outputDir, 'providers', 'test-runner.py');
+ const framework = (options.framework as 'pytest' | 'vitest' | 'jest') || detectRenderFramework(spec);
+
+ await generatePromptfooConfig(spec, {
+ testDir: options.testDir,
+ outputPath: configPath,
+ framework,
+ includeTraceLinks: true,
+ providerPath,
+ });
+
+ await generateTestProvider(providerPath);
+
+ log(` Config: ${configPath}`);
+ log(` Provider: ${providerPath}`);
+ log(`\n Launch UI with: evaluclaude ui\n`);
+ } catch (error) {
+ console.error('\n❌ Promptfoo config generation failed:', error instanceof Error ? error.message : error);
+ }
+ }
+
+ console.log('─'.repeat(50));
+ console.log('✅ Pipeline complete!');
+ console.log('─'.repeat(50));
+ console.log(`\nNext steps:`);
+ console.log(` View traces: evaluclaude view --last`);
+ console.log(` List all traces: evaluclaude traces`);
+ if (options.promptfoo) {
+ console.log(` Launch UI: evaluclaude ui`);
+ console.log(` Run Promptfoo: evaluclaude eval --spec ${specPath}`);
+ }
+ console.log('');
+ });
diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts
index 28f372f..a20c01e 100644
--- a/src/cli/commands/run.ts
+++ b/src/cli/commands/run.ts
@@ -10,6 +10,7 @@ import {
DEFAULT_SANDBOX_CONFIG
} from '../../runners/index.js';
import { createTracer, saveTrace } from '../../observability/index.js';
+import { exportToPromptfooFormat } from '../../promptfoo/results-exporter.js';
import type { EvalSpec } from '../../analyzer/types.js';
export const runCommand = new Command('run')
@@ -25,6 +26,7 @@ export const runCommand = new Command('run')
.option('-o, --output <file>', 'Output results to JSON file')
.option('--trace', 'Record execution trace', true)
.option('--no-trace', 'Disable execution tracing')
+ .option('--export-promptfoo', 'Export results in Promptfoo format', false)
.option('-w, --watch', 'Watch mode (rerun on changes)', false)
.action(async (testDir: string, options) => {
try {
@@ -109,6 +111,16 @@ export const runCommand = new Command('run')
console.log(`\n📁 Results saved to: ${options.output}`);
}
+ // Export to Promptfoo format for UI viewing
+ if (options.exportPromptfoo) {
+ const exportPath = await exportToPromptfooFormat(result, spec, {
+ outputDir: '.evaluclaude/results',
+ evalId: `eval-${Date.now()}`,
+ });
+ console.log(`\n📦 Promptfoo results exported: ${exportPath}`);
+ console.log(` View with: evaluclaude ui`);
+ }
+
if (tracer) {
const trace = tracer.finalize();
const tracePath = await saveTrace(trace);
diff --git a/src/cli/commands/ui.ts b/src/cli/commands/ui.ts
index 9be91eb..a775225 100644
--- a/src/cli/commands/ui.ts
+++ b/src/cli/commands/ui.ts
@@ -1,7 +1,7 @@
import { Command } from 'commander';
-import { spawn, type ChildProcess } from 'child_process';
+import { spawn } from 'child_process';
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
-import { join, dirname } from 'path';
+import { join, dirname, resolve as resolvePath } from 'path';
import type { EvalSpec } from '../../analyzer/types.js';
import { generatePromptfooConfig, generateTestProvider } from '../../promptfoo/index.js';
@@ -21,6 +21,7 @@ export const uiCommand = new Command('ui')
const configPath = join(EVALUCLAUDE_DIR, CONFIG_FILE);
const providerPath = join(EVALUCLAUDE_DIR, PROVIDERS_DIR, 'test-runner.py');
+ // If spec provided with --generate, create/update Promptfoo config
if (options.spec && options.generate) {
console.log('\n๐ Generating Promptfoo configuration...');
@@ -36,6 +37,7 @@ export const uiCommand = new Command('ui')
outputPath: configPath,
framework: detectFramework(spec),
includeTraceLinks: true,
+ providerPath: providerPath,
});
await generateTestProvider(providerPath);
@@ -44,20 +46,31 @@ export const uiCommand = new Command('ui')
console.log(` Provider: ${providerPath}`);
}
+ // Check for existing config, create default if missing
if (!existsSync(configPath)) {
console.log('\n⚠️ No Promptfoo config found.');
- console.log(' Run with --spec --generate to create one.\n');
- console.log(' Or create one manually:');
- console.log(` ${configPath}\n`);
+ console.log(' Creating default configuration...\n');
await createDefaultConfig(configPath, providerPath);
- console.log(` Created default config at ${configPath}`);
+ console.log(` Created: ${configPath}`);
+ }
+
+ // Check for results to display
+ const resultsDir = join(EVALUCLAUDE_DIR, 'results');
+ const latestResults = join(resultsDir, 'latest.json');
+
+ if (!existsSync(latestResults)) {
+ console.log('\n⚠️ No evaluation results found.');
+ console.log(' Run `evaluclaude run --export-promptfoo` first to generate results.\n');
+ console.log(' Or run the full pipeline:');
+ console.log(' evaluclaude pipeline --promptfoo\n');
}
console.log(`\n🚀 Starting Promptfoo UI on port ${port}...`);
- console.log(` Config: ${configPath}\n`);
+ console.log(` Results: ${latestResults}\n`);
- await launchPromptfooUI(port, configPath, options.open);
+ // Use promptfoo view with the results file
+ await launchPromptfooView(port, latestResults, options.open);
} catch (error) {
console.error('Error launching UI:', error instanceof Error ? error.message : error);
process.exit(1);
@@ -71,12 +84,21 @@ export const evalCommand = new Command('eval')
.option('-o, --output