This commit is contained in:
Harivansh Rathi 2026-01-11 17:50:26 -05:00
parent 4b24606d0e
commit 9297f0b1ee
13 changed files with 1292 additions and 16 deletions

12
src/analyzer/index.ts Normal file
View file

@ -0,0 +1,12 @@
// Public entry point for the analyzer package: re-exports the spec
// generators, their option/result types, the EvalSpec data model, and the
// prompt-building helpers so consumers import from a single module.

// Spec generation (interactive and non-interactive variants).
export { generateEvalSpec, generateEvalSpecInteractive, generateEvalSpecNonInteractive } from './spec-generator.js';
export type { GenerateResult, GenerateOptions } from './spec-generator.js';
// Core EvalSpec data model.
export type {
  EvalSpec,
  EvalScenario,
  Assertion,
  MockSpec,
  DeterministicGrade,
  RubricGrade,
} from './types.js';
// JSON Schema used to constrain the model's structured output.
export { EVAL_SPEC_JSON_SCHEMA } from './types.js';
// Prompt assembly helpers.
export { buildSystemPrompt, buildUserPrompt, optimizeForPrompt } from './prompt-builder.js';

src/analyzer/prompt-builder.ts Normal file
View file

@ -0,0 +1,112 @@
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import type { RepoSummary } from '../introspector/types.js';
// ESM has no __dirname; derive it from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Prompt templates live two directory levels above this module
// (NOTE(review): assumes compiled layout like dist/analyzer/ — confirm).
const PROMPTS_DIR = path.join(__dirname, '../../prompts');

/** Inputs for building the analyzer's user prompt. */
export interface PromptConfig {
  // Introspected summary of the repository under analysis.
  repoSummary: RepoSummary;
  // Optional list of modules/functions to focus the analysis on.
  focus?: string[];
  // Upper bound on generated scenarios; buildUserPrompt defaults it to 10.
  maxScenarios?: number;
}
/**
 * Read a prompt template (`<name>.md`) from the prompts directory.
 *
 * @param name - Template base name without the `.md` extension.
 * @returns The template's full text.
 */
export async function loadPrompt(name: string): Promise<string> {
  return fs.readFile(path.join(PROMPTS_DIR, `${name}.md`), 'utf-8');
}
/**
 * Assemble the system prompt from the `analyzer-system` and
 * `analyzer-developer` templates, joined by a blank line.
 *
 * The two files are independent, so they are loaded concurrently instead of
 * awaiting them one after the other.
 *
 * @returns The combined system prompt text.
 */
export async function buildSystemPrompt(): Promise<string> {
  const [system, developer] = await Promise.all([
    loadPrompt('analyzer-system'),
    loadPrompt('analyzer-developer'),
  ]);
  return `${system}\n\n${developer}`;
}
/**
 * Build the user prompt by filling the `analyzer-user` template's
 * placeholders with the optimized repo summary, focus instructions, and
 * scenario cap.
 *
 * Uses `replaceAll` with a function replacer for each placeholder:
 * a plain-string replacement would (a) only substitute the FIRST occurrence
 * of each placeholder and (b) interpret `$&`, `$'`, `$$`, … sequences that
 * can legitimately appear inside the JSON-encoded summary, corrupting the
 * prompt.
 *
 * @param config - Repo summary plus optional focus list and scenario cap.
 * @returns The fully substituted user prompt.
 */
export async function buildUserPrompt(config: PromptConfig): Promise<string> {
  const template = await loadPrompt('analyzer-user');
  const summaryJson = JSON.stringify(optimizeForPrompt(config.repoSummary), null, 2);
  const focusInstructions = config.focus?.length
    ? `Focus specifically on these modules/functions: ${config.focus.join(', ')}`
    : 'Analyze the entire codebase and identify the most important testable functions.';
  const maxScenarios = config.maxScenarios ?? 10;
  return template
    .replaceAll('{{REPO_SUMMARY}}', () => summaryJson)
    .replaceAll('{{FOCUS_INSTRUCTIONS}}', () => focusInstructions)
    .replaceAll('{{MAX_SCENARIOS}}', () => String(maxScenarios));
}
/**
 * Condense a RepoSummary into a compact shape suitable for embedding in a
 * prompt: drops underscore-prefixed ("private" by convention) exports, drops
 * modules left with no exports, caps imports at 10 per module, and keeps the
 * 10 most-committed files from git history.
 *
 * Pure with respect to its input: the original code called
 * `fileHistory?.sort(...)`, which sorts the caller's array in place; this
 * version sorts a copy so the summary is never mutated.
 *
 * @param summary - Full introspection result for the repository.
 * @returns A slimmed summary for prompt embedding.
 */
export function optimizeForPrompt(summary: RepoSummary): OptimizedRepoSummary {
  return {
    name: path.basename(summary.root),
    languages: summary.languages,
    analyzedAt: summary.analyzedAt,
    modules: summary.modules
      .map(m => ({
        path: m.path,
        complexity: m.complexity,
        exports: m.exports
          .map(e => ({
            name: e.name,
            kind: e.kind,
            signature: e.signature,
            docstring: e.docstring,
            line: e.lineNumber,
            async: e.isAsync,
          }))
          // Leading underscore marks an export as private: omit it.
          .filter(e => !e.name.startsWith('_')),
        imports: m.imports.slice(0, 10),
      }))
      .filter(m => m.exports.length > 0),
    config: {
      python: summary.config.python
        ? {
            testFramework: summary.config.python.testFramework,
            hasTyping: summary.config.python.hasTyping,
          }
        : undefined,
      typescript: summary.config.typescript
        ? {
            testFramework: summary.config.typescript.testFramework,
            hasTypes: summary.config.typescript.hasTypes,
          }
        : undefined,
    },
    git: summary.git
      ? {
          branch: summary.git.branch,
          // Sort a copy: Array.prototype.sort mutates in place.
          activeFiles: summary.git.fileHistory
            ? [...summary.git.fileHistory]
                .sort((a, b) => b.commitCount - a.commitCount)
                .slice(0, 10)
                .map(f => ({ path: f.path, commits: f.commitCount }))
            : undefined,
        }
      : undefined,
  };
}
/** Prompt-optimized projection of a RepoSummary (see optimizeForPrompt). */
export interface OptimizedRepoSummary {
  // Repository directory name (basename of the root path).
  name: string;
  languages: string[];
  analyzedAt: string;
  modules: OptimizedModule[];
  // Per-language tooling detected in the repo; absent languages are undefined.
  config: {
    python?: { testFramework: string; hasTyping: boolean };
    typescript?: { testFramework: string; hasTypes: boolean };
  };
  // Present only when git information was collected.
  git?: {
    branch: string;
    // Top files by commit count, most active first (max 10).
    activeFiles?: { path: string; commits: number }[];
  };
}

/** Slimmed module entry: public exports plus a capped (10) import list. */
interface OptimizedModule {
  path: string;
  complexity: string;
  exports: {
    name: string;
    kind: string;
    signature?: string;
    docstring?: string;
    line: number;
    async?: boolean;
  }[];
  imports: string[];
}

src/analyzer/spec-generator.ts Normal file
View file

@ -0,0 +1,174 @@
import { query, type SDKMessage, type Options, type CanUseTool, type PermissionResult } from '@anthropic-ai/claude-agent-sdk';
import type { RepoSummary } from '../introspector/types.js';
import type { EvalSpec } from './types.js';
import { buildSystemPrompt, buildUserPrompt } from './prompt-builder.js';
import { EVAL_SPEC_JSON_SCHEMA } from './types.js';
/** Options controlling EvalSpec generation. */
export interface GenerateOptions {
  // When true, the model may use tools and ask the user questions.
  interactive?: boolean;
  // Relays a model question to the user and resolves with the answer.
  onQuestion?: (question: string) => Promise<string>;
  // Restrict analysis to these modules/functions.
  focus?: string[];
  // Cap on the number of scenarios to generate (default 10).
  maxScenarios?: number;
}

/** Result of one generation run. */
export interface GenerateResult {
  spec: EvalSpec;
  // Input + output tokens reported by the SDK for the final result.
  tokensUsed: number;
  // How many AskUserQuestion round-trips occurred.
  questionsAsked: number;
}
/**
 * Generate an EvalSpec for a repository by prompting the Claude agent SDK.
 *
 * Builds the system and user prompts from the repo summary, runs one SDK
 * `query` constrained to EVAL_SPEC_JSON_SCHEMA, and parses the final result
 * message into an EvalSpec. In interactive mode the model gets the full tool
 * preset and may ask the user questions via `onQuestion`; otherwise no tools
 * are exposed.
 *
 * @param repoSummary - Introspected summary of the target repository.
 * @param options - Interactivity, question handler, focus filter, scenario cap.
 * @returns The parsed spec plus token usage and question count.
 * @throws If the SDK reports a non-success result or no result arrives.
 */
export async function generateEvalSpec(
  repoSummary: RepoSummary,
  options: GenerateOptions = {}
): Promise<GenerateResult> {
  const { interactive = false, onQuestion, focus, maxScenarios = 10 } = options;
  const systemPrompt = await buildSystemPrompt();
  const userPrompt = await buildUserPrompt({
    repoSummary,
    focus,
    maxScenarios,
  });
  let tokensUsed = 0;
  let questionsAsked = 0;
  let spec: EvalSpec | null = null;
  // Permission callback: intercepts AskUserQuestion so the caller can relay
  // the model's question to a human; every other tool use is allowed as-is.
  const canUseTool: CanUseTool = async (toolName, input): Promise<PermissionResult> => {
    if (toolName === 'AskUserQuestion' && interactive && onQuestion) {
      // Extract question from various possible field names
      const inputObj = input as Record<string, unknown>;
      const question = String(
        inputObj.question ||
        inputObj.text ||
        inputObj.message ||
        inputObj.prompt ||
        JSON.stringify(input)
      );
      const answer = await onQuestion(question);
      questionsAsked++;
      // NOTE(review): the answer is injected via updatedInput — assumes the
      // SDK forwards the modified input to the tool; confirm against the
      // agent SDK documentation.
      return {
        behavior: 'allow',
        updatedInput: { ...input, answer },
      };
    }
    // Allow all other tools in interactive mode
    return { behavior: 'allow' };
  };
  const queryOptions: Options = {
    // In interactive mode, allow all tools; in non-interactive, restrict to none
    tools: interactive
      ? { type: 'preset', preset: 'claude_code' }
      : [],
    permissionMode: 'bypassPermissions',
    allowDangerouslySkipPermissions: true,
    // Constrain the model's final output to the EvalSpec JSON Schema.
    outputFormat: {
      type: 'json_schema',
      schema: EVAL_SPEC_JSON_SCHEMA,
    },
    canUseTool: interactive ? canUseTool : undefined,
  };
  const fullPrompt = `${systemPrompt}\n\n---\n\n${userPrompt}`;
  // Stream SDK messages: echo assistant text to stderr, capture the result.
  for await (const message of query({ prompt: fullPrompt, options: queryOptions })) {
    handleMessage(message);
    if (message.type === 'result') {
      if (message.subtype === 'success') {
        // SDK returns parsed JSON in structured_output when outputFormat is set
        const structuredOutput = (message as { structured_output?: unknown }).structured_output;
        const resultData = structuredOutput ?? message.result;
        spec = parseResult(resultData);
        tokensUsed = (message.usage?.input_tokens ?? 0) + (message.usage?.output_tokens ?? 0);
      } else {
        throw new Error(`Generation failed: ${message.subtype}`);
      }
    }
  }
  if (!spec) {
    throw new Error('Failed to generate EvalSpec: no result received');
  }
  // Stamp provenance/usage into the spec, preserving fields the model
  // already set (e.g. confidence).
  spec.metadata = {
    ...spec.metadata,
    generatedBy: 'evaluclaude-harness',
    totalTokens: tokensUsed,
    questionsAsked,
  };
  return { spec, tokensUsed, questionsAsked };
}
/**
 * Coerce the SDK's result payload into an EvalSpec.
 *
 * Objects are passed through unchanged. Strings are cleaned up first: a
 * fenced ```json block is unwrapped if present, then any prose surrounding
 * the outermost `{...}` pair is trimmed before JSON.parse.
 *
 * @throws If a string payload is not valid JSON, or the payload is neither
 *   a string nor an object.
 */
function parseResult(result: unknown): EvalSpec {
  if (result !== null && typeof result === 'object') {
    return result as EvalSpec;
  }
  if (typeof result === 'string') {
    let candidate = result.trim();
    // Prefer the contents of a fenced ```json ... ``` block when present.
    const fenced = candidate.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) {
      candidate = fenced[1].trim();
    }
    // Trim any prose around the outermost brace pair.
    const open = candidate.indexOf('{');
    const close = candidate.lastIndexOf('}');
    if (open !== -1 && close !== -1 && close > open) {
      candidate = candidate.slice(open, close + 1);
    }
    try {
      return JSON.parse(candidate) as EvalSpec;
    } catch (e) {
      console.error('Raw result:', result);
      throw new Error(`Failed to parse result as JSON: ${e}`);
    }
  }
  throw new Error(`Unexpected result type: ${typeof result}`);
}
/**
 * Stream progress from SDK messages to the terminal: assistant text blocks
 * go to stderr (stdout stays clean for the spec), and non-success results
 * are logged as errors. All other message types are ignored.
 */
function handleMessage(message: SDKMessage): void {
  if (message.type === 'assistant') {
    const blocks = message.message?.content ?? [];
    for (const block of blocks) {
      if (block.type === 'text') {
        process.stderr.write(`\n${block.text}\n`);
      }
    }
  } else if (message.type === 'result' && message.subtype !== 'success') {
    console.error('Error:', message.subtype);
  }
}
/**
 * Convenience wrapper: run generation with tools and questions disabled.
 */
export async function generateEvalSpecNonInteractive(
  repoSummary: RepoSummary,
  options: Omit<GenerateOptions, 'interactive' | 'onQuestion'> = {}
): Promise<GenerateResult> {
  const merged: GenerateOptions = { ...options, interactive: false };
  return generateEvalSpec(repoSummary, merged);
}

/**
 * Convenience wrapper: run generation in interactive mode, relaying the
 * model's questions through `questionHandler`.
 */
export async function generateEvalSpecInteractive(
  repoSummary: RepoSummary,
  questionHandler: (question: string) => Promise<string>,
  options: Omit<GenerateOptions, 'interactive' | 'onQuestion'> = {}
): Promise<GenerateResult> {
  const merged: GenerateOptions = {
    ...options,
    interactive: true,
    onQuestion: questionHandler,
  };
  return generateEvalSpec(repoSummary, merged);
}

263
src/analyzer/types.ts Normal file
View file

@ -0,0 +1,263 @@
/**
 * Top-level evaluation specification produced by the analyzer: the target
 * repository, a set of test scenarios, and grading instructions.
 */
export interface EvalSpec {
  // Spec format version; '1.0' is the only version today.
  version: '1.0';
  // Repository the spec was generated for.
  repo: {
    name: string;
    languages: string[];
    analyzedAt: string;
  };
  // Concrete test scenarios to run.
  scenarios: EvalScenario[];
  // How scenario outcomes are scored.
  grading: {
    deterministic: DeterministicGrade[];
    rubrics: RubricGrade[];
  };
  // Provenance and generation stats; partly overwritten by the generator.
  metadata: {
    generatedBy: string;
    totalTokens: number;
    questionsAsked: number;
    confidence: 'low' | 'medium' | 'high';
  };
}
/** One testable behavior of a target function, method, or class. */
export interface EvalScenario {
  id: string;
  name: string;
  description: string;
  // The code under test.
  target: {
    module: string;
    function: string;
    type: 'function' | 'method' | 'class';
  };
  category: 'unit' | 'integration' | 'edge-case' | 'negative';
  priority: 'critical' | 'high' | 'medium' | 'low';
  // Optional fixtures and mocks prepared before invoking the target.
  setup?: {
    fixtures: string[];
    mocks: MockSpec[];
  };
  // Arguments for the call; kwargs supports keyword-style (e.g. Python) calls.
  input: {
    args: Record<string, unknown>;
    kwargs?: Record<string, unknown>;
  };
  assertions: Assertion[];
  tags: string[];
}

/** One mock: patch `target` with a canned return value or side effect. */
export interface MockSpec {
  target: string;
  returnValue?: unknown;
  // Free-form side-effect description; semantics are consumer-defined.
  sideEffect?: string;
}
/** Discriminated union (on `type`) of every assertion a scenario may carry. */
export type Assertion =
  | EqualsAssertion
  | ContainsAssertion
  | ThrowsAssertion
  | TypeAssertion
  | MatchesAssertion
  | TruthyAssertion
  | CustomAssertion
  | LLMRubricAssertion;

/** Graded by an LLM against a rubric rather than deterministically. */
export interface LLMRubricAssertion extends BaseAssertion {
  type: 'llm-rubric';
  rubric: string;
  criteria: string[];
  // Minimum score to pass; scale and default are consumer-defined.
  passingThreshold?: number;
}

/** Fields shared by the assertion variants. */
export interface BaseAssertion {
  description?: string;
}

/** Exact equality against `expected` (optionally at a sub-`path`). */
export interface EqualsAssertion extends BaseAssertion {
  type: 'equals';
  expected: unknown;
  path?: string;
}

/** The result (or value at `path`) contains `value`. */
export interface ContainsAssertion extends BaseAssertion {
  type: 'contains';
  value: unknown;
  path?: string;
}

/** The call is expected to throw/raise. */
export interface ThrowsAssertion extends BaseAssertion {
  type: 'throws';
  errorType?: string;
  messageContains?: string;
}

/** Runtime-type check of the result (or value at `path`). */
export interface TypeAssertion extends BaseAssertion {
  type: 'typeof';
  expected: 'string' | 'number' | 'boolean' | 'object' | 'array' | 'null' | 'undefined';
  path?: string;
}

/** Regex match against the result (or value at `path`). */
export interface MatchesAssertion extends BaseAssertion {
  type: 'matches';
  pattern: string;
  path?: string;
}

/** Truthiness/falsiness of the result (or value at `path`). */
export interface TruthyAssertion extends BaseAssertion {
  type: 'truthy' | 'falsy';
  path?: string;
}
export interface CustomAssertion {
type: 'custom';
description: string;
check: string;
}
/** Mechanical grade: award `score` when the scenario's outcome matches `check`. */
export interface DeterministicGrade {
  scenarioId: string;
  check: 'pass' | 'fail' | 'error';
  score: number;
}

/** LLM-judged grade: score a scenario against `criteria` up to `maxScore`. */
export interface RubricGrade {
  scenarioId: string;
  criteria: string;
  maxScore: number;
}
/**
 * JSON Schema mirroring the EvalSpec interface, passed to the model as a
 * structured-output constraint. Kept strict (`additionalProperties: false`
 * throughout) so malformed generations surface as schema violations.
 *
 * NOTE(review): the per-assertion schema flattens every Assertion variant's
 * fields into one object and only requires `type`, so it cannot enforce
 * per-variant required fields (e.g. `pattern` for 'matches'); runtime
 * validation would have to catch that.
 */
export const EVAL_SPEC_JSON_SCHEMA = {
  type: 'object',
  properties: {
    version: { type: 'string', const: '1.0' },
    // EvalSpec.repo
    repo: {
      type: 'object',
      properties: {
        name: { type: 'string' },
        languages: { type: 'array', items: { type: 'string' } },
        analyzedAt: { type: 'string' },
      },
      required: ['name', 'languages', 'analyzedAt'],
      additionalProperties: false,
    },
    // EvalSpec.scenarios — one entry per EvalScenario
    scenarios: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          id: { type: 'string' },
          name: { type: 'string' },
          description: { type: 'string' },
          target: {
            type: 'object',
            properties: {
              module: { type: 'string' },
              function: { type: 'string' },
              type: { type: 'string', enum: ['function', 'method', 'class'] },
            },
            required: ['module', 'function', 'type'],
            additionalProperties: false,
          },
          category: { type: 'string', enum: ['unit', 'integration', 'edge-case', 'negative'] },
          priority: { type: 'string', enum: ['critical', 'high', 'medium', 'low'] },
          // Optional setup block (fixtures + mocks)
          setup: {
            type: 'object',
            properties: {
              fixtures: { type: 'array', items: { type: 'string' } },
              mocks: {
                type: 'array',
                items: {
                  type: 'object',
                  properties: {
                    target: { type: 'string' },
                    // Empty schema {} accepts any JSON value.
                    returnValue: {},
                    sideEffect: { type: 'string' },
                  },
                  required: ['target'],
                  additionalProperties: false,
                },
              },
            },
            required: ['fixtures', 'mocks'],
            additionalProperties: false,
          },
          input: {
            type: 'object',
            properties: {
              args: { type: 'object' },
              kwargs: { type: 'object' },
            },
            required: ['args'],
            additionalProperties: false,
          },
          // All Assertion variants flattened into one object (see NOTE above).
          assertions: {
            type: 'array',
            items: {
              type: 'object',
              properties: {
                type: { type: 'string' },
                expected: {},
                value: {},
                path: { type: 'string' },
                errorType: { type: 'string' },
                messageContains: { type: 'string' },
                pattern: { type: 'string' },
                description: { type: 'string' },
                check: { type: 'string' },
                rubric: { type: 'string' },
                criteria: { type: 'array', items: { type: 'string' } },
                passingThreshold: { type: 'number' },
              },
              required: ['type'],
              additionalProperties: false,
            },
          },
          tags: { type: 'array', items: { type: 'string' } },
        },
        required: ['id', 'name', 'description', 'target', 'category', 'priority', 'input', 'assertions', 'tags'],
        additionalProperties: false,
      },
    },
    // EvalSpec.grading
    grading: {
      type: 'object',
      properties: {
        deterministic: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              scenarioId: { type: 'string' },
              check: { type: 'string', enum: ['pass', 'fail', 'error'] },
              score: { type: 'number' },
            },
            required: ['scenarioId', 'check', 'score'],
            additionalProperties: false,
          },
        },
        rubrics: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              scenarioId: { type: 'string' },
              criteria: { type: 'string' },
              maxScore: { type: 'number' },
            },
            required: ['scenarioId', 'criteria', 'maxScore'],
            additionalProperties: false,
          },
        },
      },
      required: ['deterministic', 'rubrics'],
      additionalProperties: false,
    },
    // EvalSpec.metadata
    metadata: {
      type: 'object',
      properties: {
        generatedBy: { type: 'string' },
        totalTokens: { type: 'number' },
        questionsAsked: { type: 'number' },
        confidence: { type: 'string', enum: ['low', 'medium', 'high'] },
      },
      required: ['generatedBy', 'totalTokens', 'questionsAsked', 'confidence'],
      additionalProperties: false,
    },
  },
  required: ['version', 'repo', 'scenarios', 'grading', 'metadata'],
  additionalProperties: false,
} as const;