mirror of
https://github.com/harivansh-afk/evaluclaude-harness.git
synced 2026-04-17 12:04:13 +00:00
analyze
This commit is contained in:
parent
4b24606d0e
commit
9297f0b1ee
13 changed files with 1292 additions and 16 deletions
12
src/analyzer/index.ts
Normal file
12
src/analyzer/index.ts
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
// Public surface of the analyzer module: spec generation entry points,
// the shared EvalSpec type family, and the prompt-building helpers.
export { generateEvalSpec, generateEvalSpecInteractive, generateEvalSpecNonInteractive } from './spec-generator.js';
export type { GenerateResult, GenerateOptions } from './spec-generator.js';
export type {
  EvalSpec,
  EvalScenario,
  Assertion,
  MockSpec,
  DeterministicGrade,
  RubricGrade,
} from './types.js';
export { EVAL_SPEC_JSON_SCHEMA } from './types.js';
export { buildSystemPrompt, buildUserPrompt, optimizeForPrompt } from './prompt-builder.js';
|
||||
112
src/analyzer/prompt-builder.ts
Normal file
112
src/analyzer/prompt-builder.ts
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
import * as fs from 'node:fs/promises';
|
||||
import * as path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import type { RepoSummary } from '../introspector/types.js';
|
||||
|
||||
// Recreate __dirname under ESM, where the CommonJS global is unavailable.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Prompt templates live two levels up from this file (repo-root /prompts,
// relative to the built module's location).
const PROMPTS_DIR = path.join(__dirname, '../../prompts');
|
||||
|
||||
/** Inputs for assembling the analyzer's user prompt. */
export interface PromptConfig {
  repoSummary: RepoSummary; // full introspection result; condensed via optimizeForPrompt before embedding
  focus?: string[]; // optional modules/functions the prompt should emphasize
  maxScenarios?: number; // scenario cap substituted into the template (defaults to 10)
}
|
||||
|
||||
export async function loadPrompt(name: string): Promise<string> {
|
||||
const filePath = path.join(PROMPTS_DIR, `${name}.md`);
|
||||
return fs.readFile(filePath, 'utf-8');
|
||||
}
|
||||
|
||||
export async function buildSystemPrompt(): Promise<string> {
|
||||
const system = await loadPrompt('analyzer-system');
|
||||
const developer = await loadPrompt('analyzer-developer');
|
||||
return `${system}\n\n${developer}`;
|
||||
}
|
||||
|
||||
export async function buildUserPrompt(config: PromptConfig): Promise<string> {
|
||||
const template = await loadPrompt('analyzer-user');
|
||||
|
||||
const optimizedSummary = optimizeForPrompt(config.repoSummary);
|
||||
const summaryJson = JSON.stringify(optimizedSummary, null, 2);
|
||||
|
||||
const focusInstructions = config.focus?.length
|
||||
? `Focus specifically on these modules/functions: ${config.focus.join(', ')}`
|
||||
: 'Analyze the entire codebase and identify the most important testable functions.';
|
||||
|
||||
const maxScenarios = config.maxScenarios ?? 10;
|
||||
|
||||
return template
|
||||
.replace('{{REPO_SUMMARY}}', summaryJson)
|
||||
.replace('{{FOCUS_INSTRUCTIONS}}', focusInstructions)
|
||||
.replace('{{MAX_SCENARIOS}}', String(maxScenarios));
|
||||
}
|
||||
|
||||
export function optimizeForPrompt(summary: RepoSummary): OptimizedRepoSummary {
|
||||
return {
|
||||
name: path.basename(summary.root),
|
||||
languages: summary.languages,
|
||||
analyzedAt: summary.analyzedAt,
|
||||
|
||||
modules: summary.modules.map(m => ({
|
||||
path: m.path,
|
||||
complexity: m.complexity,
|
||||
exports: m.exports.map(e => ({
|
||||
name: e.name,
|
||||
kind: e.kind,
|
||||
signature: e.signature,
|
||||
docstring: e.docstring,
|
||||
line: e.lineNumber,
|
||||
async: e.isAsync,
|
||||
})).filter(e => !e.name.startsWith('_')),
|
||||
imports: m.imports.slice(0, 10),
|
||||
})).filter(m => m.exports.length > 0),
|
||||
|
||||
config: {
|
||||
python: summary.config.python ? {
|
||||
testFramework: summary.config.python.testFramework,
|
||||
hasTyping: summary.config.python.hasTyping,
|
||||
} : undefined,
|
||||
typescript: summary.config.typescript ? {
|
||||
testFramework: summary.config.typescript.testFramework,
|
||||
hasTypes: summary.config.typescript.hasTypes,
|
||||
} : undefined,
|
||||
},
|
||||
|
||||
git: summary.git ? {
|
||||
branch: summary.git.branch,
|
||||
activeFiles: summary.git.fileHistory
|
||||
?.sort((a, b) => b.commitCount - a.commitCount)
|
||||
.slice(0, 10)
|
||||
.map(f => ({ path: f.path, commits: f.commitCount })),
|
||||
} : undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Compact, prompt-friendly projection of RepoSummary produced by
 * optimizeForPrompt: private exports dropped, per-module imports truncated,
 * git history reduced to the most active files.
 */
export interface OptimizedRepoSummary {
  name: string; // basename of the repo root directory
  languages: string[];
  analyzedAt: string;
  modules: OptimizedModule[]; // only modules that retain at least one public export
  config: {
    python?: { testFramework: string; hasTyping: boolean };
    typescript?: { testFramework: string; hasTypes: boolean };
  };
  git?: {
    branch: string;
    activeFiles?: { path: string; commits: number }[]; // top 10 files by commit count
  };
}
|
||||
|
||||
/** Per-module slice of the optimized summary. */
interface OptimizedModule {
  path: string;
  complexity: string;
  exports: {
    name: string;
    kind: string;
    signature?: string;
    docstring?: string;
    line: number; // source line of the export (renamed from lineNumber)
    async?: boolean; // renamed from isAsync
  }[];
  imports: string[]; // truncated to the first 10 entries
}
|
||||
174
src/analyzer/spec-generator.ts
Normal file
174
src/analyzer/spec-generator.ts
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
import { query, type SDKMessage, type Options, type CanUseTool, type PermissionResult } from '@anthropic-ai/claude-agent-sdk';
|
||||
import type { RepoSummary } from '../introspector/types.js';
|
||||
import type { EvalSpec } from './types.js';
|
||||
import { buildSystemPrompt, buildUserPrompt } from './prompt-builder.js';
|
||||
import { EVAL_SPEC_JSON_SCHEMA } from './types.js';
|
||||
|
||||
/** Options controlling how the analyzer agent generates an EvalSpec. */
export interface GenerateOptions {
  interactive?: boolean; // when true, AskUserQuestion tool calls are routed to onQuestion
  onQuestion?: (question: string) => Promise<string>; // supplies the user's answer to a question
  focus?: string[]; // restrict analysis to these modules/functions
  maxScenarios?: number; // cap on generated scenarios (defaults to 10)
}
|
||||
|
||||
/** Outcome of a generation run: the spec plus usage bookkeeping. */
export interface GenerateResult {
  spec: EvalSpec;
  tokensUsed: number; // input + output tokens reported by the SDK result message
  questionsAsked: number; // AskUserQuestion round-trips handled during the run
}
|
||||
|
||||
/**
 * Drive a Claude Agent SDK session that analyzes `repoSummary` and produces
 * a structured EvalSpec.
 *
 * Flow: build system + user prompts, stream the SDK `query`, echo progress
 * via handleMessage, then parse the final result message into an EvalSpec
 * and stamp usage metadata onto it.
 *
 * @param repoSummary introspection result describing the target repository
 * @param options interactive mode, question handler, focus list, scenario cap
 * @returns the parsed spec plus token/question counters
 * @throws Error when the SDK reports a non-success result subtype, when no
 *   result message arrives, or when the result cannot be parsed as JSON
 */
export async function generateEvalSpec(
  repoSummary: RepoSummary,
  options: GenerateOptions = {}
): Promise<GenerateResult> {
  const { interactive = false, onQuestion, focus, maxScenarios = 10 } = options;

  const systemPrompt = await buildSystemPrompt();
  const userPrompt = await buildUserPrompt({
    repoSummary,
    focus,
    maxScenarios,
  });

  // Counters are closed over by canUseTool and filled in from the result
  // message; spec stays null until a successful result is parsed.
  let tokensUsed = 0;
  let questionsAsked = 0;
  let spec: EvalSpec | null = null;

  // Permission hook: intercepts AskUserQuestion in interactive mode and
  // injects the user's answer into the tool input; everything else is allowed.
  const canUseTool: CanUseTool = async (toolName, input): Promise<PermissionResult> => {
    if (toolName === 'AskUserQuestion' && interactive && onQuestion) {
      // Extract question from various possible field names
      // (the SDK's tool-input shape is not pinned here; fall back to raw JSON).
      const inputObj = input as Record<string, unknown>;
      const question = String(
        inputObj.question ||
        inputObj.text ||
        inputObj.message ||
        inputObj.prompt ||
        JSON.stringify(input)
      );

      const answer = await onQuestion(question);
      questionsAsked++;
      return {
        behavior: 'allow',
        updatedInput: { ...input, answer },
      };
    }
    // Allow all other tools in interactive mode
    // NOTE(review): this branch returns 'allow' without updatedInput —
    // confirm the SDK accepts that shape for PermissionResult.
    return { behavior: 'allow' };
  };

  const queryOptions: Options = {
    // In interactive mode, allow all tools; in non-interactive, restrict to none
    tools: interactive
      ? { type: 'preset', preset: 'claude_code' }
      : [],
    permissionMode: 'bypassPermissions',
    allowDangerouslySkipPermissions: true,
    // Constrain model output to the EvalSpec JSON schema.
    outputFormat: {
      type: 'json_schema',
      schema: EVAL_SPEC_JSON_SCHEMA,
    },
    canUseTool: interactive ? canUseTool : undefined,
  };

  // System and user prompts are sent as one prompt, separated by a rule.
  const fullPrompt = `${systemPrompt}\n\n---\n\n${userPrompt}`;

  for await (const message of query({ prompt: fullPrompt, options: queryOptions })) {
    handleMessage(message);

    if (message.type === 'result') {
      if (message.subtype === 'success') {
        // SDK returns parsed JSON in structured_output when outputFormat is set
        // (field is not in the published SDKMessage type, hence the cast).
        const structuredOutput = (message as { structured_output?: unknown }).structured_output;
        const resultData = structuredOutput ?? message.result;
        spec = parseResult(resultData);
        tokensUsed = (message.usage?.input_tokens ?? 0) + (message.usage?.output_tokens ?? 0);
      } else {
        throw new Error(`Generation failed: ${message.subtype}`);
      }
    }
  }

  if (!spec) {
    throw new Error('Failed to generate EvalSpec: no result received');
  }

  // Overwrite generation bookkeeping with locally observed values; any
  // other metadata fields from the model (e.g. confidence) are preserved.
  spec.metadata = {
    ...spec.metadata,
    generatedBy: 'evaluclaude-harness',
    totalTokens: tokensUsed,
    questionsAsked,
  };

  return { spec, tokensUsed, questionsAsked };
}
|
||||
|
||||
function parseResult(result: unknown): EvalSpec {
|
||||
if (typeof result === 'string') {
|
||||
let jsonStr = result.trim();
|
||||
|
||||
// Try to extract JSON from markdown code blocks
|
||||
const jsonMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
|
||||
if (jsonMatch) {
|
||||
jsonStr = jsonMatch[1].trim();
|
||||
}
|
||||
|
||||
// Try to find JSON object in the string
|
||||
const startIdx = jsonStr.indexOf('{');
|
||||
const endIdx = jsonStr.lastIndexOf('}');
|
||||
if (startIdx !== -1 && endIdx !== -1 && endIdx > startIdx) {
|
||||
jsonStr = jsonStr.slice(startIdx, endIdx + 1);
|
||||
}
|
||||
|
||||
try {
|
||||
return JSON.parse(jsonStr) as EvalSpec;
|
||||
} catch (e) {
|
||||
console.error('Raw result:', result);
|
||||
throw new Error(`Failed to parse result as JSON: ${e}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (result && typeof result === 'object') {
|
||||
return result as EvalSpec;
|
||||
}
|
||||
|
||||
throw new Error(`Unexpected result type: ${typeof result}`);
|
||||
}
|
||||
|
||||
function handleMessage(message: SDKMessage): void {
|
||||
switch (message.type) {
|
||||
case 'assistant':
|
||||
if (message.message?.content) {
|
||||
for (const block of message.message.content) {
|
||||
if (block.type === 'text') {
|
||||
process.stderr.write(`\n${block.text}\n`);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'result':
|
||||
if (message.subtype !== 'success') {
|
||||
console.error('Error:', message.subtype);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
export async function generateEvalSpecNonInteractive(
|
||||
repoSummary: RepoSummary,
|
||||
options: Omit<GenerateOptions, 'interactive' | 'onQuestion'> = {}
|
||||
): Promise<GenerateResult> {
|
||||
return generateEvalSpec(repoSummary, { ...options, interactive: false });
|
||||
}
|
||||
|
||||
export async function generateEvalSpecInteractive(
|
||||
repoSummary: RepoSummary,
|
||||
questionHandler: (question: string) => Promise<string>,
|
||||
options: Omit<GenerateOptions, 'interactive' | 'onQuestion'> = {}
|
||||
): Promise<GenerateResult> {
|
||||
return generateEvalSpec(repoSummary, {
|
||||
...options,
|
||||
interactive: true,
|
||||
onQuestion: questionHandler,
|
||||
});
|
||||
}
|
||||
263
src/analyzer/types.ts
Normal file
263
src/analyzer/types.ts
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
/**
 * Top-level evaluation specification produced by the analyzer.
 *
 * Bundles the analyzed repo's identity, the generated test scenarios, how
 * scenarios are graded, and bookkeeping about the generation run. Mirrors
 * EVAL_SPEC_JSON_SCHEMA, which constrains the model's structured output.
 */
export interface EvalSpec {
  version: '1.0'; // spec format version
  repo: {
    name: string;
    languages: string[];
    analyzedAt: string;
  };
  scenarios: EvalScenario[];
  grading: {
    deterministic: DeterministicGrade[]; // machine-checkable outcomes
    rubrics: RubricGrade[]; // judged criteria with a max score
  };
  metadata: {
    generatedBy: string; // tool identifier ('evaluclaude-harness' when stamped by the generator)
    totalTokens: number; // input + output tokens consumed during generation
    questionsAsked: number; // AskUserQuestion round-trips in interactive mode
    confidence: 'low' | 'medium' | 'high';
  };
}
|
||||
|
||||
/** One generated test scenario targeting a single function, method, or class. */
export interface EvalScenario {
  id: string; // unique within the spec; referenced by grading entries via scenarioId
  name: string;
  description: string;
  target: {
    module: string; // module containing the symbol under test
    function: string; // symbol name under test
    type: 'function' | 'method' | 'class';
  };
  category: 'unit' | 'integration' | 'edge-case' | 'negative';
  priority: 'critical' | 'high' | 'medium' | 'low';
  setup?: {
    fixtures: string[];
    mocks: MockSpec[];
  };
  input: {
    args: Record<string, unknown>; // arguments for the call under test
    kwargs?: Record<string, unknown>; // keyword arguments (Python targets)
  };
  assertions: Assertion[]; // checks applied to the call's outcome
  tags: string[];
}
|
||||
|
||||
/**
 * Declarative mock for scenario setup: patch `target` and either return a
 * canned value or trigger a side effect described as a string.
 */
export interface MockSpec {
  target: string; // symbol/path to patch
  returnValue?: unknown;
  sideEffect?: string;
}
|
||||
|
||||
/**
 * Discriminated union of all assertion kinds, keyed on `type`.
 * All variants except CustomAssertion extend BaseAssertion.
 */
export type Assertion =
  | EqualsAssertion
  | ContainsAssertion
  | ThrowsAssertion
  | TypeAssertion
  | MatchesAssertion
  | TruthyAssertion
  | CustomAssertion
  | LLMRubricAssertion;

/** Assertion judged by an LLM against a free-text rubric. */
export interface LLMRubricAssertion extends BaseAssertion {
  type: 'llm-rubric';
  rubric: string; // prose describing what a passing result looks like
  criteria: string[]; // individual points for the judge to score
  passingThreshold?: number;
}

/** Fields shared by the structured assertion kinds. */
export interface BaseAssertion {
  description?: string;
}

// NOTE(review): `path` on the assertions below presumably selects a nested
// value inside the result to check — confirm against the runner that
// evaluates these specs.

/** The result (or the value at `path`) must equal `expected`. */
export interface EqualsAssertion extends BaseAssertion {
  type: 'equals';
  expected: unknown;
  path?: string;
}

/** The result (or the value at `path`) must contain `value`. */
export interface ContainsAssertion extends BaseAssertion {
  type: 'contains';
  value: unknown;
  path?: string;
}

/** The call must throw; optionally constrain the error type and message. */
export interface ThrowsAssertion extends BaseAssertion {
  type: 'throws';
  errorType?: string;
  messageContains?: string;
}

/** Runtime type check on the result (or the value at `path`). */
export interface TypeAssertion extends BaseAssertion {
  type: 'typeof';
  expected: 'string' | 'number' | 'boolean' | 'object' | 'array' | 'null' | 'undefined';
  path?: string;
}

/** The result (or the value at `path`) must match the regex `pattern`. */
export interface MatchesAssertion extends BaseAssertion {
  type: 'matches';
  pattern: string; // regular expression source
  path?: string;
}

/** The result (or the value at `path`) must be truthy or falsy per `type`. */
export interface TruthyAssertion extends BaseAssertion {
  type: 'truthy' | 'falsy';
  path?: string;
}

/**
 * Escape hatch: an arbitrary check described in `check`. Unlike the other
 * kinds this does not extend BaseAssertion — `description` is required.
 */
export interface CustomAssertion {
  type: 'custom';
  description: string;
  check: string;
}
|
||||
|
||||
/** Machine-checkable grade: expected outcome of a scenario and its score. */
export interface DeterministicGrade {
  scenarioId: string; // EvalScenario.id this grade applies to
  check: 'pass' | 'fail' | 'error';
  score: number;
}
|
||||
|
||||
/** Judged grade: free-text criteria for a scenario with a maximum score. */
export interface RubricGrade {
  scenarioId: string; // EvalScenario.id this rubric applies to
  criteria: string;
  maxScore: number;
}
|
||||
|
||||
/**
 * JSON Schema counterpart of EvalSpec, passed to the model as the required
 * structured-output format. `additionalProperties: false` throughout keeps
 * the model from inventing fields.
 *
 * NOTE: assertion items are validated loosely — only `type` is required and
 * the per-kind fields are a flat optional superset; the stricter per-kind
 * requirements of the Assertion union are not enforced here.
 */
export const EVAL_SPEC_JSON_SCHEMA = {
  type: 'object',
  properties: {
    version: { type: 'string', const: '1.0' },
    // EvalSpec.repo
    repo: {
      type: 'object',
      properties: {
        name: { type: 'string' },
        languages: { type: 'array', items: { type: 'string' } },
        analyzedAt: { type: 'string' },
      },
      required: ['name', 'languages', 'analyzedAt'],
      additionalProperties: false,
    },
    // EvalScenario[]
    scenarios: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          id: { type: 'string' },
          name: { type: 'string' },
          description: { type: 'string' },
          target: {
            type: 'object',
            properties: {
              module: { type: 'string' },
              function: { type: 'string' },
              type: { type: 'string', enum: ['function', 'method', 'class'] },
            },
            required: ['module', 'function', 'type'],
            additionalProperties: false,
          },
          category: { type: 'string', enum: ['unit', 'integration', 'edge-case', 'negative'] },
          priority: { type: 'string', enum: ['critical', 'high', 'medium', 'low'] },
          // setup is optional on the scenario, but when present both
          // fixtures and mocks must be given.
          setup: {
            type: 'object',
            properties: {
              fixtures: { type: 'array', items: { type: 'string' } },
              mocks: {
                type: 'array',
                items: {
                  type: 'object',
                  properties: {
                    target: { type: 'string' },
                    returnValue: {}, // empty schema: any JSON value
                    sideEffect: { type: 'string' },
                  },
                  required: ['target'],
                  additionalProperties: false,
                },
              },
            },
            required: ['fixtures', 'mocks'],
            additionalProperties: false,
          },
          input: {
            type: 'object',
            properties: {
              args: { type: 'object' },
              kwargs: { type: 'object' },
            },
            required: ['args'],
            additionalProperties: false,
          },
          // Flat superset of every Assertion variant's fields (see NOTE above).
          assertions: {
            type: 'array',
            items: {
              type: 'object',
              properties: {
                type: { type: 'string' },
                expected: {}, // any JSON value
                value: {}, // any JSON value
                path: { type: 'string' },
                errorType: { type: 'string' },
                messageContains: { type: 'string' },
                pattern: { type: 'string' },
                description: { type: 'string' },
                check: { type: 'string' },
                rubric: { type: 'string' },
                criteria: { type: 'array', items: { type: 'string' } },
                passingThreshold: { type: 'number' },
              },
              required: ['type'],
              additionalProperties: false,
            },
          },
          tags: { type: 'array', items: { type: 'string' } },
        },
        required: ['id', 'name', 'description', 'target', 'category', 'priority', 'input', 'assertions', 'tags'],
        additionalProperties: false,
      },
    },
    // EvalSpec.grading
    grading: {
      type: 'object',
      properties: {
        deterministic: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              scenarioId: { type: 'string' },
              check: { type: 'string', enum: ['pass', 'fail', 'error'] },
              score: { type: 'number' },
            },
            required: ['scenarioId', 'check', 'score'],
            additionalProperties: false,
          },
        },
        rubrics: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              scenarioId: { type: 'string' },
              criteria: { type: 'string' },
              maxScore: { type: 'number' },
            },
            required: ['scenarioId', 'criteria', 'maxScore'],
            additionalProperties: false,
          },
        },
      },
      required: ['deterministic', 'rubrics'],
      additionalProperties: false,
    },
    // EvalSpec.metadata (generator overwrites the counters after parsing)
    metadata: {
      type: 'object',
      properties: {
        generatedBy: { type: 'string' },
        totalTokens: { type: 'number' },
        questionsAsked: { type: 'number' },
        confidence: { type: 'string', enum: ['low', 'medium', 'high'] },
      },
      required: ['generatedBy', 'totalTokens', 'questionsAsked', 'confidence'],
      additionalProperties: false,
    },
  },
  required: ['version', 'repo', 'scenarios', 'grading', 'metadata'],
  additionalProperties: false,
} as const;
|
||||
Loading…
Add table
Add a link
Reference in a new issue