# 4. Functional Test Execution & Grading - System Design

> **Priority**: 🟢 MEDIUM — Runtime layer
> **Complexity**: Medium-High
> **Effort Estimate**: 6-10 hours

---

## Overview

Executes generated tests in a **sandboxed environment** and produces structured results. Tests run in isolation to prevent accidental side effects. Results feed into Promptfoo for aggregation and UI.

---

## Architecture

```
┌────────────────────────────────────────────────────────────┐
│                  Test Execution Pipeline                   │
├────────────────────────────────────────────────────────────┤
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐  │
│  │  Test Files  │───▶│   Sandbox    │───▶│   Results    │  │
│  │  (.py/.ts)   │    │    Runner    │    │     JSON     │  │
│  └──────────────┘    └──────────────┘    └──────────────┘  │
│                             │                   │          │
│                             ▼                   ▼          │
│                      ┌──────────────┐    ┌──────────────┐  │
│                      │   pytest/    │    │  Promptfoo   │  │
│                      │   vitest     │    │ Integration  │  │
│                      └──────────────┘    └──────────────┘  │
└────────────────────────────────────────────────────────────┘
```

---

## Core Types

```typescript
interface ExecutionOptions {
  framework: 'pytest' | 'vitest' | 'jest';
  sandbox: boolean;
  timeout: number;        // ms per test
  parallel: boolean;
  filter?: string[];      // Run specific test IDs
}

interface ExecutionResult {
  summary: {
    total: number;
    passed: number;
    failed: number;
    skipped: number;
    duration: number;
  };
  tests: TestResult[];
  errors: string[];
}

interface TestResult {
  id: string;             // Maps to EvalScenario.id
  name: string;
  status: 'passed' | 'failed' | 'skipped' | 'error';
  duration: number;
  assertions: {
    passed: number;
    failed: number;
    details: AssertionResult[];
  };
  error?: { message: string; stack?: string };
  stdout?: string;
  stderr?: string;
}
```
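
For orientation, a single normalized result might look like this; the `name` and assertion counts are illustrative, and the `id` reuses the `auth-login-success` scenario from the Promptfoo config below:

```typescript
// Illustrative TestResult for one passing scenario (values assumed).
const example: TestResult = {
  id: 'auth-login-success', // ties back to EvalScenario.id
  name: 'tests/test_auth.py::test_auth_login_success',
  status: 'passed',
  duration: 42,
  assertions: { passed: 3, failed: 0, details: [] },
};
```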

---

## Sandbox Configuration

```typescript
|
||||
const SANDBOX_CONFIG = {
|
||||
enabled: true,
|
||||
autoAllowBashIfSandboxed: true,
|
||||
network: {
|
||||
allowLocalBinding: true,
|
||||
allowOutbound: false, // No external network
|
||||
},
|
||||
filesystem: {
|
||||
readOnly: ['/'],
|
||||
writable: ['/tmp', './test-output'],
|
||||
},
|
||||
env: {
|
||||
inherit: ['PATH', 'HOME'],
|
||||
set: { CI: 'true', NODE_ENV: 'test' },
|
||||
},
|
||||
};
|
||||
```
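
The network and filesystem rules need OS-level enforcement inside `sandbox.ts`, but the env policy can be applied directly when spawning a runner process. A minimal sketch, with `buildEnv` as a hypothetical helper:

```typescript
import { spawn } from 'node:child_process';

// Hypothetical helper: build the child env from the policy above. Only
// variables listed in `inherit` survive; `set` entries are layered on top.
function buildEnv(config: typeof SANDBOX_CONFIG): NodeJS.ProcessEnv {
  const env: NodeJS.ProcessEnv = {};
  for (const key of config.env.inherit) {
    if (process.env[key] !== undefined) env[key] = process.env[key];
  }
  return { ...env, ...config.env.set };
}

// Runner processes get the filtered environment, nothing else.
const child = spawn('pytest', ['--json-report', '.'], {
  env: buildEnv(SANDBOX_CONFIG),
  cwd: '.evaluclaude/tests',
});
```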

---

## Runner Implementations

### Pytest Runner

```typescript
// Assumes module-level `exec` (an execa-style helper that resolves with
// { exitCode, stdout, stderr } instead of throwing on non-zero exit),
// plus `fs` (node:fs/promises) and `path` (node:path).
async function runPytest(testDir: string, options: ExecutionOptions): Promise<ExecutionResult> {
  const args = [
    '-v',
    '--tb=short',
    '--json-report',
    '--json-report-file=results.json',
    options.parallel ? '-n auto' : '',
    // pytest accepts a single -k expression; join multiple filters with "or"
    options.filter?.length ? `-k "${options.filter.join(' or ')}"` : '',
  ].filter(Boolean);

  // cwd is testDir, so target "." rather than the (relative) testDir path
  const { exitCode, stdout, stderr } = await exec(
    `pytest ${args.join(' ')} .`,
    { timeout: options.timeout, cwd: testDir }
  );

  // --json-report-file is relative to cwd, so read it from testDir
  const report = JSON.parse(await fs.readFile(path.join(testDir, 'results.json'), 'utf-8'));
  return parseJsonReport(report);
}
```
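
Example invocation from the orchestrator (paths, timeout, and filter values are illustrative):

```typescript
const result = await runPytest('.evaluclaude/tests', {
  framework: 'pytest',
  sandbox: true,
  timeout: 300_000, // 5 minutes, in ms
  parallel: true,
  filter: ['auth_login'],
});
console.log(`${result.summary.passed}/${result.summary.total} passed`);
```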

### Vitest Runner

```typescript
async function runVitest(testDir: string, options: ExecutionOptions): Promise<ExecutionResult> {
  const args = [
    'run',
    '--reporter=json',
    '--outputFile=results.json',
    options.filter?.length ? `--testNamePattern="${options.filter.join('|')}"` : '',
  ].filter(Boolean);

  // vitest exits non-zero when tests fail; the exec helper surfaces that
  // through exitCode rather than throwing
  const { exitCode } = await exec(
    `npx vitest ${args.join(' ')}`,
    { timeout: options.timeout, cwd: testDir }
  );

  // --outputFile is relative to cwd (testDir)
  const report = JSON.parse(await fs.readFile(path.join(testDir, 'results.json'), 'utf-8'));
  return parseVitestReport(report);
}
```

---

## Promptfoo Integration

### Custom Provider (`providers/test-runner.py`)

```python
def get_provider_response(prompt: str, options: dict, context: dict) -> dict:
    """Runs tests and returns structured results."""
    import json
    import subprocess

    test_dir = options.get('test_dir', './tests')
    framework = options.get('framework', 'pytest')

    if framework != 'pytest':
        return {'output': None, 'error': f'Unsupported framework: {framework}'}

    # pytest exits non-zero when tests fail, so the return code is not an error here
    subprocess.run(
        ['pytest', '--json-report', '--json-report-file=/tmp/results.json', test_dir],
        capture_output=True, text=True, timeout=300,
    )
    with open('/tmp/results.json') as f:
        report = json.load(f)

    # pytest-json-report omits zero-count categories from the summary
    summary = report['summary']
    return {
        'output': json.dumps({
            'passed': summary.get('passed', 0),
            'failed': summary.get('failed', 0),
            'tests': report['tests'],
        }),
        'error': None,
    }
```

### Promptfoo Config

```yaml
providers:
  - id: file://providers/test-runner.py
    label: functional-tests
    config:
      test_dir: .evaluclaude/tests
      framework: pytest
      timeout: 300

tests:
  - vars:
      scenario_id: auth-login-success
    assert:
      - type: python
        value: |
          import json
          result = json.loads(output)
          result['passed'] > 0 and result['failed'] == 0
```

---

## File Structure

```
src/runners/
├── index.ts            # Main entry + registry
├── types.ts            # Interfaces
├── sandbox.ts          # Isolation wrapper
├── pytest-runner.ts    # Python test execution
├── vitest-runner.ts    # Vitest execution
├── jest-runner.ts      # Jest execution
└── result-parser.ts    # Normalize results

providers/
└── test-runner.py      # Promptfoo provider
```
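
`index.ts` is mostly a framework-to-runner dispatch table. A minimal sketch, assuming `runJest` mirrors `runVitest`:

```typescript
// src/runners/index.ts (sketch): pick a runner by framework and delegate.
type Runner = (testDir: string, options: ExecutionOptions) => Promise<ExecutionResult>;

const RUNNERS: Record<ExecutionOptions['framework'], Runner> = {
  pytest: runPytest,
  vitest: runVitest,
  jest: runJest, // assumed to mirror runVitest with jest's JSON reporter
};

export async function execute(testDir: string, options: ExecutionOptions): Promise<ExecutionResult> {
  return RUNNERS[options.framework](testDir, options);
}
```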

---

## Result Parsing

```typescript
// Normalizes a pytest-json-report payload into an ExecutionResult.
function parseJsonReport(report: any): ExecutionResult {
  return {
    summary: {
      total: report.summary.total,
      // pytest-json-report omits zero-count categories from the summary
      passed: report.summary.passed ?? 0,
      failed: report.summary.failed ?? 0,
      skipped: report.summary.skipped ?? 0,
      duration: report.duration,
    },
    tests: report.tests.map((t: any) => ({
      id: extractScenarioId(t.nodeid),
      name: t.nodeid,
      status: t.outcome,
      duration: t.call?.duration || 0,
      assertions: { passed: 0, failed: 0, details: [] },
      error: t.call?.crash ? { message: t.call.crash.message } : undefined,
    })),
    errors: [],
  };
}
```
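
`extractScenarioId` does the mapping called out in the success criteria. A minimal sketch, assuming the generation layer names test functions `test_<scenario_id_with_underscores>` (an assumption, not a confirmed convention):

```typescript
// Recover "auth-login-success" from a nodeid like
// "tests/test_auth.py::test_auth_login_success"; assumes generated
// tests embed the scenario ID in the function name.
function extractScenarioId(nodeid: string): string {
  const testName = nodeid.split('::').pop() ?? nodeid;
  return testName.replace(/^test_/, '').replace(/_/g, '-');
}
```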

---

## Dependencies

```json
{
  "dependencies": {}
}
```

**Test framework deps (installed in target repo):**

- `pytest`, `pytest-json-report`, `pytest-xdist` (Python)
- `vitest` or `jest` (TypeScript)

---

## Success Criteria

- [ ] Pytest tests run and produce JSON results
- [ ] Vitest tests run and produce JSON results
- [ ] Sandbox prevents network/filesystem escape
- [ ] Results map back to EvalScenario IDs
- [ ] Promptfoo provider returns structured results its asserts can consume
- [ ] Parallel execution supported