From e0c36241b03a89cf24c43009510cbeeea6d9f4f8 Mon Sep 17 00:00:00 2001 From: Harivansh Rathi Date: Sun, 11 Jan 2026 18:13:00 -0500 Subject: [PATCH] grader, test renderer --- package-lock.json | 494 +++++++++++++++++++- package.json | 4 + prompts/grader-system.md | 53 +++ prompts/grader-user.md | 33 ++ src/cli/commands/grade.ts | 101 ++++ src/cli/commands/render.ts | 61 +++ src/cli/index.ts | 6 + src/graders/index.ts | 22 + src/graders/llm/calibrator.ts | 68 +++ src/graders/llm/grader.ts | 98 ++++ src/graders/llm/index.ts | 4 + src/graders/llm/prompt-builder.ts | 50 ++ src/graders/llm/rubric-loader.ts | 127 +++++ src/graders/types.ts | 63 +++ src/index.ts | 2 + src/renderers/base.ts | 96 ++++ src/renderers/index.ts | 67 +++ src/renderers/python/assertions.ts | 104 +++++ src/renderers/python/pytest-renderer.ts | 160 +++++++ src/renderers/types.ts | 40 ++ src/renderers/typescript/assertions.ts | 114 +++++ src/renderers/typescript/vitest-renderer.ts | 152 ++++++ 22 files changed, 1914 insertions(+), 5 deletions(-) create mode 100644 prompts/grader-system.md create mode 100644 prompts/grader-user.md create mode 100644 src/cli/commands/grade.ts create mode 100644 src/cli/commands/render.ts create mode 100644 src/graders/index.ts create mode 100644 src/graders/llm/calibrator.ts create mode 100644 src/graders/llm/grader.ts create mode 100644 src/graders/llm/index.ts create mode 100644 src/graders/llm/prompt-builder.ts create mode 100644 src/graders/llm/rubric-loader.ts create mode 100644 src/graders/types.ts create mode 100644 src/renderers/base.ts create mode 100644 src/renderers/index.ts create mode 100644 src/renderers/python/assertions.ts create mode 100644 src/renderers/python/pytest-renderer.ts create mode 100644 src/renderers/types.ts create mode 100644 src/renderers/typescript/assertions.ts create mode 100644 src/renderers/typescript/vitest-renderer.ts diff --git a/package-lock.json b/package-lock.json index f789727..537c310 100644 --- a/package-lock.json +++ 
b/package-lock.json @@ -10,9 +10,12 @@ "license": "MIT", "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.4", + "@anthropic-ai/sdk": "^0.39.0", "commander": "^12.1.0", "glob": "^10.4.0", + "handlebars": "^4.7.8", "inquirer": "^9.2.0", + "js-yaml": "^4.1.0", "tree-sitter": "^0.21.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.21.0" @@ -22,6 +25,7 @@ }, "devDependencies": { "@types/inquirer": "^9.0.7", + "@types/js-yaml": "^4.0.9", "@types/node": "^20.14.0", "eslint": "^8.57.0", "typescript": "^5.4.5", @@ -53,6 +57,36 @@ "zod": "^4.0.0" } }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.39.0", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.39.0.tgz", + "integrity": "sha512-eMyDIPRZbt1CCLErRCi3exlAvNkBtRe+kW5vvJyef93PmNr/clstYgHhtvmkxN82nlKgzyGPCyGxrm0JQ1ZIdg==", + "license": "MIT", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + } + }, + "node_modules/@anthropic-ai/sdk/node_modules/@types/node": { + "version": "18.19.130", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.130.tgz", + "integrity": "sha512-GRaXQx6jGfL8sKfaIDD6OupbIHBr9jv7Jnaml9tB7l4v068PAOXqfcujMMo5PhbIs6ggR1XODELqahT2R8v0fg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@anthropic-ai/sdk/node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", @@ -1399,16 +1433,32 @@ "rxjs": "^7.2.0" } }, + "node_modules/@types/js-yaml": { + "version": "4.0.9", 
+ "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz", + "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/node": { "version": "20.19.28", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.28.tgz", "integrity": "sha512-VyKBr25BuFDzBFCK5sUM6ZXiWfqgCTwTAOK8qzGV/m9FCirXYDlmczJ+d5dXBAQALGCdRRdbteKYfJ84NGEusw==", - "devOptional": true, "license": "MIT", "dependencies": { "undici-types": "~6.21.0" } }, + "node_modules/@types/node-fetch": { + "version": "2.6.13", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.13.tgz", + "integrity": "sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.4" + } + }, "node_modules/@types/through": { "version": "0.0.33", "resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz", @@ -1529,6 +1579,18 @@ "url": "https://opencollective.com/vitest" } }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, "node_modules/acorn": { "version": "8.15.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", @@ -1565,6 +1627,18 @@ "node": ">=0.4.0" } }, + "node_modules/agentkeepalive": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz", + "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + 
"engines": { + "node": ">= 8.0.0" + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -1637,7 +1711,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", - "dev": true, "license": "Python-2.0" }, "node_modules/assertion-error": { @@ -1650,6 +1723,12 @@ "node": "*" } }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -1732,6 +1811,19 @@ "node": ">=8" } }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -1856,6 +1948,18 @@ "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", "license": "MIT" }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 
0.8" + } + }, "node_modules/commander": { "version": "12.1.0", "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz", @@ -1943,6 +2047,15 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/diff-sequences": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", @@ -1966,6 +2079,20 @@ "node": ">=6.0.0" } }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -1978,6 +2105,51 @@ "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", "license": "MIT" }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": 
"sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/esbuild": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", @@ -2191,6 +2363,15 @@ "node": ">=0.10.0" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/execa": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/execa/-/execa-8.0.1.tgz", @@ -2343,6 +2524,41 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": 
"^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -2365,6 +2581,15 @@ "node": "^8.16.0 || ^10.6.0 || >=11.0.0" } }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/get-func-name": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", @@ -2375,6 +2600,43 @@ "node": "*" } }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + 
"function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/get-stream": { "version": "8.0.1", "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-8.0.1.tgz", @@ -2461,6 +2723,18 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/graphemer": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", @@ -2468,6 +2742,27 @@ "dev": true, "license": "MIT" }, + "node_modules/handlebars": { + "version": "4.7.8", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz", + "integrity": "sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==", + "license": "MIT", + "dependencies": { + "minimist": "^1.2.5", + "neo-async": "^2.6.2", + "source-map": "^0.6.1", + "wordwrap": "^1.0.0" + }, + "bin": { + "handlebars": "bin/handlebars" + }, + "engines": { + "node": ">=0.4.7" + }, + "optionalDependencies": { + "uglify-js": "^3.1.4" + } + }, "node_modules/has-flag": { "version": "4.0.0", "resolved": 
"https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", @@ -2477,6 +2772,45 @@ "node": ">=8" } }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/human-signals": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-5.0.0.tgz", @@ -2487,6 +2821,15 @@ "node": ">=16.17.0" } }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, "node_modules/iconv-lite": { "version": "0.7.2", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", @@ -2709,7 +3052,6 @@ "version": "4.1.1", "resolved": 
"https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", - "dev": true, "license": "MIT", "dependencies": { "argparse": "^2.0.1" @@ -2845,6 +3187,15 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", @@ -2852,6 +3203,27 @@ "dev": true, "license": "MIT" }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, "node_modules/mimic-fn": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", @@ -2874,6 +3246,15 @@ "node": "*" } }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/minipass": { 
"version": "7.1.2", "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", @@ -2907,7 +3288,6 @@ "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "dev": true, "license": "MIT" }, "node_modules/mute-stream": { @@ -2945,6 +3325,12 @@ "dev": true, "license": "MIT" }, + "node_modules/neo-async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==", + "license": "MIT" + }, "node_modules/node-addon-api": { "version": "8.5.0", "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.5.0.tgz", @@ -2954,6 +3340,46 @@ "node": "^18 || ^20 || >= 21" } }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, "node_modules/node-gyp-build": { "version": "4.8.4", 
"resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz", @@ -3550,6 +3976,15 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/source-map-js": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", @@ -3722,6 +4157,12 @@ "node": ">=14.0.0" } }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, "node_modules/tree-sitter": { "version": "0.21.1", "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.21.1.tgz", @@ -3840,11 +4281,23 @@ "dev": true, "license": "MIT" }, + "node_modules/uglify-js": { + "version": "3.19.3", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.19.3.tgz", + "integrity": "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ==", + "license": "BSD-2-Clause", + "optional": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", - "devOptional": true, "license": "MIT" }, "node_modules/uri-js": { @@ -4021,6 +4474,31 @@ "defaults": "^1.0.3" } }, + "node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": 
"https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -4063,6 +4541,12 @@ "node": ">=0.10.0" } }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==", + "license": "MIT" + }, "node_modules/wrap-ansi": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", diff --git a/package.json b/package.json index 1268052..5ad3602 100644 --- a/package.json +++ b/package.json @@ -27,15 +27,19 @@ "license": "MIT", "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.4", + "@anthropic-ai/sdk": "^0.39.0", "commander": "^12.1.0", "glob": "^10.4.0", + "handlebars": "^4.7.8", "inquirer": "^9.2.0", + "js-yaml": "^4.1.0", "tree-sitter": "^0.21.1", "tree-sitter-python": "^0.21.0", "tree-sitter-typescript": "^0.21.0" }, "devDependencies": { "@types/inquirer": "^9.0.7", + "@types/js-yaml": "^4.0.9", 
"@types/node": "^20.14.0", "eslint": "^8.57.0", "typescript": "^5.4.5", diff --git a/prompts/grader-system.md b/prompts/grader-system.md new file mode 100644 index 0000000..3e8e9c4 --- /dev/null +++ b/prompts/grader-system.md @@ -0,0 +1,53 @@ +# LLM Rubric Grader + +You are an expert evaluator with deep experience in code quality assessment. Your task is to grade output against a structured rubric with precision and consistency. + +## Your Role + +- You evaluate objectively against the criteria provided +- You provide actionable feedback that helps improve quality +- You score consistently—the same quality should always receive the same score +- You justify every score with specific evidence from the output + +## Evaluation Process + +1. **Read the rubric** — Understand each criterion, its weight, and what good/bad looks like +2. **Analyze the output** — Examine it thoroughly before scoring +3. **Score independently** — Rate each criterion without letting others influence it +4. **Cite evidence** — Every score must reference specific parts of the output +5. **Calculate overall** — Compute weighted average accurately + +## Scoring Scale + +| Score | Meaning | +|-------|---------| +| 0.0 | Complete failure, criterion not addressed | +| 0.1-0.3 | Major deficiencies, fundamental issues | +| 0.4-0.5 | Below expectations, significant gaps | +| 0.6-0.7 | Meets basic requirements, room for improvement | +| 0.8-0.9 | Exceeds expectations, minor issues only | +| 1.0 | Exemplary, no improvements needed | + +## Critical Rules + +- **Never score 1.0 unless truly perfect** — Reserve it for exceptional cases +- **Never score 0.0 unless completely absent** — Even poor attempts get some credit +- **Be specific in feedback** — "Could be better" is not helpful; "Variable name 'x' should describe its purpose" is +- **Consider context** — A quick script has different quality expectations than a library API + +## Output Format + +Return ONLY valid JSON. 
No markdown, no explanation outside the JSON. + +```json +{ + "scores": { + "criterion_name": { + "score": 0.0, + "feedback": "Specific, actionable feedback citing evidence" + } + }, + "overall": 0.0, + "summary": "One-sentence overall assessment" +} +``` diff --git a/prompts/grader-user.md b/prompts/grader-user.md new file mode 100644 index 0000000..36b76c9 --- /dev/null +++ b/prompts/grader-user.md @@ -0,0 +1,33 @@ +# Grading Request + +## Rubric: {{RUBRIC_NAME}} + +{{RUBRIC_DESCRIPTION}} + +**Passing Threshold:** {{PASSING_THRESHOLD}}% + +### Criteria + +{{CRITERIA_LIST}} + +--- + +## Output to Evaluate + +``` +{{OUTPUT}} +``` + +--- + +## Your Task + +1. Evaluate the output against each criterion above +2. Provide a score (0.0-1.0) and specific feedback for each +3. Calculate the weighted overall score +4. Return your assessment as JSON + +Remember: +- Cite specific evidence from the output for each score +- The overall score must equal the weighted average of criterion scores +- Feedback should be actionable and specific diff --git a/src/cli/commands/grade.ts b/src/cli/commands/grade.ts new file mode 100644 index 0000000..0a9f645 --- /dev/null +++ b/src/cli/commands/grade.ts @@ -0,0 +1,101 @@ +import { Command } from 'commander'; +import { readFileSync, existsSync } from 'fs'; +import { gradeWithRubric, loadAllRubrics, analyzeCalibration, calibrate } from '../../graders/index.js'; +import type { CalibrationExample } from '../../graders/types.js'; + +export const gradeCommand = new Command('grade') + .description('Grade output using LLM rubric') + .argument('', 'Path to input file or string to grade') + .option('-r, --rubric ', 'Rubric name or path', 'code-quality') + .option('--rubrics-dir ', 'Directory containing rubric YAML files', 'rubrics') + .option('--json', 'Output result as JSON', false) + .action(async (input: string, options) => { + try { + let content: string; + + if (existsSync(input)) { + content = readFileSync(input, 'utf-8'); + } else { + 
content = input; + } + + console.log(`Grading with rubric: ${options.rubric}`); + + const result = await gradeWithRubric(content, options.rubric, { + rubricsDir: options.rubricsDir, + }); + + if (options.json) { + console.log(JSON.stringify(result, null, 2)); + return; + } + + console.log(`\n${result.pass ? '✅ PASS' : '❌ FAIL'}`); + console.log(`Score: ${(result.score * 100).toFixed(1)}%`); + console.log(`\nSummary: ${result.reason}`); + + console.log('\nCriterion Scores:'); + for (const cs of result.criterionScores) { + const bar = '█'.repeat(Math.round(cs.score * 10)) + '░'.repeat(10 - Math.round(cs.score * 10)); + console.log(` ${cs.name}: ${bar} ${(cs.score * 100).toFixed(0)}%`); + console.log(` ${cs.feedback}`); + } + } catch (error) { + console.error('Error grading:', error instanceof Error ? error.message : error); + process.exit(1); + } + }); + +export const listRubricsCommand = new Command('rubrics') + .description('List available rubrics') + .option('--rubrics-dir ', 'Directory containing rubric YAML files', 'rubrics') + .action(async (options) => { + try { + const rubrics = loadAllRubrics(options.rubricsDir); + + if (rubrics.size === 0) { + console.log(`No rubrics found in ${options.rubricsDir}`); + return; + } + + console.log(`Available rubrics (${rubrics.size}):\n`); + + for (const [name, rubric] of rubrics) { + console.log(`📋 ${name}`); + console.log(` ${rubric.description}`); + console.log(` Threshold: ${(rubric.passingThreshold * 100).toFixed(0)}%`); + console.log(` Criteria: ${rubric.criteria.map(c => c.name).join(', ')}`); + console.log(''); + } + } catch (error) { + console.error('Error listing rubrics:', error instanceof Error ? 
error.message : error); + process.exit(1); + } + }); + +export const calibrateCommand = new Command('calibrate') + .description('Calibrate a rubric against known examples') + .argument('', 'Rubric name or path') + .argument('', 'Path to calibration examples JSON') + .option('--rubrics-dir ', 'Directory containing rubric YAML files', 'rubrics') + .action(async (rubricName: string, examplesPath: string, options) => { + try { + if (!existsSync(examplesPath)) { + console.error(`Examples file not found: ${examplesPath}`); + process.exit(1); + } + + const examples: CalibrationExample[] = JSON.parse(readFileSync(examplesPath, 'utf-8')); + + console.log(`Calibrating rubric '${rubricName}' with ${examples.length} examples...`); + + const result = await calibrate(rubricName, examples, { + rubricsDir: options.rubricsDir, + }); + + console.log('\n' + analyzeCalibration(result)); + } catch (error) { + console.error('Error calibrating:', error instanceof Error ? error.message : error); + process.exit(1); + } + }); diff --git a/src/cli/commands/render.ts b/src/cli/commands/render.ts new file mode 100644 index 0000000..300c01e --- /dev/null +++ b/src/cli/commands/render.ts @@ -0,0 +1,61 @@ +import { Command } from 'commander'; +import { readFileSync, existsSync } from 'fs'; +import { renderSpec, detectFramework, type Framework } from '../../renderers/index.js'; +import type { EvalSpec } from '../../analyzer/types.js'; + +export const renderCommand = new Command('render') + .description('Render EvalSpec JSON into runnable test files') + .argument('', 'Path to EvalSpec JSON file') + .option('-o, --output ', 'Output directory for test files', './tests/generated') + .option('-f, --framework ', 'Test framework (pytest, vitest, jest)') + .option('--fixtures', 'Generate fixture stubs', false) + .option('--mocks', 'Generate mock stubs', false) + .option('--dry-run', 'Preview without writing files', false) + .action(async (specPath: string, options) => { + try { + if 
(!existsSync(specPath)) { + console.error(`Error: Spec file not found: ${specPath}`); + process.exit(1); + } + + const specContent = readFileSync(specPath, 'utf-8'); + const spec: EvalSpec = JSON.parse(specContent); + + const framework = (options.framework as Framework) || detectFramework(spec); + + console.log(`Rendering ${spec.scenarios.length} scenarios with ${framework}...`); + + const result = await renderSpec(spec, { + outputDir: options.output, + framework, + includeFixtures: options.fixtures, + generateMocks: options.mocks, + dryRun: options.dryRun, + }); + + if (options.dryRun) { + console.log('\n--- DRY RUN ---\n'); + for (const file of result.files) { + console.log(`📄 ${file.path}`); + console.log('---'); + console.log(file.content); + console.log('---\n'); + } + } + + console.log(`\n✅ Rendered ${result.stats.scenarioCount} scenarios`); + console.log(` 📁 ${result.stats.fileCount} test files`); + console.log(` 🔍 ${result.stats.assertionCount} assertions`); + + if (result.stats.skippedCount > 0) { + console.log(` ⏭️ ${result.stats.skippedCount} scenarios skipped (LLM rubric assertions)`); + } + + if (!options.dryRun) { + console.log(`\n📂 Output: ${options.output}`); + } + } catch (error) { + console.error('Error rendering spec:', error instanceof Error ? 
/**
 * Grade every calibration example with a rubric and measure how closely the
 * grader's scores track the expected scores.
 *
 * @param rubricNameOrDef - A rubric name/path (resolved with `loadRubric` and
 *   `options.rubricsDir`) or an already-loaded rubric object.
 * @param examples - Inputs paired with the score the grader is expected to give.
 * @param options - Grader options, forwarded unchanged to `gradeWithRubric`.
 * @returns The agreement rate (share of examples whose score differs from the
 *   expectation by less than `AGREEMENT_THRESHOLD`), the per-example drift,
 *   whether the rubric needs adjustment, and the per-example details.
 */
export async function calibrate(
  rubricNameOrDef: string | Rubric,
  examples: CalibrationExample[],
  options?: GraderOptions
): Promise<CalibrationResult> {
  const rubric = typeof rubricNameOrDef === 'string'
    ? loadRubric(rubricNameOrDef, options?.rubricsDir)
    : rubricNameOrDef;

  // Examples are graded concurrently; a single rejected grading call rejects
  // the whole calibration.
  const results = await Promise.all(
    examples.map(async (ex) => {
      const result = await gradeWithRubric(ex.input, rubric, options);
      return {
        example: ex,
        actualScore: result.score,
        difference: result.score - ex.expectedScore,
      };
    })
  );

  const withinThreshold = results.filter(r =>
    Math.abs(r.difference) < AGREEMENT_THRESHOLD
  );

  // With no examples the ratio would be 0 / 0 = NaN, and `NaN < MIN_AGREEMENT_RATE`
  // is false, so an empty set used to be reported as "calibrated".
  // No evidence is treated as zero agreement instead.
  const agreement = results.length === 0
    ? 0
    : withinThreshold.length / results.length;
  const drift = results.map(r => r.difference);

  return {
    agreement,
    drift,
    needsAdjustment: agreement < MIN_AGREEMENT_RATE,
    details: results,
  };
}
'+' : ''}${detail.difference.toFixed(3)}`); + } + + return lines.join('\n'); +} diff --git a/src/graders/llm/grader.ts b/src/graders/llm/grader.ts new file mode 100644 index 0000000..a9cb2ce --- /dev/null +++ b/src/graders/llm/grader.ts @@ -0,0 +1,98 @@ +import Anthropic from '@anthropic-ai/sdk'; +import type { Rubric, RubricGradingResult, GraderOptions, CriterionScore } from '../types.js'; +import { loadRubric } from './rubric-loader.js'; +import { buildGraderSystemPrompt, buildGraderUserPrompt } from './prompt-builder.js'; + +const DEFAULT_MODEL = 'claude-sonnet-4-20250514'; +const DEFAULT_MAX_TOKENS = 1024; + +interface GradingResponse { + scores: Record; + overall: number; + summary: string; +} + +export class LLMGrader { + private client: Anthropic; + private options: Required; + + constructor(options: GraderOptions = {}) { + this.client = new Anthropic(); + this.options = { + model: options.model || DEFAULT_MODEL, + maxTokens: options.maxTokens || DEFAULT_MAX_TOKENS, + rubricsDir: options.rubricsDir || 'rubrics', + }; + } + + async grade(output: string, rubricNameOrDef: string | Rubric): Promise { + const rubric = typeof rubricNameOrDef === 'string' + ? loadRubric(rubricNameOrDef, this.options.rubricsDir) + : rubricNameOrDef; + + const systemPrompt = buildGraderSystemPrompt(); + const userPrompt = buildGraderUserPrompt(output, rubric); + + const response = await this.client.messages.create({ + model: this.options.model, + max_tokens: this.options.maxTokens, + system: systemPrompt, + messages: [{ role: 'user', content: userPrompt }], + }); + + const responseText = response.content[0].type === 'text' + ? 
response.content[0].text + : ''; + + const parsed = this.parseResponse(responseText); + + return this.buildResult(parsed, rubric); + } + + private parseResponse(text: string): GradingResponse { + const jsonMatch = text.match(/\{[\s\S]*\}/); + if (!jsonMatch) { + throw new Error('Failed to extract JSON from grader response'); + } + + try { + return JSON.parse(jsonMatch[0]) as GradingResponse; + } catch (e) { + throw new Error(`Failed to parse grader response as JSON: ${e}`); + } + } + + private buildResult(parsed: GradingResponse, rubric: Rubric): RubricGradingResult { + const criterionScores: CriterionScore[] = rubric.criteria.map(c => { + const score = parsed.scores[c.name]; + return { + name: c.name, + score: score?.score ?? 0, + feedback: score?.feedback ?? 'No feedback provided', + }; + }); + + const weightedScore = rubric.criteria.reduce((sum, c) => { + const criterionScore = parsed.scores[c.name]?.score ?? 0; + return sum + criterionScore * c.weight; + }, 0); + + const finalScore = parsed.overall ?? 
/**
 * Build the user prompt for the grader by filling the `grader-user` template
 * with the rubric details and the output to be graded.
 *
 * Placeholders (`{{RUBRIC_NAME}}`, `{{RUBRIC_DESCRIPTION}}`,
 * `{{PASSING_THRESHOLD}}`, `{{CRITERIA_LIST}}`, `{{OUTPUT}}`) are substituted in
 * a single pass over the template, so text inserted for one placeholder is
 * never re-scanned for another. Every occurrence of a placeholder is replaced.
 *
 * @param output - The text being graded; inserted verbatim.
 * @param rubric - The rubric whose name, description, threshold and criteria
 *   are rendered into the prompt.
 * @returns The fully substituted prompt text.
 */
export function buildGraderUserPrompt(output: string, rubric: Rubric): string {
  const template = loadPrompt('grader-user');

  const criteriaList = rubric.criteria.map(c => {
    let entry = `#### ${c.name} (weight: ${(c.weight * 100).toFixed(0)}%)\n\n${c.description}`;

    if (c.examples) {
      entry += `\n\n**Good example:** ${c.examples.good}`;
      entry += `\n**Bad example:** ${c.examples.bad}`;
    }

    return entry;
  }).join('\n\n');

  const substitutions = new Map<string, string>([
    ['RUBRIC_NAME', rubric.name],
    ['RUBRIC_DESCRIPTION', rubric.description],
    ['PASSING_THRESHOLD', String(Math.round(rubric.passingThreshold * 100))],
    ['CRITERIA_LIST', criteriaList],
    ['OUTPUT', output],
  ]);

  // A replacer function is used on purpose: with a plain string replacement,
  // `$&`, `$1`, `$$`, "$`" and `$'` inside the inserted text are interpreted as
  // replacement patterns, which corrupts graded output that contains `$`
  // (shell snippets, template strings, prices, ...).
  return template.replace(
    /\{\{([A-Z_]+)\}\}/g,
    (placeholder: string, key: string) => substitutions.get(key) ?? placeholder
  );
}
/**
 * Validate a single rubric criterion parsed from untrusted YAML.
 *
 * @param criterion - The raw value to check.
 * @throws Error if the value is not an object, lacks a string `name` or
 *   `description`, or its `weight` is not a finite number in the range [0, 1].
 */
function validateCriterion(criterion: unknown): void {
  if (!criterion || typeof criterion !== 'object') {
    throw new Error('Criterion must be an object');
  }

  const c = criterion as Record<string, unknown>;

  if (typeof c.name !== 'string') {
    throw new Error('Criterion must have a name');
  }
  if (typeof c.description !== 'string') {
    throw new Error('Criterion must have a description');
  }

  const weight = c.weight;
  // NaN is a `number` and fails neither `< 0` nor `> 1`, so it has to be
  // rejected explicitly; otherwise it would poison every weighted score.
  if (typeof weight !== 'number' || !Number.isFinite(weight) || weight < 0 || weight > 1) {
    throw new Error('Criterion must have a weight between 0 and 1');
  }
}
function formatRubricForPrompt(rubric: Rubric): string { + let prompt = `# ${rubric.name}\n\n${rubric.description}\n\nPassing threshold: ${rubric.passingThreshold * 100}%\n\n## Criteria\n\n`; + + for (const criterion of rubric.criteria) { + prompt += `### ${criterion.name} (weight: ${criterion.weight * 100}%)\n`; + prompt += `${criterion.description}\n`; + + if (criterion.examples) { + prompt += `\n**Good example:** ${criterion.examples.good}\n`; + prompt += `**Bad example:** ${criterion.examples.bad}\n`; + } + prompt += '\n'; + } + + return prompt; +} diff --git a/src/graders/types.ts b/src/graders/types.ts new file mode 100644 index 0000000..6e6ad2d --- /dev/null +++ b/src/graders/types.ts @@ -0,0 +1,63 @@ +export interface Rubric { + name: string; + description: string; + criteria: RubricCriterion[]; + passingThreshold: number; +} + +export interface RubricCriterion { + name: string; + description: string; + weight: number; + examples?: { + good: string; + bad: string; + }; +} + +export interface RubricGradingResult { + pass: boolean; + score: number; + reason: string; + criterionScores: CriterionScore[]; +} + +export interface CriterionScore { + name: string; + score: number; + feedback: string; +} + +export interface GradeRequest { + output: string; + rubric: string | Rubric; + context?: Record; +} + +export interface GraderOptions { + model?: string; + maxTokens?: number; + rubricsDir?: string; +} + +export interface CalibrationExample { + input: string; + expectedScore: number; + expectedFeedback?: string[]; +} + +export interface CalibrationSet { + rubric: string; + examples: CalibrationExample[]; +} + +export interface CalibrationResult { + agreement: number; + drift: number[]; + needsAdjustment: boolean; + details: { + example: CalibrationExample; + actualScore: number; + difference: number; + }[]; +} diff --git a/src/index.ts b/src/index.ts index bbb7b12..225f0ef 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,2 +1,4 @@ export * from 
'./introspector/index.js'; export * from './analyzer/index.js'; +export * from './renderers/index.js'; +export * from './graders/index.js'; diff --git a/src/renderers/base.ts b/src/renderers/base.ts new file mode 100644 index 0000000..3254568 --- /dev/null +++ b/src/renderers/base.ts @@ -0,0 +1,96 @@ +import type { EvalSpec, EvalScenario } from '../analyzer/types.js'; +import type { RenderOptions, RenderResult, GeneratedFile, ScenarioGroup, RenderStats } from './types.js'; + +export abstract class BaseRenderer { + protected options: RenderOptions; + + constructor(options: RenderOptions) { + this.options = options; + } + + abstract get language(): 'python' | 'typescript'; + abstract get fileExtension(): string; + + async render(spec: EvalSpec): Promise { + const groups = this.groupByModule(spec.scenarios); + const files: GeneratedFile[] = []; + let assertionCount = 0; + let skippedCount = 0; + + for (const group of groups) { + const validScenarios = group.scenarios.filter(s => this.canRender(s)); + skippedCount += group.scenarios.length - validScenarios.length; + + if (validScenarios.length === 0) continue; + + const content = this.renderTestFile(group.module, validScenarios); + const path = this.getOutputPath(group.module); + + assertionCount += validScenarios.reduce((sum, s) => sum + s.assertions.length, 0); + + files.push({ + path, + content, + scenarios: validScenarios.map(s => s.id), + language: this.language, + }); + } + + const stats: RenderStats = { + scenarioCount: spec.scenarios.length - skippedCount, + fileCount: files.length, + assertionCount, + skippedCount, + }; + + return { files, stats }; + } + + protected groupByModule(scenarios: EvalScenario[]): ScenarioGroup[] { + const groups = new Map(); + + for (const scenario of scenarios) { + const module = scenario.target.module; + if (!groups.has(module)) { + groups.set(module, []); + } + groups.get(module)!.push(scenario); + } + + return Array.from(groups.entries()).map(([module, scenarios]) => ({ + 
module, + scenarios, + })); + } + + protected canRender(scenario: EvalScenario): boolean { + return scenario.assertions.every(a => a.type !== 'llm-rubric'); + } + + protected getOutputPath(modulePath: string): string { + const baseName = modulePath + .replace(/\.(py|ts|tsx|js|jsx)$/, '') + .replace(/\//g, '_'); + return `${this.options.outputDir}/test_${baseName}${this.fileExtension}`; + } + + protected abstract renderTestFile(module: string, scenarios: EvalScenario[]): string; + + protected abstract renderAssertion(assertion: EvalScenario['assertions'][0], resultVar: string): string; + + protected toSnakeCase(str: string): string { + return str.replace(/-/g, '_').replace(/([A-Z])/g, '_$1').toLowerCase(); + } + + protected toCamelCase(str: string): string { + return str.replace(/-([a-z])/g, (_, c) => c.toUpperCase()); + } + + protected formatValue(value: unknown): string { + if (value === null) return 'null'; + if (value === undefined) return 'undefined'; + if (typeof value === 'string') return JSON.stringify(value); + if (typeof value === 'number' || typeof value === 'boolean') return String(value); + return JSON.stringify(value); + } +} diff --git a/src/renderers/index.ts b/src/renderers/index.ts new file mode 100644 index 0000000..bb6ac31 --- /dev/null +++ b/src/renderers/index.ts @@ -0,0 +1,67 @@ +import type { EvalSpec } from '../analyzer/types.js'; +import type { RenderOptions, RenderResult, Framework, GeneratedFile } from './types.js'; +import { BaseRenderer } from './base.js'; +import { PytestRenderer } from './python/pytest-renderer.js'; +import { VitestRenderer, JestRenderer } from './typescript/vitest-renderer.js'; +import { writeFileSync, mkdirSync } from 'fs'; +import { dirname } from 'path'; + +export type { RenderOptions, RenderResult, GeneratedFile, Framework } from './types.js'; +export { BaseRenderer } from './base.js'; +export { PytestRenderer } from './python/pytest-renderer.js'; +export { VitestRenderer, JestRenderer } from 
'./typescript/vitest-renderer.js'; + +const rendererRegistry: Record BaseRenderer> = { + pytest: PytestRenderer, + vitest: VitestRenderer, + jest: JestRenderer, +}; + +export function createRenderer(options: RenderOptions): BaseRenderer { + const RendererClass = rendererRegistry[options.framework]; + if (!RendererClass) { + throw new Error(`Unknown framework: ${options.framework}`); + } + return new RendererClass(options); +} + +export async function renderSpec(spec: EvalSpec, options: RenderOptions): Promise { + const renderer = createRenderer(options); + const result = await renderer.render(spec); + + if (!options.dryRun) { + for (const file of result.files) { + mkdirSync(dirname(file.path), { recursive: true }); + writeFileSync(file.path, file.content, 'utf-8'); + } + } + + return result; +} + +export async function renderIncremental( + spec: EvalSpec, + options: RenderOptions, + changedFiles: string[] +): Promise { + const filteredSpec: EvalSpec = { + ...spec, + scenarios: spec.scenarios.filter(s => + changedFiles.some(f => s.target.module.includes(f)) + ), + }; + return renderSpec(filteredSpec, options); +} + +export function detectFramework(spec: EvalSpec): Framework { + const languages = spec.repo.languages; + + if (languages.includes('python')) { + return 'pytest'; + } + if (languages.includes('typescript') || languages.includes('javascript')) { + return 'vitest'; + } + + return 'vitest'; +} diff --git a/src/renderers/python/assertions.ts b/src/renderers/python/assertions.ts new file mode 100644 index 0000000..6b48067 --- /dev/null +++ b/src/renderers/python/assertions.ts @@ -0,0 +1,104 @@ +import type { Assertion } from '../../analyzer/types.js'; + +export function renderPytestAssertion(assertion: Assertion, resultVar: string): string { + const target = getTargetExpression(assertion, resultVar); + + switch (assertion.type) { + case 'equals': + return `assert ${target} == ${formatPythonValue(assertion.expected)}`; + + case 'contains': + return `assert 
/**
 * Render a JSON-like JavaScript value as Python source text.
 *
 * - `null` / `undefined` become `None`; booleans become `True` / `False`.
 * - Strings and dict keys are emitted through `JSON.stringify`, so embedded
 *   quotes, backslashes and control characters are escaped.
 * - Non-finite numbers become `float("nan")`, `float("inf")` or
 *   `float("-inf")`, because bare `NaN` / `Infinity` are not Python literals.
 * - Arrays become lists and plain objects become dicts, recursively.
 *
 * @param value - The value to render.
 * @returns Python source text for the value.
 */
function formatPythonValue(value: unknown): string {
  if (value === null || value === undefined) return 'None';
  if (typeof value === 'boolean') return value ? 'True' : 'False';
  if (typeof value === 'string') return JSON.stringify(value);
  if (typeof value === 'number') {
    if (Number.isNaN(value)) return 'float("nan")';
    if (!Number.isFinite(value)) return value > 0 ? 'float("inf")' : 'float("-inf")';
    return String(value);
  }
  if (Array.isArray(value)) {
    return `[${value.map(item => formatPythonValue(item)).join(', ')}]`;
  }
  if (typeof value === 'object') {
    const entries = Object.entries(value)
      // Keys are escaped like any other string; interpolating them between raw
      // quotes produced broken Python for keys containing `"` or `\`.
      .map(([k, v]) => `${JSON.stringify(k)}: ${formatPythonValue(v)}`)
      .join(', ');
    return `{${entries}}`;
  }
  return String(value);
}
PytestRenderer extends BaseRenderer { + get language(): 'python' { + return 'python'; + } + + get fileExtension(): string { + return '.py'; + } + + protected renderTestFile(module: string, scenarios: EvalScenario[]): string { + const imports = this.generateImports(module, scenarios); + const fixtures = this.options.includeFixtures ? this.generateFixtures(scenarios) : ''; + const tests = scenarios.map(s => this.renderTest(s)).join('\n\n'); + + return `${imports}\n\n${fixtures}${tests}\n`; + } + + private generateImports(module: string, scenarios: EvalScenario[]): string { + const imports: string[] = [ + 'import pytest', + ]; + + const hasRegex = scenarios.some(s => + s.assertions.some(a => a.type === 'matches') + ); + if (hasRegex) { + imports.push('import re'); + } + + const hasMocks = scenarios.some(s => s.setup?.mocks?.length); + if (hasMocks || this.options.generateMocks) { + imports.push('from unittest.mock import patch, MagicMock'); + } + + const functions = [...new Set(scenarios.map(s => s.target.function))]; + const modulePath = module.replace(/\.(py|ts|tsx|js|jsx)$/, '').replace(/\//g, '.'); + imports.push(`from ${modulePath} import ${functions.join(', ')}`); + + return imports.join('\n'); + } + + private generateFixtures(scenarios: EvalScenario[]): string { + const fixtureNames = new Set(); + for (const scenario of scenarios) { + if (scenario.setup?.fixtures) { + for (const fixture of scenario.setup.fixtures) { + fixtureNames.add(fixture); + } + } + } + + if (fixtureNames.size === 0) return ''; + + const fixtures = Array.from(fixtureNames).map(name => ` +@pytest.fixture +def ${name}(): + # TODO: Implement fixture + pass +`).join('\n'); + + return fixtures + '\n'; + } + + private renderTest(scenario: EvalScenario): string { + const testName = `test_${this.toSnakeCase(scenario.id)}`; + const docstring = scenario.description ? 
` """${scenario.description}"""\n` : ''; + + const throwsAssertion = scenario.assertions.find(a => a.type === 'throws'); + const regularAssertions = scenario.assertions.filter(a => a.type !== 'throws'); + + const fixtureParams = scenario.setup?.fixtures?.join(', ') || ''; + const funcParams = fixtureParams ? `(${fixtureParams})` : '()'; + + let body: string; + if (throwsAssertion) { + body = this.renderThrowsTest(scenario, throwsAssertion); + } else { + body = this.renderRegularTest(scenario, regularAssertions); + } + + const mocks = this.renderMocks(scenario); + if (mocks) { + body = mocks.decorators + `def ${testName}${funcParams}:\n${docstring}${mocks.setup}${body}`; + } else { + body = `def ${testName}${funcParams}:\n${docstring}${body}`; + } + + return body; + } + + private renderRegularTest(scenario: EvalScenario, assertions: Assertion[]): string { + const funcCall = this.renderFunctionCall(scenario); + const assertionLines = assertions + .map(a => this.renderAssertion(a, 'result')) + .filter(Boolean) + .map(a => ` ${a}`) + .join('\n'); + + return ` result = ${funcCall}\n${assertionLines}`; + } + + private renderThrowsTest(scenario: EvalScenario, throwsAssertion: Assertion): string { + const ctx = renderThrowsContext(throwsAssertion); + if (!ctx) return this.renderRegularTest(scenario, scenario.assertions); + + const funcCall = this.renderFunctionCall(scenario); + return ` with ${ctx.contextManager}:\n ${funcCall}`; + } + + private renderFunctionCall(scenario: EvalScenario): string { + const func = scenario.target.function; + const args = formatPythonArgs(scenario.input.args); + const kwargs = scenario.input.kwargs ? 
formatPythonKwargs(scenario.input.kwargs) : ''; + + const allArgs = [args, kwargs].filter(Boolean).join(', '); + return `${func}(${allArgs})`; + } + + private renderMocks(scenario: EvalScenario): { decorators: string; setup: string } | null { + if (!scenario.setup?.mocks?.length && !this.options.generateMocks) return null; + + const mocks = scenario.setup?.mocks || []; + if (mocks.length === 0) return null; + + const decorators = mocks + .map((m, i) => `@patch("${m.target}")\n`) + .join(''); + + const setup = mocks + .map((m, i) => { + const mockName = `mock_${i}`; + if (m.returnValue !== undefined) { + return ` ${mockName}.return_value = ${this.formatValue(m.returnValue)}\n`; + } + if (m.sideEffect) { + return ` ${mockName}.side_effect = ${m.sideEffect}\n`; + } + return ''; + }) + .join(''); + + return { decorators, setup }; + } + + protected renderAssertion(assertion: Assertion, resultVar: string): string { + return renderPytestAssertion(assertion, resultVar); + } + + protected formatValue(value: unknown): string { + if (value === null) return 'None'; + if (value === undefined) return 'None'; + if (typeof value === 'boolean') return value ? 
'True' : 'False'; + if (typeof value === 'string') return JSON.stringify(value); + return String(value); + } +} diff --git a/src/renderers/types.ts b/src/renderers/types.ts new file mode 100644 index 0000000..037a9b2 --- /dev/null +++ b/src/renderers/types.ts @@ -0,0 +1,40 @@ +import type { EvalSpec, EvalScenario } from '../analyzer/types.js'; + +export type Framework = 'pytest' | 'vitest' | 'jest'; + +export interface RenderOptions { + outputDir: string; + framework: Framework; + includeFixtures: boolean; + generateMocks: boolean; + dryRun?: boolean; +} + +export interface RenderResult { + files: GeneratedFile[]; + stats: RenderStats; +} + +export interface RenderStats { + scenarioCount: number; + fileCount: number; + assertionCount: number; + skippedCount: number; +} + +export interface GeneratedFile { + path: string; + content: string; + scenarios: string[]; + language: 'python' | 'typescript'; +} + +export interface RendererContext { + spec: EvalSpec; + options: RenderOptions; +} + +export interface ScenarioGroup { + module: string; + scenarios: EvalScenario[]; +} diff --git a/src/renderers/typescript/assertions.ts b/src/renderers/typescript/assertions.ts new file mode 100644 index 0000000..bdd13d9 --- /dev/null +++ b/src/renderers/typescript/assertions.ts @@ -0,0 +1,114 @@ +import type { Assertion } from '../../analyzer/types.js'; + +export type TSFramework = 'vitest' | 'jest'; + +export function renderTSAssertion(assertion: Assertion, resultVar: string, framework: TSFramework): string { + const target = getTargetExpression(assertion, resultVar); + + switch (assertion.type) { + case 'equals': + if (typeof assertion.expected === 'object' && assertion.expected !== null) { + return `expect(${target}).toEqual(${formatTSValue(assertion.expected)});`; + } + return `expect(${target}).toBe(${formatTSValue(assertion.expected)});`; + + case 'contains': + return `expect(${target}).toContain(${formatTSValue(assertion.value)});`; + + case 'typeof': + return 
/**
 * Render a JSON-like JavaScript value as TypeScript source text.
 *
 * Strings go through `JSON.stringify`; numbers and booleans through `String`;
 * arrays and plain objects are rendered recursively. Object keys that are not
 * plain identifiers (for example `content-type` or `1st`) are emitted as
 * quoted string keys so the generated object literal stays syntactically valid.
 *
 * @param value - The value to render.
 * @returns TypeScript source text for the value.
 */
function formatTSValue(value: unknown): string {
  if (value === null) return 'null';
  if (value === undefined) return 'undefined';
  if (typeof value === 'string') return JSON.stringify(value);
  if (typeof value === 'number' || typeof value === 'boolean') return String(value);
  if (Array.isArray(value)) {
    return `[${value.map(item => formatTSValue(item)).join(', ')}]`;
  }
  if (typeof value === 'object') {
    const entries = Object.entries(value)
      .map(([k, v]) => {
        // Identifier-like keys stay bare (unchanged output for the common
        // case); anything else must be quoted to be a legal property name.
        const key = /^[A-Za-z_$][A-Za-z0-9_$]*$/.test(k) ? k : JSON.stringify(k);
        return `${key}: ${formatTSValue(v)}`;
      })
      .join(', ');
    return `{ ${entries} }`;
  }
  return String(value);
}
Set(scenarios.map(s => s.target.function))]; + const modulePath = module.replace(/\.(ts|tsx|js|jsx)$/, ''); + imports.push(`import { ${functions.join(', ')} } from '${modulePath}';`); + + return imports.join('\n'); + } + + protected generateDescribe(module: string, scenarios: EvalScenario[]): string { + const moduleName = module.split('/').pop()?.replace(/\.(ts|tsx|js|jsx)$/, '') || module; + const tests = scenarios.map(s => this.renderTest(s)).join('\n\n'); + return `describe('${moduleName}', () => {\n${tests}\n});`; + } + + protected renderTest(scenario: EvalScenario): string { + const testName = scenario.name || scenario.id; + const isAsync = this.hasAsyncTarget(scenario); + const asyncPrefix = isAsync ? 'async ' : ''; + + const throwsAssertion = scenario.assertions.find(a => a.type === 'throws'); + const regularAssertions = scenario.assertions.filter(a => a.type !== 'throws'); + + let body: string; + if (throwsAssertion) { + body = this.renderThrowsTest(scenario, throwsAssertion, isAsync); + } else { + body = this.renderRegularTest(scenario, regularAssertions, isAsync); + } + + const mocks = this.renderMocks(scenario); + const mockSetup = mocks ? `\n${mocks}` : ''; + + return ` it('${testName}', ${asyncPrefix}() => {${mockSetup} +${body} + });`; + } + + protected hasAsyncTarget(scenario: EvalScenario): boolean { + return scenario.target.type === 'function' && + (scenario.target.function.startsWith('async') || + scenario.tags?.includes('async')); + } + + protected renderRegularTest(scenario: EvalScenario, assertions: Assertion[], isAsync: boolean): string { + const funcCall = this.renderFunctionCall(scenario); + const awaitPrefix = isAsync ? 
'await ' : ''; + + const assertionLines = assertions + .map(a => this.renderAssertion(a, 'result')) + .filter(Boolean) + .map(a => ` ${a}`) + .join('\n'); + + return ` const result = ${awaitPrefix}${funcCall};\n${assertionLines}`; + } + + protected renderThrowsTest(scenario: EvalScenario, throwsAssertion: Assertion, isAsync: boolean): string { + const funcCall = this.renderFunctionCall(scenario); + return ` ${renderThrowsExpectation(funcCall, throwsAssertion, isAsync)}`; + } + + protected renderFunctionCall(scenario: EvalScenario): string { + const func = scenario.target.function; + const args = formatTSArgs(scenario.input.args); + return `${func}(${args})`; + } + + protected renderMocks(scenario: EvalScenario): string | null { + if (!scenario.setup?.mocks?.length) return null; + + return scenario.setup.mocks + .map(m => { + if (m.returnValue !== undefined) { + return ` vi.mock('${m.target}', () => (${JSON.stringify(m.returnValue)}));`; + } + if (m.sideEffect) { + return ` vi.mock('${m.target}', () => { throw new Error('${m.sideEffect}'); });`; + } + return ` vi.mock('${m.target}');`; + }) + .join('\n'); + } + + protected renderAssertion(assertion: Assertion, resultVar: string): string { + return renderTSAssertion(assertion, resultVar, 'vitest'); + } +} + +export class JestRenderer extends VitestRenderer { + protected generateImports(module: string, scenarios: EvalScenario[]): string { + const imports: string[] = []; + + const hasMocks = scenarios.some(s => s.setup?.mocks?.length); + if (hasMocks || this.options.generateMocks) { + imports.push(`import { jest } from '@jest/globals';`); + } + + const functions = [...new Set(scenarios.map(s => s.target.function))]; + const modulePath = module.replace(/\.(ts|tsx|js|jsx)$/, ''); + imports.push(`import { ${functions.join(', ')} } from '${modulePath}';`); + + return imports.join('\n'); + } + + protected renderMocks(scenario: EvalScenario): string | null { + if (!scenario.setup?.mocks?.length) return null; + + return 
scenario.setup.mocks + .map(m => { + if (m.returnValue !== undefined) { + return ` jest.mock('${m.target}', () => (${JSON.stringify(m.returnValue)}));`; + } + if (m.sideEffect) { + return ` jest.mock('${m.target}', () => { throw new Error('${m.sideEffect}'); });`; + } + return ` jest.mock('${m.target}');`; + }) + .join('\n'); + } + + protected renderAssertion(assertion: Assertion, resultVar: string): string { + return renderTSAssertion(assertion, resultVar, 'jest'); + } +}