improvements and promptfoo

2026-04-15 09:01:15 +00:00 · 2026-01-11 20:02:30 -05:00 · 2026-01-11 20:02:30 -05:00 · ff5300f4e0
commit ff5300f4e0
parent 6698c12e5b
13 changed files with 1082 additions and 117 deletions
--- a/rubrics/code-quality.yaml
+++ b/rubrics/code-quality.yaml
@@ -0,0 +1,32 @@
+name: code-quality
+description: Evaluates generated code for quality and maintainability
+passingThreshold: 0.7  # NOTE(review): presumably min weighted score; confirm
+
+criteria:  # four criteria; their weights (0.3 + 0.4 + 0.2 + 0.1) sum to 1.0
+  - name: readability
+    weight: 0.3
+    description: Code is easy to read and understand
+    examples:
+      good: "Clear variable names, logical flow, proper indentation"
+      bad: "Single-letter variables, deeply nested logic, inconsistent style"
+
+  - name: correctness
+    weight: 0.4  # largest weight in this rubric
+    description: Code correctly implements the intended behavior
+    examples:
+      good: "Handles edge cases, correct algorithm, proper error handling"
+      bad: "Missing edge cases, off-by-one errors, swallowed exceptions"
+
+  - name: efficiency
+    weight: 0.2
+    description: Code uses appropriate data structures and algorithms
+    examples:
+      good: "O(n) where O(n) is optimal, avoids unnecessary allocations"
+      bad: "O(n²) when O(n) is possible, creates objects in tight loops"
+
+  - name: maintainability
+    weight: 0.1  # smallest weight in this rubric
+    description: Code is easy to modify and extend
+    examples:
+      good: "Single responsibility, low coupling, clear interfaces"
+      bad: "God functions, tight coupling, magic numbers"
--- a/rubrics/documentation.yaml
+++ b/rubrics/documentation.yaml
@@ -0,0 +1,32 @@
+name: documentation
+description: Evaluates quality of code documentation and docstrings
+passingThreshold: 0.65  # NOTE(review): presumably min weighted score; confirm
+
+criteria:  # four criteria; weights (0.35 + 0.35 + 0.2 + 0.1) sum to 1.0
+  - name: completeness
+    weight: 0.35
+    description: Documentation covers all parameters, return values, and exceptions
+    examples:
+      good: "Fully documents args, returns, raises, and includes usage example"
+      bad: "Missing parameter descriptions or return type"
+
+  - name: accuracy
+    weight: 0.35  # weighted equally with completeness
+    description: Documentation accurately describes the function's behavior
+    examples:
+      good: "Description matches implementation, types are correct"
+      bad: "Outdated docs that don't match current behavior"
+
+  - name: examples  # criterion name; distinct from the nested `examples` key
+    weight: 0.2
+    description: Includes helpful usage examples
+    examples:
+      good: "Shows common use cases with expected outputs"
+      bad: "No examples or only trivial ones"
+
+  - name: style
+    weight: 0.1  # smallest weight in this rubric
+    description: Follows project/language documentation conventions
+    examples:
+      good: "Uses standard docstring format (Google, NumPy, or reStructuredText)"
+      bad: "Inconsistent or non-standard format"
--- a/rubrics/error-messages.yaml
+++ b/rubrics/error-messages.yaml
@@ -0,0 +1,25 @@
+name: error-messages
+description: Evaluates quality of error messages
+passingThreshold: 0.6  # NOTE(review): presumably min weighted score; confirm
+
+criteria:  # three criteria; their weights (0.4 + 0.4 + 0.2) sum to 1.0
+  - name: clarity
+    weight: 0.4
+    description: Error message clearly explains what went wrong
+    examples:
+      good: "Invalid email format: 'not-an-email' is missing '@' symbol"
+      bad: "Error: validation failed"
+
+  - name: actionability
+    weight: 0.4  # weighted equally with clarity
+    description: Error message suggests how to fix the problem
+    examples:
+      good: "File not found. Create the file or check the path spelling."
+      bad: "ENOENT"
+
+  - name: context
+    weight: 0.2
+    description: Error message includes relevant context (file, line, values)
+    examples:
+      good: "TypeError at line 42 in auth.py: expected str, got int (value=123)"
+      bad: "type error"