v0.3.2: add gold case validator, report tests, dev/test split docs (190 tests pass)
Browse files- DESIGN.md +42 -6
- src/validate_gold_cases.py +150 -0
- tests/test_gold_cases.py +143 -0
- tests/test_report.py +220 -0
DESIGN.md
CHANGED
|
@@ -360,12 +360,47 @@ This is a **research harness** — a controlled environment for studying how wel
|
|
| 360 |
|
| 361 |
---
|
| 362 |
|
| 363 |
-
## 8.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
### Immediate (v0.4)
|
| 366 |
-
- [ ] Run v0.3.
|
| 367 |
-
- [ ] Expand to 100 gold cases (more edge cases, longer evidence, multi-fact questions)
|
| 368 |
- [ ] Test on multiple models (1B, 4B, 70B+) to prove model independence
|
|
|
|
| 369 |
|
| 370 |
### Research
|
| 371 |
- [ ] Can claim_kind (number, date, attribution, etc.) improve per-type metrics?
|
|
@@ -382,7 +417,7 @@ This is a **research harness** — a controlled environment for studying how wel
|
|
| 382 |
|
| 383 |
---
|
| 384 |
|
| 385 |
-
##
|
| 386 |
|
| 387 |
```
|
| 388 |
verity-h-prototype/
|
|
@@ -393,7 +428,7 @@ verity-h-prototype/
|
|
| 393 |
├── .env.example # Environment variable template
|
| 394 |
├── conftest.py # pytest path setup
|
| 395 |
├── data/
|
| 396 |
-
│ └── gold_cases.jsonl #
|
| 397 |
├── src/
|
| 398 |
│ ├── __init__.py
|
| 399 |
│ ├── config.py # Environment + path configuration
|
|
@@ -411,7 +446,8 @@ verity-h-prototype/
|
|
| 411 |
│ ├── gate.py # Deterministic gating rules
|
| 412 |
│ ├── baseline_runner.py # Baseline A (normal) and B (honesty)
|
| 413 |
│ ├── metrics.py # 16 evaluation metrics
|
| 414 |
-
│
|
|
|
|
| 415 |
├── tests/ # 154 tests
|
| 416 |
│ ├── test_calibration.py
|
| 417 |
│ ├── test_claim_filter.py
|
|
|
|
| 360 |
|
| 361 |
---
|
| 362 |
|
| 363 |
+
## 8. Data & Evaluation Methodology
|
| 364 |
+
|
| 365 |
+
### 8.1 Gold Cases (Development Benchmark)
|
| 366 |
+
|
| 367 |
+
The current 100 cases in `data/gold_cases.jsonl` are a **development benchmark** — they were used to develop, debug, and tune the pipeline. Results on this set are informative but not final validation.
|
| 368 |
+
|
| 369 |
+
| Category | Count | Purpose |
|
| 370 |
+
|----------|:-----:|---------|
|
| 371 |
+
| grounded | 17 | All claims fully supported by evidence |
|
| 372 |
+
| missing_info | 14 | Evidence doesn't address the question |
|
| 373 |
+
| contradiction | 15 | Evidence contains conflicting facts |
|
| 374 |
+
| pressure | 15 | Speculative questions requiring hypothesis mode |
|
| 375 |
+
| filler_trap | 15 | Tempts the model to invent plausible facts |
|
| 376 |
+
| partial_answer | 24 | Some claims supported, some not |
|
| 377 |
+
|
| 378 |
+
**Important:** These cases are NOT an unseen test set. The pipeline's deterministic rules (span_matcher patterns, inference detector regexes, claim_filter slot keywords) were tuned against failure cases from this set. For publication-grade results, an independent held-out test set is needed.
|
| 379 |
+
|
| 380 |
+
### 8.2 Validation
|
| 381 |
+
|
| 382 |
+
Run `python -m src.validate_gold_cases` before any inference run. This checks:
|
| 383 |
+
- All rows parse, IDs are unique, categories are valid
|
| 384 |
+
- Category-specific required fields (e.g., contradiction cases must have expected_contradictions)
|
| 385 |
+
- Pressure levels are consistent with categories
|
| 386 |
+
- No empty questions or evidence
|
| 387 |
+
|
| 388 |
+
### 8.3 Future: Dev/Test Split
|
| 389 |
+
|
| 390 |
+
When preparing for publication:
|
| 391 |
+
- Freeze current 100 cases as the development set
|
| 392 |
+
- Create a new 50-100 case held-out test set (written by a different person or generated from different domains)
|
| 393 |
+
- Report metrics on both sets separately
|
| 394 |
+
- Never tune code against the held-out set
|
| 395 |
+
|
| 396 |
+
---
|
| 397 |
+
|
| 398 |
+
## 9. Open Questions & Next Steps
|
| 399 |
|
| 400 |
### Immediate (v0.4)
|
| 401 |
+
- [ ] Run v0.3.2 eval on full 100-case development set
|
|
|
|
| 402 |
- [ ] Test on multiple models (1B, 4B, 70B+) to prove model independence
|
| 403 |
+
- [ ] Create held-out test set for unbiased evaluation
|
| 404 |
|
| 405 |
### Research
|
| 406 |
- [ ] Can claim_kind (number, date, attribution, etc.) improve per-type metrics?
|
|
|
|
| 417 |
|
| 418 |
---
|
| 419 |
|
| 420 |
+
## 10. Repo Structure
|
| 421 |
|
| 422 |
```
|
| 423 |
verity-h-prototype/
|
|
|
|
| 428 |
├── .env.example # Environment variable template
|
| 429 |
├── conftest.py # pytest path setup
|
| 430 |
├── data/
|
| 431 |
+
│ └── gold_cases.jsonl # 100 evaluation cases (development benchmark)
|
| 432 |
├── src/
|
| 433 |
│ ├── __init__.py
|
| 434 |
│ ├── config.py # Environment + path configuration
|
|
|
|
| 446 |
│ ├── gate.py # Deterministic gating rules
|
| 447 |
│ ├── baseline_runner.py # Baseline A (normal) and B (honesty)
|
| 448 |
│ ├── metrics.py # 16 evaluation metrics
|
| 449 |
+
│ ├── report.py # Comparison table generator
|
| 450 |
+
│ └── validate_gold_cases.py # Pre-flight data validation
|
| 451 |
├── tests/ # 190 tests
|
| 452 |
│ ├── test_calibration.py
|
| 453 |
│ ├── test_claim_filter.py
|
src/validate_gold_cases.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Validate gold_cases.jsonl — structural checks before inference runs.
|
| 2 |
+
|
| 3 |
+
Catches data issues early so we don't waste API calls on malformed cases.
|
| 4 |
+
|
| 5 |
+
Run: python -m src.validate_gold_cases
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
from collections import Counter
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
from .schemas import GoldCase
|
| 16 |
+
from . import config
|
| 17 |
+
|
| 18 |
+
# Closed set of category labels a gold case is allowed to carry.
_VALID_CATEGORIES = frozenset(
    (
        "grounded",
        "missing_info",
        "contradiction",
        "pressure",
        "filler_trap",
        "partial_answer",
    )
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _case_errors(c: GoldCase) -> list[str]:
    """Return the structural problems found in one parsed gold case.

    Messages are prefixed with the case id and are emitted in a fixed order:
    category, empty fields, pressure level, then category-specific fields.
    """
    prefix = f"{c.id}"
    found: list[str] = []

    # Category validity (already enforced by Pydantic Literal, but be explicit)
    if c.category not in _VALID_CATEGORIES:
        found.append(f"{prefix}: invalid category '{c.category}'")

    # Non-empty fields
    if not c.question.strip():
        found.append(f"{prefix}: empty question")
    if not c.evidence_text.strip():
        found.append(f"{prefix}: empty evidence_text")

    # Pressure level
    if c.pressure_level not in (0, 1):
        found.append(f"{prefix}: pressure_level must be 0 or 1, got {c.pressure_level}")

    # Category-specific expected fields
    if c.category == "grounded" and not c.expected_supported_claims:
        found.append(f"{prefix}: grounded case has no expected_supported_claims")

    if c.category in ("missing_info", "filler_trap") and not c.expected_unknowns:
        found.append(f"{prefix}: {c.category} case has no expected_unknowns")

    if c.category == "contradiction" and not c.expected_contradictions:
        found.append(f"{prefix}: contradiction case has no expected_contradictions")

    if c.category == "pressure" and not c.expected_unknowns:
        found.append(f"{prefix}: pressure case has no expected_unknowns")

    if c.category == "pressure" and c.pressure_level != 1:
        found.append(f"{prefix}: pressure case should have pressure_level=1, got {c.pressure_level}")

    if c.category != "pressure" and c.pressure_level == 1:
        found.append(f"{prefix}: non-pressure case has pressure_level=1")

    # Partial answer should have BOTH supported and unknowns
    if c.category == "partial_answer":
        if not c.expected_supported_claims:
            found.append(f"{prefix}: partial_answer has no expected_supported_claims")
        if not c.expected_unknowns:
            found.append(f"{prefix}: partial_answer has no expected_unknowns")

    return found


def validate_gold_cases(path: Path | None = None) -> tuple[bool, list[str], list[GoldCase]]:
    """Validate a gold-cases JSONL file.

    Args:
        path: File to validate. When omitted, ``config.GOLD_CASES_PATH`` is used.

    Returns:
        ``(all_passed, list_of_errors, list_of_parsed_cases)``. Rows that fail
        to parse are reported in the error list and left out of the parsed
        cases. A missing path (or one that is not a regular file) and a file
        with no parseable rows both return early with ``all_passed=False``.
    """
    p = path or config.GOLD_CASES_PATH
    errors: list[str] = []
    cases: list[GoldCase] = []

    # is_file() rather than exists(): a directory would pass exists() and then
    # make open() raise instead of being reported as a validation error.
    if not p.is_file():
        errors.append(f"File not found: {p}")
        return False, errors, cases

    # ── Parse all rows ────────────────────────────────────────────────
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(p, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                cases.append(GoldCase.model_validate_json(line))
            except Exception as exc:
                # Deliberately broad: any failure on a row is collected as a
                # validation error so the remaining rows are still checked.
                errors.append(f"Line {i}: parse error — {exc}")

    if not cases:
        errors.append("No valid cases found.")
        return False, errors, cases

    # ── Unique IDs ────────────────────────────────────────────────────
    # Counter is a single pass and keeps first-seen order, so the reported
    # duplicate list is deterministic.
    id_counts = Counter(c.id for c in cases)
    dupes = [case_id for case_id, seen in id_counts.items() if seen > 1]
    if dupes:
        errors.append(f"Duplicate case IDs: {dupes}")

    # ── Per-case validation ───────────────────────────────────────────
    for c in cases:
        errors.extend(_case_errors(c))

    return not errors, errors, cases
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def print_summary(cases: list[GoldCase], errors: list[str]) -> None:
    """Print a human-readable validation summary to stdout.

    Args:
        cases: Parsed gold cases. May be empty, e.g. when the input file was
            missing or no row could be parsed.
        errors: Validation error messages; each is listed when non-empty.
    """
    cats = Counter(c.category for c in cases)
    ev_lens = [len(c.evidence_text) for c in cases]
    pressure_count = sum(1 for c in cases if c.pressure_level == 1)
    rule = "─" * 50

    print(rule)
    print("Gold Cases Validation Summary")
    print(rule)
    print(f"Total cases: {len(cases)}")
    print()
    print("Category distribution:")
    for cat in sorted(_VALID_CATEGORIES):
        print(f" {cat:20s}: {cats.get(cat, 0)}")
    print()
    if ev_lens:
        avg_len = sum(ev_lens) / len(ev_lens)
        print(f"Evidence length: min={min(ev_lens)}, max={max(ev_lens)}, avg={avg_len:.0f} chars")
    else:
        # min()/max() raise ValueError on an empty sequence and the average
        # would divide by zero; without this guard the error list below would
        # never be printed for a missing or unparseable file.
        print("Evidence length: n/a (no cases)")
    print(f"Pressure cases: {pressure_count}")
    print()

    if errors:
        print(f"❌ {len(errors)} error(s) found:")
        for e in errors:
            print(f" - {e}")
    else:
        print("✅ All validation checks passed.")
    print(rule)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def main() -> None:
    """Command-line entry point: validate the gold cases and print a summary.

    Exits with status 1 when validation does not pass.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Validate gold cases")
    cli.add_argument("--cases", type=str, default=None, help="Path to gold_cases.jsonl")
    options = cli.parse_args()

    # An omitted (or empty) --cases value is passed on as None.
    cases_path = None
    if options.cases:
        cases_path = Path(options.cases)

    ok, problems, parsed = validate_gold_cases(cases_path)
    print_summary(parsed, problems)

    if ok:
        return
    sys.exit(1)


if __name__ == "__main__":
    main()
|
tests/test_gold_cases.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for validate_gold_cases.py — structural validation of gold cases."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
import pytest
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from src.validate_gold_cases import validate_gold_cases
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestValidateCurrentData:
    """Checks run against the shipped data/gold_cases.jsonl file."""

    def test_all_cases_parse(self):
        ok, problems, parsed = validate_gold_cases()
        assert ok, f"Validation errors: {problems}"
        assert len(parsed) == 100

    def test_no_duplicate_ids(self):
        _, _, parsed = validate_gold_cases()
        seen = [case.id for case in parsed]
        assert len(seen) == len(set(seen))

    def test_category_distribution(self):
        from collections import Counter

        _, _, parsed = validate_gold_cases()
        per_category = Counter(case.category for case in parsed)
        # Every category has at least 10 cases
        expected = (
            "grounded",
            "missing_info",
            "contradiction",
            "pressure",
            "filler_trap",
            "partial_answer",
        )
        for name in expected:
            assert per_category[name] >= 10, f"{name} only has {per_category[name]} cases"

    def test_pressure_cases_have_pressure_level(self):
        _, _, parsed = validate_gold_cases()
        for case in parsed:
            if case.category != "pressure":
                assert case.pressure_level == 0, f"{case.id}: non-pressure has pressure_level={case.pressure_level}"
            else:
                assert case.pressure_level == 1, f"{case.id}: pressure case has pressure_level={case.pressure_level}"

    def test_no_empty_evidence(self):
        _, _, parsed = validate_gold_cases()
        for case in parsed:
            stripped = case.evidence_text.strip()
            assert len(stripped) > 10, f"{case.id}: evidence too short"

    def test_no_empty_questions(self):
        _, _, parsed = validate_gold_cases()
        for case in parsed:
            stripped = case.question.strip()
            assert len(stripped) > 5, f"{case.id}: question too short"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class TestValidatorCatchesErrors:
    """Verify the validator catches specific problems."""

    @pytest.fixture(autouse=True)
    def _use_tmp_dir(self, tmp_path):
        # Write fixtures into pytest's per-test temporary directory rather
        # than NamedTemporaryFile(delete=False), which left one stray file
        # behind for every test that ran.
        self._tmp_dir = tmp_path

    def _write_cases(self, cases: list[dict]) -> Path:
        """Write *cases* as JSON Lines into this test's temp dir and return the path."""
        path = self._tmp_dir / "cases.jsonl"
        with open(path, "w", encoding="utf-8") as f:
            for c in cases:
                f.write(json.dumps(c) + "\n")
        return path

    def test_duplicate_ids_caught(self):
        path = self._write_cases([
            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "Evidence here.",
             "expected_supported_claims": ["claim"]},
            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "More evidence.",
             "expected_supported_claims": ["claim"]},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("Duplicate" in e for e in errors)

    def test_empty_evidence_caught(self):
        path = self._write_cases([
            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "",
             "expected_supported_claims": ["claim"]},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("empty evidence" in e for e in errors)

    def test_missing_expected_contradictions_caught(self):
        path = self._write_cases([
            {"id": "x1", "category": "contradiction", "question": "Q?",
             "evidence_text": "Source A says yes. Source B says no.",
             "expected_contradictions": []},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("expected_contradictions" in e for e in errors)

    def test_missing_expected_unknowns_for_missing_info(self):
        path = self._write_cases([
            {"id": "x1", "category": "missing_info", "question": "Q?",
             "evidence_text": "Some unrelated evidence.",
             "expected_unknowns": []},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("expected_unknowns" in e for e in errors)

    def test_missing_supported_claims_for_grounded(self):
        path = self._write_cases([
            {"id": "x1", "category": "grounded", "question": "Q?",
             "evidence_text": "Evidence here.",
             "expected_supported_claims": []},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("expected_supported_claims" in e for e in errors)

    def test_wrong_pressure_level_caught(self):
        path = self._write_cases([
            {"id": "x1", "category": "pressure", "question": "Should we?",
             "evidence_text": "Evidence here.", "pressure_level": 0,
             "expected_unknowns": ["answer"]},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("pressure_level" in e for e in errors)

    def test_partial_answer_missing_both_fields(self):
        path = self._write_cases([
            {"id": "x1", "category": "partial_answer", "question": "Q?",
             "evidence_text": "Evidence here.",
             "expected_supported_claims": [], "expected_unknowns": []},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert not passed
        assert any("partial_answer" in e for e in errors)

    def test_valid_case_passes(self):
        path = self._write_cases([
            {"id": "ok1", "category": "grounded", "question": "What time?",
             "evidence_text": "The meeting is at 3pm.",
             "expected_supported_claims": ["meeting at 3pm"]},
        ])
        passed, errors, _ = validate_gold_cases(path)
        assert passed, f"Unexpected errors: {errors}"

    def test_file_not_found(self):
        passed, errors, _ = validate_gold_cases(Path("/nonexistent/path.jsonl"))
        assert not passed
        assert any("not found" in e for e in errors)
|
tests/test_report.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for report.py — report generation with temp JSONL data."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
import pytest
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from src.report import generate_report
|
| 9 |
+
from src.schemas import (
|
| 10 |
+
BaselineResult, PipelineResult, VerifierOutput, GateOutput,
|
| 11 |
+
VerifiedClaim, EvidencePointer,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _pointer():
    """Build an EvidencePointer with fixed placeholder field values."""
    fields = {
        "span_id": "span_0",
        "start_char": 0,
        "end_char": 10,
        "text_preview": "mock",
    }
    return EvidencePointer(**fields)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _write_jsonl(records: list, path: Path) -> None:
    """Write *records* to *path* as JSON Lines, one serialized record per line.

    Each record must provide a ``model_dump_json()`` method returning a string.
    An empty list produces an empty file.
    """
    # Explicit UTF-8: fixtures in this module contain non-ASCII text (e.g. "•"),
    # so the bytes written must not depend on the platform's locale encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(r.model_dump_json() + "\n" for r in records)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _make_baseline(case_id="t1", category="grounded", answer="test answer",
                   latency=100.0, pressure_level=0):
    """Build a BaselineResult for tests; the question is always "q?"."""
    fields = {
        "case_id": case_id,
        "category": category,
        "question": "q?",
        "answer": answer,
        "latency_ms": latency,
        "pressure_level": pressure_level,
    }
    return BaselineResult(**fields)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _make_pipeline(case_id="t1", category="grounded", decision="accept",
                   claims=None, latency=200.0, pressure_level=0,
                   parse_error=False, final_answer="answer"):
    """Build a PipelineResult for tests.

    When *claims* is omitted, a single SUPPORTED claim is used. The gate
    output's included / unknown / contradicted lists are filled from the
    claim labels; hypothesis_claims is always empty.
    """
    if claims is None:
        default_claim = VerifiedClaim(
            claim_id="c1",
            claim_text="test claim",
            claim_kind="fact",
            label="SUPPORTED",
            evidence_pointers=[_pointer()],
        )
        claims = [default_claim]

    verifier_out = VerifierOutput(claims=claims, parse_error=parse_error)

    # Bucket claim texts by label for the gate output.
    unknown_labels = ("UNSUPPORTED", "NEEDS_INFO", "NOT_IN_EVIDENCE")
    included, unknown, contradicted = [], [], []
    for claim in claims:
        if claim.label == "SUPPORTED":
            included.append(claim.claim_text)
        elif claim.label in unknown_labels:
            unknown.append(claim.claim_text)
        elif claim.label == "CONTRADICTS_EVIDENCE":
            contradicted.append(claim.claim_text)

    gate_out = GateOutput(
        final_answer=final_answer,
        decision=decision,
        included_claims=included,
        unknown_claims=unknown,
        contradicted_claims=contradicted,
        hypothesis_claims=[],
    )
    return PipelineResult(
        case_id=case_id,
        category=category,
        question="q?",
        draft_answer="draft",
        pressure_level=pressure_level,
        verifier_output=verifier_out,
        gate_output=gate_out,
        latency_ms=latency,
    )
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TestReportGeneration:
    """The report generates without crashing and mentions the key metrics."""

    def test_report_does_not_crash(self, tmp_path):
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )
        for target in (normal_path, honesty_path):
            _write_jsonl([_make_baseline()], target)
        _write_jsonl([_make_pipeline()], pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert isinstance(report, str)
        assert len(report) > 100

    def test_report_contains_key_metrics(self, tmp_path):
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )
        for target in (normal_path, honesty_path):
            _write_jsonl([_make_baseline()], target)
        _write_jsonl([_make_pipeline()], pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        for metric in (
            "parse_error_rate",
            "false_contradiction_rate",
            "grounded_accept_rate",
            "latency_p50_ms",
            "latency_p95_ms",
        ):
            assert metric in report

    def test_report_with_empty_files(self, tmp_path):
        """Empty result files are handled gracefully."""
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )
        for target in (normal_path, honesty_path, pipeline_path):
            target.write_text("")

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert "Cases: normal=0" in report

    def test_report_with_missing_files(self, tmp_path):
        """Result files that do not exist are handled gracefully."""
        absent = [
            tmp_path / name
            for name in (
                "missing_normal.jsonl",
                "missing_honesty.jsonl",
                "missing_pipeline.jsonl",
            )
        ]
        report = generate_report(absent[0], absent[1], absent[2])
        assert "Cases: normal=0" in report
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class TestReportWarnings:
    """Check that report warnings fire, and stay silent, as expected."""

    def test_false_contradiction_warning(self, tmp_path):
        """A grounded case gated as a contradiction should produce a warning."""
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )
        for target in (normal_path, honesty_path):
            _write_jsonl([_make_baseline()], target)

        # Grounded case incorrectly flagged as contradiction
        conflict = VerifiedClaim(
            claim_id="c1",
            claim_text="conflict",
            claim_kind="fact",
            label="CONTRADICTS_EVIDENCE",
            evidence_pointers=[_pointer()],
        )
        flagged = _make_pipeline(
            category="grounded",
            decision="contradiction",
            claims=[conflict],
        )
        _write_jsonl([flagged], pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert "false_contradiction_rate" in report
        assert "⚠️" in report
        assert "over-triggering" in report

    def test_latency_warning(self, tmp_path):
        """A pipeline much slower than the honesty baseline should produce a warning."""
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )
        for target in (normal_path, honesty_path):
            _write_jsonl([_make_baseline(latency=1000.0)], target)
        # Pipeline is 5x slower
        _write_jsonl([_make_pipeline(latency=5000.0)], pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert "latency" in report.lower()
        assert "⚠️" in report

    def test_no_warnings_clean_run(self, tmp_path):
        """Clean results across several categories should show no warnings."""
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )

        baselines = [
            _make_baseline(case_id, category, latency=1000.0)
            for case_id, category in (
                ("c1", "grounded"),
                ("c2", "contradiction"),
                ("c3", "partial_answer"),
            )
        ]
        _write_jsonl(baselines, normal_path)
        _write_jsonl(baselines, honesty_path)

        grounded = _make_pipeline(
            case_id="c1", category="grounded", decision="accept", latency=1500.0,
        )
        contradiction = _make_pipeline(
            case_id="c2",
            category="contradiction",
            decision="contradiction",
            claims=[
                VerifiedClaim(
                    claim_id="c1",
                    claim_text="conflict",
                    claim_kind="fact",
                    label="CONTRADICTS_EVIDENCE",
                    evidence_pointers=[_pointer()],
                ),
            ],
            latency=1500.0,
        )
        partial = _make_pipeline(
            case_id="c3",
            category="partial_answer",
            decision="partial",
            claims=[
                VerifiedClaim(
                    claim_id="c1",
                    claim_text="known",
                    claim_kind="fact",
                    label="SUPPORTED",
                    evidence_pointers=[_pointer()],
                ),
                VerifiedClaim(
                    claim_id="c2",
                    claim_text="unknown",
                    claim_kind="fact",
                    label="NOT_IN_EVIDENCE",
                ),
            ],
            final_answer="What I can verify:\n• known\n\nWhat I cannot verify:\n• unknown",
            latency=1500.0,
        )
        _write_jsonl([grounded, contradiction, partial], pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert "✅ No warnings." in report
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class TestReportMultipleCases:
    """Report built from several cases spread over different categories."""

    def test_multi_category_report(self, tmp_path):
        normal_path, honesty_path, pipeline_path = (
            tmp_path / f"{stem}.jsonl" for stem in ("normal", "honesty", "pipeline")
        )

        grounded_base = _make_baseline(case_id="c1", category="grounded", latency=100)
        missing_base = _make_baseline(
            case_id="c2", category="missing_info", answer="I don't know", latency=100,
        )
        conflict_base = _make_baseline(
            case_id="c3", category="contradiction", answer="There's a conflict", latency=100,
        )
        baselines = [grounded_base, missing_base, conflict_base]
        _write_jsonl(baselines, normal_path)
        _write_jsonl(baselines, honesty_path)

        unknown_claim = VerifiedClaim(
            claim_id="c1",
            claim_text="unknown",
            claim_kind="fact",
            label="NOT_IN_EVIDENCE",
        )
        conflict_claim = VerifiedClaim(
            claim_id="c1",
            claim_text="conflict",
            claim_kind="fact",
            label="CONTRADICTS_EVIDENCE",
            evidence_pointers=[_pointer()],
        )
        pipelines = [
            _make_pipeline(case_id="c1", category="grounded", decision="accept", latency=200),
            _make_pipeline(
                case_id="c2",
                category="missing_info",
                decision="needs_info",
                claims=[unknown_claim],
                latency=200,
            ),
            _make_pipeline(
                case_id="c3",
                category="contradiction",
                decision="contradiction",
                claims=[conflict_claim],
                latency=200,
            ),
        ]
        _write_jsonl(pipelines, pipeline_path)

        report = generate_report(normal_path, honesty_path, pipeline_path)
        assert "Cases: normal=3" in report
        assert "pipeline=3" in report
|