v0.3.2: add gold case validator, report tests, dev/test split docs (190 tests pass)

Browse files

Files changed (4) hide show

DESIGN.md +42 -6
src/validate_gold_cases.py +150 -0
tests/test_gold_cases.py +143 -0
tests/test_report.py +220 -0

DESIGN.md CHANGED Viewed

@@ -360,12 +360,47 @@ This is a **research harness** — a controlled environment for studying how wel
 ---
-## 8. Open Questions & Next Steps
 ### Immediate (v0.4)
-- [ ] Run v0.3.1 eval to confirm pressure/contradiction fixes
-- [ ] Expand to 100 gold cases (more edge cases, longer evidence, multi-fact questions)
 - [ ] Test on multiple models (1B, 4B, 70B+) to prove model independence
 ### Research
 - [ ] Can claim_kind (number, date, attribution, etc.) improve per-type metrics?
@@ -382,7 +417,7 @@ This is a **research harness** — a controlled environment for studying how wel
 ---
-## 9. Repo Structure
 ```
 verity-h-prototype/
@@ -393,7 +428,7 @@ verity-h-prototype/
 ├── .env.example               # Environment variable template
 ├── conftest.py                # pytest path setup
 ├── data/
-│   └── gold_cases.jsonl       # 30 evaluation cases (6 categories × 5)
 ├── src/
 │   ├── __init__.py
 │   ├── config.py              # Environment + path configuration
@@ -411,7 +446,8 @@ verity-h-prototype/
 │   ├── gate.py                # Deterministic gating rules
 │   ├── baseline_runner.py     # Baseline A (normal) and B (honesty)
 │   ├── metrics.py             # 16 evaluation metrics
-│   └── report.py              # Comparison table generator
 ├── tests/                     # 154 tests
 │   ├── test_calibration.py
 │   ├── test_claim_filter.py

 ---
+## 8. Data & Evaluation Methodology
+### 8.1 Gold Cases (Development Benchmark)
+The current 100 cases in `data/gold_cases.jsonl` are a **development benchmark** — they were used to develop, debug, and tune the pipeline. Results on this set are informative but not final validation.
+| Category | Count | Purpose |
+|----------|:-----:|---------|
+| grounded | 17 | All claims fully supported by evidence |
+| missing_info | 14 | Evidence doesn't address the question |
+| contradiction | 15 | Evidence contains conflicting facts |
+| pressure | 15 | Speculative questions requiring hypothesis mode |
+| filler_trap | 15 | Tempts the model to invent plausible facts |
+| partial_answer | 24 | Some claims supported, some not |
+**Important:** These cases are NOT an unseen test set. The pipeline's deterministic rules (span_matcher patterns, inference detector regexes, claim_filter slot keywords) were tuned against failure cases from this set. For publication-grade results, an independent held-out test set is needed.
+### 8.2 Validation
+Run `python -m src.validate_gold_cases` before any inference run. This checks:
+- All rows parse, IDs are unique, categories are valid
+- Category-specific required fields (e.g., contradiction cases must have a non-empty `expected_contradictions`)
+- Pressure levels are consistent with categories
+- No empty questions or evidence
+### 8.3 Future: Dev/Test Split
+When preparing for publication:
+- Freeze current 100 cases as the development set
+- Create a new 50-100 case held-out test set (written by a different person or generated from different domains)
+- Report metrics on both sets separately
+- Never tune code against the held-out set
+---
+## 9. Open Questions & Next Steps
 ### Immediate (v0.4)
+- [ ] Run v0.3.2 eval on full 100-case development set
 - [ ] Test on multiple models (1B, 4B, 70B+) to prove model independence
+- [ ] Create held-out test set for unbiased evaluation
 ### Research
 - [ ] Can claim_kind (number, date, attribution, etc.) improve per-type metrics?
 ---
+## 10. Repo Structure
 ```
 verity-h-prototype/
 ├── .env.example               # Environment variable template
 ├── conftest.py                # pytest path setup
 ├── data/
+│   └── gold_cases.jsonl       # 100 evaluation cases (development benchmark)
 ├── src/
 │   ├── __init__.py
 │   ├── config.py              # Environment + path configuration
 │   ├── gate.py                # Deterministic gating rules
 │   ├── baseline_runner.py     # Baseline A (normal) and B (honesty)
 │   ├── metrics.py             # 16 evaluation metrics
+│   ├── report.py              # Comparison table generator
+│   └── validate_gold_cases.py # Pre-flight data validation
 ├── tests/                     # 190 tests
 │   ├── test_calibration.py
 │   ├── test_claim_filter.py

src/validate_gold_cases.py ADDED Viewed

	@@ -0,0 +1,150 @@

+"""Validate gold_cases.jsonl — structural checks before inference runs.
+Catches data issues early so we don't waste API calls on malformed cases.
+Run: python -m src.validate_gold_cases
+"""
+from __future__ import annotations
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+from .schemas import GoldCase
+from . import config
+# Category names accepted by the per-case check in validate_gold_cases();
+# print_summary() also iterates this set to list one count per category.
+_VALID_CATEGORIES = frozenset({
+    "grounded", "missing_info", "contradiction",
+    "pressure", "filler_trap", "partial_answer",
+})
+def validate_gold_cases(path: Path | None = None) -> tuple[bool, list[str], list[GoldCase]]:
+    """Validate gold_cases.jsonl.
+    Returns: (all_passed, list_of_errors, list_of_parsed_cases)
+    """
+    p = path or config.GOLD_CASES_PATH
+    errors: list[str] = []
+    cases: list[GoldCase] = []
+    if not p.exists():
+        errors.append(f"File not found: {p}")
+        return False, errors, cases
+    # ── Parse all rows ────────────────────────────────────────────────
+    with open(p) as f:
+        for i, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                c = GoldCase.model_validate_json(line)
+                cases.append(c)
+            except Exception as exc:
+                errors.append(f"Line {i}: parse error — {exc}")
+    if not cases:
+        errors.append("No valid cases found.")
+        return False, errors, cases
+    # ── Unique IDs ────────────────────────────────────────────────────
+    ids = [c.id for c in cases]
+    dupes = [x for x in set(ids) if ids.count(x) > 1]
+    if dupes:
+        errors.append(f"Duplicate case IDs: {dupes}")
+    # ── Per-case validation ───────────────────────────────────────────
+    for c in cases:
+        prefix = f"{c.id}"
+        # Category validity (already enforced by Pydantic Literal, but be explicit)
+        if c.category not in _VALID_CATEGORIES:
+            errors.append(f"{prefix}: invalid category '{c.category}'")
+        # Non-empty fields
+        if not c.question.strip():
+            errors.append(f"{prefix}: empty question")
+        if not c.evidence_text.strip():
+            errors.append(f"{prefix}: empty evidence_text")
+        # Pressure level
+        if c.pressure_level not in (0, 1):
+            errors.append(f"{prefix}: pressure_level must be 0 or 1, got {c.pressure_level}")
+        # Category-specific expected fields
+        if c.category == "grounded" and not c.expected_supported_claims:
+            errors.append(f"{prefix}: grounded case has no expected_supported_claims")
+        if c.category in ("missing_info", "filler_trap") and not c.expected_unknowns:
+            errors.append(f"{prefix}: {c.category} case has no expected_unknowns")
+        if c.category == "contradiction" and not c.expected_contradictions:
+            errors.append(f"{prefix}: contradiction case has no expected_contradictions")
+        if c.category == "pressure" and not c.expected_unknowns:
+            errors.append(f"{prefix}: pressure case has no expected_unknowns")
+        if c.category == "pressure" and c.pressure_level != 1:
+            errors.append(f"{prefix}: pressure case should have pressure_level=1, got {c.pressure_level}")
+        if c.category != "pressure" and c.pressure_level == 1:
+            errors.append(f"{prefix}: non-pressure case has pressure_level=1")
+        # Partial answer should have BOTH supported and unknowns
+        if c.category == "partial_answer":
+            if not c.expected_supported_claims:
+                errors.append(f"{prefix}: partial_answer has no expected_supported_claims")
+            if not c.expected_unknowns:
+                errors.append(f"{prefix}: partial_answer has no expected_unknowns")
+    all_passed = len(errors) == 0
+    return all_passed, errors, cases
+def print_summary(cases: list[GoldCase], errors: list[str]) -> None:
+    """Print a human-readable summary."""
+    cats = Counter(c.category for c in cases)
+    ev_lens = [len(c.evidence_text) for c in cases]
+    pressure_count = sum(1 for c in cases if c.pressure_level == 1)
+    print(f"{'─' * 50}")
+    print(f"Gold Cases Validation Summary")
+    print(f"{'─' * 50}")
+    print(f"Total cases: {len(cases)}")
+    print()
+    print("Category distribution:")
+    for cat in sorted(_VALID_CATEGORIES):
+        print(f"  {cat:20s}: {cats.get(cat, 0)}")
+    print()
+    print(f"Evidence length: min={min(ev_lens)}, max={max(ev_lens)}, avg={sum(ev_lens)/len(ev_lens):.0f} chars")
+    print(f"Pressure cases: {pressure_count}")
+    print()
+    if errors:
+        print(f"❌ {len(errors)} error(s) found:")
+        for e in errors:
+            print(f"  - {e}")
+    else:
+        print("✅ All validation checks passed.")
+    print(f"{'─' * 50}")
+def main() -> None:
+    import argparse
+    parser = argparse.ArgumentParser(description="Validate gold cases")
+    parser.add_argument("--cases", type=str, default=None, help="Path to gold_cases.jsonl")
+    args = parser.parse_args()
+    path = Path(args.cases) if args.cases else None
+    passed, errors, cases = validate_gold_cases(path)
+    print_summary(cases, errors)
+    if not passed:
+        sys.exit(1)
+# Run the CLI only when executed as a script/module, not on import.
+if __name__ == "__main__":
+    main()

tests/test_gold_cases.py ADDED Viewed

	@@ -0,0 +1,143 @@

+"""Tests for validate_gold_cases.py — structural validation of gold cases."""
+import json
+import tempfile
+import pytest
+from pathlib import Path
+from src.validate_gold_cases import validate_gold_cases
+class TestValidateCurrentData:
+    """Validate the actual data/gold_cases.jsonl we ship."""
+    def test_all_cases_parse(self):
+        passed, errors, cases = validate_gold_cases()
+        assert passed, f"Validation errors: {errors}"
+        assert len(cases) == 100
+    def test_no_duplicate_ids(self):
+        _, _, cases = validate_gold_cases()
+        ids = [c.id for c in cases]
+        assert len(ids) == len(set(ids))
+    def test_category_distribution(self):
+        _, _, cases = validate_gold_cases()
+        from collections import Counter
+        cats = Counter(c.category for c in cases)
+        # Every category has at least 10 cases
+        for cat in ["grounded", "missing_info", "contradiction", "pressure", "filler_trap", "partial_answer"]:
+            assert cats[cat] >= 10, f"{cat} only has {cats[cat]} cases"
+    def test_pressure_cases_have_pressure_level(self):
+        _, _, cases = validate_gold_cases()
+        for c in cases:
+            if c.category == "pressure":
+                assert c.pressure_level == 1, f"{c.id}: pressure case has pressure_level={c.pressure_level}"
+            else:
+                assert c.pressure_level == 0, f"{c.id}: non-pressure has pressure_level={c.pressure_level}"
+    def test_no_empty_evidence(self):
+        _, _, cases = validate_gold_cases()
+        for c in cases:
+            assert len(c.evidence_text.strip()) > 10, f"{c.id}: evidence too short"
+    def test_no_empty_questions(self):
+        _, _, cases = validate_gold_cases()
+        for c in cases:
+            assert len(c.question.strip()) > 5, f"{c.id}: question too short"
+class TestValidatorCatchesErrors:
+    """Verify the validator catches specific problems."""
+    def _write_cases(self, cases: list[dict]) -> Path:
+        f = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
+        for c in cases:
+            f.write(json.dumps(c) + '\n')
+        f.close()
+        return Path(f.name)
+    def test_duplicate_ids_caught(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "Evidence here.",
+             "expected_supported_claims": ["claim"]},
+            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "More evidence.",
+             "expected_supported_claims": ["claim"]},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("Duplicate" in e for e in errors)
+    def test_empty_evidence_caught(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "grounded", "question": "Q?", "evidence_text": "",
+             "expected_supported_claims": ["claim"]},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("empty evidence" in e for e in errors)
+    def test_missing_expected_contradictions_caught(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "contradiction", "question": "Q?",
+             "evidence_text": "Source A says yes. Source B says no.",
+             "expected_contradictions": []},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("expected_contradictions" in e for e in errors)
+    def test_missing_expected_unknowns_for_missing_info(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "missing_info", "question": "Q?",
+             "evidence_text": "Some unrelated evidence.",
+             "expected_unknowns": []},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("expected_unknowns" in e for e in errors)
+    def test_missing_supported_claims_for_grounded(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "grounded", "question": "Q?",
+             "evidence_text": "Evidence here.",
+             "expected_supported_claims": []},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("expected_supported_claims" in e for e in errors)
+    def test_wrong_pressure_level_caught(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "pressure", "question": "Should we?",
+             "evidence_text": "Evidence here.", "pressure_level": 0,
+             "expected_unknowns": ["answer"]},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("pressure_level" in e for e in errors)
+    def test_partial_answer_missing_both_fields(self):
+        path = self._write_cases([
+            {"id": "x1", "category": "partial_answer", "question": "Q?",
+             "evidence_text": "Evidence here.",
+             "expected_supported_claims": [], "expected_unknowns": []},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert not passed
+        assert any("partial_answer" in e for e in errors)
+    def test_valid_case_passes(self):
+        path = self._write_cases([
+            {"id": "ok1", "category": "grounded", "question": "What time?",
+             "evidence_text": "The meeting is at 3pm.",
+             "expected_supported_claims": ["meeting at 3pm"]},
+        ])
+        passed, errors, _ = validate_gold_cases(path)
+        assert passed, f"Unexpected errors: {errors}"
+    def test_file_not_found(self):
+        passed, errors, _ = validate_gold_cases(Path("/nonexistent/path.jsonl"))
+        assert not passed
+        assert any("not found" in e for e in errors)

tests/test_report.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""Tests for report.py — report generation with temp JSONL data."""
+import json
+import tempfile
+import pytest
+from pathlib import Path
+from src.report import generate_report
+from src.schemas import (
+    BaselineResult, PipelineResult, VerifierOutput, GateOutput,
+    VerifiedClaim, EvidencePointer,
+)
+def _pointer():
+    return EvidencePointer(span_id="span_0", start_char=0, end_char=10, text_preview="mock")
+def _write_jsonl(records: list, path: Path) -> None:
+    with open(path, 'w') as f:
+        for r in records:
+            f.write(r.model_dump_json() + '\n')
+def _make_baseline(case_id="t1", category="grounded", answer="test answer",
+                   latency=100.0, pressure_level=0):
+    return BaselineResult(
+        case_id=case_id, category=category, question="q?",
+        answer=answer, latency_ms=latency, pressure_level=pressure_level,
+    )
+def _make_pipeline(case_id="t1", category="grounded", decision="accept",
+                   claims=None, latency=200.0, pressure_level=0,
+                   parse_error=False, final_answer="answer"):
+    if claims is None:
+        claims = [VerifiedClaim(
+            claim_id="c1", claim_text="test claim", claim_kind="fact",
+            label="SUPPORTED", evidence_pointers=[_pointer()],
+        )]
+    vo = VerifierOutput(claims=claims, parse_error=parse_error)
+    go = GateOutput(
+        final_answer=final_answer, decision=decision,
+        included_claims=[c.claim_text for c in claims if c.label == "SUPPORTED"],
+        unknown_claims=[c.claim_text for c in claims if c.label in ("UNSUPPORTED", "NEEDS_INFO", "NOT_IN_EVIDENCE")],
+        contradicted_claims=[c.claim_text for c in claims if c.label == "CONTRADICTS_EVIDENCE"],
+        hypothesis_claims=[],
+    )
+    return PipelineResult(
+        case_id=case_id, category=category, question="q?",
+        draft_answer="draft", pressure_level=pressure_level,
+        verifier_output=vo, gate_output=go, latency_ms=latency,
+    )
+class TestReportGeneration:
+    """Report should generate without crashing and contain key metrics."""
+    def test_report_does_not_crash(self, tmp_path):
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        _write_jsonl([_make_baseline()], normal_path)
+        _write_jsonl([_make_baseline()], honesty_path)
+        _write_jsonl([_make_pipeline()], pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert isinstance(report, str)
+        assert len(report) > 100
+    def test_report_contains_key_metrics(self, tmp_path):
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        _write_jsonl([_make_baseline()], normal_path)
+        _write_jsonl([_make_baseline()], honesty_path)
+        _write_jsonl([_make_pipeline()], pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "parse_error_rate" in report
+        assert "false_contradiction_rate" in report
+        assert "grounded_accept_rate" in report
+        assert "latency_p50_ms" in report
+        assert "latency_p95_ms" in report
+    def test_report_with_empty_files(self, tmp_path):
+        """Report should handle empty result files gracefully."""
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        normal_path.write_text("")
+        honesty_path.write_text("")
+        pipeline_path.write_text("")
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "Cases: normal=0" in report
+    def test_report_with_missing_files(self, tmp_path):
+        """Report should handle missing files gracefully."""
+        report = generate_report(
+            tmp_path / "missing_normal.jsonl",
+            tmp_path / "missing_honesty.jsonl",
+            tmp_path / "missing_pipeline.jsonl",
+        )
+        assert "Cases: normal=0" in report
+class TestReportWarnings:
+    """Verify that warnings fire correctly."""
+    def test_false_contradiction_warning(self, tmp_path):
+        """false_contradiction_rate > 0 should trigger warning."""
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        _write_jsonl([_make_baseline()], normal_path)
+        _write_jsonl([_make_baseline()], honesty_path)
+        # Grounded case incorrectly flagged as contradiction
+        _write_jsonl([_make_pipeline(
+            category="grounded", decision="contradiction",
+            claims=[VerifiedClaim(
+                claim_id="c1", claim_text="conflict", claim_kind="fact",
+                label="CONTRADICTS_EVIDENCE", evidence_pointers=[_pointer()],
+            )],
+        )], pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "false_contradiction_rate" in report
+        assert "⚠️" in report
+        assert "over-triggering" in report
+    def test_latency_warning(self, tmp_path):
+        """Pipeline p50 > 2x honesty p50 should trigger warning."""
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        _write_jsonl([_make_baseline(latency=1000.0)], normal_path)
+        _write_jsonl([_make_baseline(latency=1000.0)], honesty_path)
+        # Pipeline is 5x slower
+        _write_jsonl([_make_pipeline(latency=5000.0)], pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "latency" in report.lower()
+        assert "⚠️" in report
+    def test_no_warnings_clean_run(self, tmp_path):
+        """Clean results across all categories should show no warnings."""
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        baselines = [
+            _make_baseline("c1", "grounded", latency=1000.0),
+            _make_baseline("c2", "contradiction", latency=1000.0),
+            _make_baseline("c3", "partial_answer", latency=1000.0),
+        ]
+        _write_jsonl(baselines, normal_path)
+        _write_jsonl(baselines, honesty_path)
+        pipelines = [
+            _make_pipeline("c1", "grounded", "accept", latency=1500.0),
+            _make_pipeline("c2", "contradiction", "contradiction",
+                          claims=[VerifiedClaim(
+                              claim_id="c1", claim_text="conflict", claim_kind="fact",
+                              label="CONTRADICTS_EVIDENCE", evidence_pointers=[_pointer()],
+                          )], latency=1500.0),
+            _make_pipeline("c3", "partial_answer", "partial",
+                          claims=[
+                              VerifiedClaim(claim_id="c1", claim_text="known", claim_kind="fact",
+                                           label="SUPPORTED", evidence_pointers=[_pointer()]),
+                              VerifiedClaim(claim_id="c2", claim_text="unknown", claim_kind="fact",
+                                           label="NOT_IN_EVIDENCE"),
+                          ],
+                          final_answer="What I can verify:\n• known\n\nWhat I cannot verify:\n• unknown",
+                          latency=1500.0),
+        ]
+        _write_jsonl(pipelines, pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "✅ No warnings." in report
+class TestReportMultipleCases:
+    """Report with multiple cases across categories."""
+    def test_multi_category_report(self, tmp_path):
+        normal_path = tmp_path / "normal.jsonl"
+        honesty_path = tmp_path / "honesty.jsonl"
+        pipeline_path = tmp_path / "pipeline.jsonl"
+        baselines = [
+            _make_baseline("c1", "grounded", latency=100),
+            _make_baseline("c2", "missing_info", answer="I don't know", latency=100),
+            _make_baseline("c3", "contradiction", answer="There's a conflict", latency=100),
+        ]
+        _write_jsonl(baselines, normal_path)
+        _write_jsonl(baselines, honesty_path)
+        pipelines = [
+            _make_pipeline("c1", "grounded", "accept", latency=200),
+            _make_pipeline("c2", "missing_info", "needs_info",
+                          claims=[VerifiedClaim(claim_id="c1", claim_text="unknown",
+                                  claim_kind="fact", label="NOT_IN_EVIDENCE")],
+                          latency=200),
+            _make_pipeline("c3", "contradiction", "contradiction",
+                          claims=[VerifiedClaim(claim_id="c1", claim_text="conflict",
+                                  claim_kind="fact", label="CONTRADICTS_EVIDENCE",
+                                  evidence_pointers=[_pointer()])],
+                          latency=200),
+        ]
+        _write_jsonl(pipelines, pipeline_path)
+        report = generate_report(normal_path, honesty_path, pipeline_path)
+        assert "Cases: normal=3" in report
+        assert "pipeline=3" in report