Sravanth18
/

verity-h-prototype

Model card Files and versions

xet

Community

Sravanth18 committed 29 days ago

Commit

da63ba8

verified ·

1 Parent(s): f7d0296

Upload tests/test_contradiction_checks.py

Browse files

Files changed (1) hide show

tests/test_contradiction_checks.py +104 -125

tests/test_contradiction_checks.py CHANGED Viewed

@@ -1,11 +1,12 @@
-"""Tests for contradiction_checks.py — frame-based, conservative.
-v0.1.4: no hardcoded slot expansion. Tests verify that frame-based
-comparison only flags when subject+attribute clearly match.
 """
 import pytest
-from src.contradiction_checks import check_contradictions
 from src.schemas import EvidenceSpan
@@ -13,174 +14,152 @@ def _span(sid, text, start=0):
     return EvidenceSpan(span_id=sid, text=text, start_char=start, end_char=start + len(text))
-class TestFalsePositivesPrevented:
-    """These MUST NOT be flagged as contradictions."""
-    def test_arr_vs_exit_value(self):
-        """$2M ARR vs $50M founder exit are different financial slots."""
         spans = [
-            _span("span_0", "The startup has $2M ARR and 30% month-over-month growth."),
-            _span("span_1", "The founder previously exited a company for $50M.", start=80),
         ]
-        results = check_contradictions(spans, "Should we invest in this startup?")
-        assert len(results) == 0
-    def test_construction_year_vs_award_year(self):
-        """1985 construction vs 1987 award are different events."""
         spans = [
-            _span("span_0", "The building was constructed in 1985 and has 12 floors."),
-            _span("span_1", "It won an architecture award in 1987.", start=60),
         ]
-        results = check_contradictions(spans, "How tall is the building in meters?")
-        assert len(results) == 0
-    def test_model_year_vs_mileage(self):
-        """2019 model year vs 45000 miles are different attributes."""
         spans = [
-            _span("span_0", "The car is a 2019 model with 45,000 miles on the odometer."),
-            _span("span_1", "It has been serviced regularly at the dealership.", start=70),
         ]
-        results = check_contradictions(spans, "What is the car's mileage and year?")
-        assert len(results) == 0
-    def test_year_vs_employee_count(self):
-        """Q1 2024 year vs 47 employees are different slots."""
         spans = [
-            _span("span_0", "HR records indicate 47 new employees were hired in Q1 2024."),
-            _span("span_1", "All 47 passed background checks.", start=70),
         ]
-        results = check_contradictions(spans, "How many employees were hired?")
-        assert len(results) == 0
-    def test_schools_vs_hospitals(self):
-        """45 schools vs 3 hospitals are different count attributes."""
         spans = [
-            _span("span_0", "The city has 45 public schools and 3 hospitals."),
-            _span("span_1", "The mayor is Linda Chen.", start=50),
         ]
-        results = check_contradictions(spans, "What is the population?")
-        assert len(results) == 0
-    def test_revenue_vs_growth_pct(self):
-        """$4.2M revenue vs 12% growth are money vs percentage."""
         spans = [
-            _span("span_0", "Q3 revenue was $4.2 million, up 12% from Q2."),
-            _span("span_1", "The CFO confirmed the figures on October 5th.", start=50),
         ]
-        results = check_contradictions(spans, "What was the revenue?")
-        assert len(results) == 0
-    def test_money_not_detected_by_frames(self):
-        """Money/percentage conflicts are NOT detected by frame detector —
-        too many false positives. Rely on verifier LLM or status-pair detection."""
-        spans = [
-            _span("span_0", "The company's annual revenue target is $100 million."),
-            _span("span_1", "Through Q3, actual revenue is $82.4 million, representing 82.4% of the target with one quarter remaining.", start=80),
-        ]
-        results = check_contradictions(spans, "Will the company hit its revenue target this year?")
-        assert len(results) == 0
-    def test_money_different_scales_no_contradiction(self):
-        """$200M company revenue vs $4.8B industry revenue — money never flagged by frames."""
-        spans = [
-            _span("span_0", "The company's revenue grew 25% to $200 million."),
-            _span("span_1", "Total industry revenue is estimated at $4.8 billion.", start=60),
-        ]
-        results = check_contradictions(spans, "What is the company's market share?")
-        assert len(results) == 0
-    def test_percentage_different_contexts_no_contradiction(self):
-        """25% revenue growth vs 15% employee growth — percentage never flagged by frames."""
         spans = [
-            _span("span_0", "The company's revenue grew 25% to $200 million."),
-            _span("span_1", "The company has 15% more employees than last year.", start=60),
         ]
-        results = check_contradictions(spans, "What is the company's market share?")
-        assert len(results) == 0
-    def test_temperature_different_time_periods_no_contradiction(self):
-        """Yesterday's high vs forecast range vs historical average are not contradictory."""
         spans = [
-            _span("span_0", "Yesterday's high temperature in Tokyo was 28°C."),
-            _span("span_1", "The weekly forecast predicts temperatures between 24°C and 30°C.", start=60),
-            _span("span_2", "The average temperature for this month historically is 26.5°C.", start=120),
         ]
-        results = check_contradictions(spans, "What is the current temperature in Tokyo?")
-        assert len(results) == 0
-    def test_different_dates_different_events_no_contradiction(self):
-        """Version 2.0 release date vs Version 2.1 release date are different events."""
         spans = [
-            _span("span_0", "Version 2.0 was released on January 15, 2024."),
-            _span("span_1", "Version 2.1 was a bug fix release on March 3, 2024.", start=60),
         ]
-        results = check_contradictions(spans, "When will the next version be released?")
-        assert len(results) == 0
-class TestTrueContradictions:
-    """These MUST be detected."""
-    def test_conflicting_dates_same_event(self):
-        """Same event described with different words — frame detector may miss
-        if subject words don't overlap. Verifier should catch this."""
-        spans = [
-            _span("span_0", "The project launched on January 10, 2024 per press release."),
-            _span("span_1", "Internal logs show the first deployment completed on January 17, 2024.", start=80),
-        ]
-        results = check_contradictions(spans, "When did the project launch?")
-        # Frame detector is conservative for dates (different words = different events).
-        # The verifier (LLM) should catch this contradiction instead.
-        # We accept either detection OR no detection here — the key is no false positives.
-        if results:
-            assert all(c.label == "CONTRADICTS_EVIDENCE" for c in results)
-    def test_conflicting_dates_strong_subject_overlap(self):
-        """Dates with ≥2 shared subject words should be caught."""
         spans = [
-            _span("span_0", "The building was completed on February 15, 2024 per architect report."),
-            _span("span_1", "The city certificate says the building was completed February 28, 2024.", start=80),
         ]
-        results = check_contradictions(spans, "When was the building completed?")
-        assert len(results) >= 1
-        assert all(c.label == "CONTRADICTS_EVIDENCE" for c in results)
-    def test_conflicting_counts_same_subject(self):
         spans = [
-            _span("span_0", "The sales report states 1,200 units were sold in March."),
-            _span("span_1", "Warehouse shipping log shows 980 units dispatched in March.", start=60),
         ]
-        results = check_contradictions(spans, "How many units were sold?")
-        assert len(results) >= 1
-    def test_conflicting_temperatures(self):
         spans = [
-            _span("span_0", "Sensor A recorded 23.5 degrees C at noon."),
-            _span("span_1", "Sensor B recorded 28.1 degrees C at noon.", start=50),
         ]
-        results = check_contradictions(spans, "What temperature was recorded?")
-        assert len(results) >= 1
-    def test_open_vs_closed(self):
         spans = [
-            _span("span_0", "The bridge opened to traffic on June 1."),
-            _span("span_1", "The bridge failed inspection and remains closed.", start=50),
         ]
-        results = check_contradictions(spans, "Is the bridge open?")
-        assert len(results) >= 1
-    def test_approved_vs_not_approved(self):
         spans = [
-            _span("span_0", "Budget was approved by the CFO on April 2."),
-            _span("span_1", "No budget has been approved yet per CEO email.", start=50),
         ]
-        results = check_contradictions(spans, "Who approved the budget?")
-        assert len(results) >= 1
 class TestEdgeCases:
     def test_single_span(self):
-        assert check_contradictions([_span("span_0", "Test.")], "q?") == []
     def test_empty(self):
-        assert check_contradictions([], "q?") == []

+"""Tests for contradiction_checks.py — v0.4 simplified.
+Only status-pair contradictions are FORCED into gate decision.
+Numeric/date/money conflicts are logged as possible_conflict but
+MUST NOT force gate decision=contradiction.
 """
 import pytest
+from src.contradiction_checks import check_contradictions, ConflictResult
 from src.schemas import EvidenceSpan
     return EvidenceSpan(span_id=sid, text=text, start_char=start, end_char=start + len(text))
+class TestForcedStatusPairContradictions:
+    """These MUST be detected and forced into gate."""
+    def test_open_vs_closed(self):
         spans = [
+            _span("span_0", "The bridge opened to traffic on June 1."),
+            _span("span_1", "The bridge failed inspection and remains closed.", start=50),
         ]
+        result = check_contradictions(spans, "Is the bridge open?")
+        assert len(result.forced) >= 1
+        assert all(c.label == "CONTRADICTS_EVIDENCE" for c in result.forced)
+    def test_approved_vs_rejected(self):
         spans = [
+            _span("span_0", "Budget was approved by the CFO on April 2."),
+            _span("span_1", "Budget was rejected by the board on April 3.", start=50),
         ]
+        result = check_contradictions(spans, "Was the budget approved?")
+        assert len(result.forced) >= 1
+    def test_passed_vs_failed(self):
         spans = [
+            _span("span_0", "All candidates passed the exam."),
+            _span("span_1", "Three candidates failed the practical assessment.", start=50),
         ]
+        result = check_contradictions(spans, "Did everyone pass?")
+        assert len(result.forced) >= 1
+    def test_available_vs_unavailable(self):
         spans = [
+            _span("span_0", "The product is available for purchase."),
+            _span("span_1", "The product is unavailable due to supply issues.", start=50),
         ]
+        result = check_contradictions(spans, "Is the product available?")
+        assert len(result.forced) >= 1
+    def test_launched_vs_not_launched(self):
         spans = [
+            _span("span_0", "The satellite was launched on January 10."),
+            _span("span_1", "The satellite was not launched due to weather.", start=50),
         ]
+        result = check_contradictions(spans, "Was the satellite launched?")
+        assert len(result.forced) >= 1
+    def test_enabled_vs_disabled(self):
         spans = [
+            _span("span_0", "The feature is enabled for all users."),
+            _span("span_1", "The feature was disabled by the admin.", start=50),
         ]
+        result = check_contradictions(spans, "Is the feature enabled?")
+        assert len(result.forced) >= 1
+class TestPossibleConflictsLoggedOnly:
+    """Numeric/date/money conflicts are logged, NOT forced into gate."""
+    def test_date_conflict_not_forced(self):
+        """Date conflicts may appear in possible, but NOT forced."""
         spans = [
+            _span("span_0", "The building was completed on February 15, 2024."),
+            _span("span_1", "The city certificate says completion was February 28, 2024.", start=80),
         ]
+        result = check_contradictions(spans, "When was the building completed?")
+        # v0.4: Date conflicts are NOT forced (too many false positives).
+        # They may appear in possible for audit.
+        assert len(result.forced) == 0
+    def test_numeric_conflict_not_forced(self):
+        """Numeric value conflicts are NOT forced."""
         spans = [
+            _span("span_0", "The sales report states 1,200 units were sold."),
+            _span("span_1", "Warehouse log shows 980 units dispatched.", start=60),
         ]
+        result = check_contradictions(spans, "How many units were sold?")
+        assert len(result.forced) == 0
+    def test_money_conflict_not_forced(self):
+        """Money conflicts are NOT forced."""
         spans = [
+            _span("span_0", "Revenue target is $100 million."),
+            _span("span_1", "Actual revenue through Q3 is $82.4 million.", start=60),
         ]
+        result = check_contradictions(spans, "What is the revenue?")
+        assert len(result.forced) == 0
+    def test_temperature_conflict_not_forced(self):
+        """Temperature conflicts are NOT forced."""
         spans = [
+            _span("span_0", "Sensor A recorded 23.5 degrees C."),
+            _span("span_1", "Sensor B recorded 28.1 degrees C.", start=50),
         ]
+        result = check_contradictions(spans, "What is the temperature?")
+        assert len(result.forced) == 0
+    def test_percentage_conflict_not_forced(self):
+        """Percentage conflicts are NOT forced."""
         spans = [
+            _span("span_0", "Revenue grew 25% year over year."),
+            _span("span_1", "Employee count grew 15% year over year.", start=60),
         ]
+        result = check_contradictions(spans, "What is the growth rate?")
+        assert len(result.forced) == 0
+class TestNoFalsePositives:
+    """Unrelated spans must not trigger either forced or possible."""
+    def test_unrelated_spans(self):
         spans = [
+            _span("span_0", "The startup has $2M ARR."),
+            _span("span_1", "The founder exited a company for $50M.", start=80),
         ]
+        result = check_contradictions(spans, "Should we invest?")
+        assert len(result.forced) == 0
+    def test_different_events_different_dates(self):
         spans = [
+            _span("span_0", "Version 2.0 was released on January 15, 2024."),
+            _span("span_1", "Version 2.1 was released on March 3, 2024.", start=60),
         ]
+        result = check_contradictions(spans, "When was the last release?")
+        assert len(result.forced) == 0
+    def test_different_attributes_same_entity(self):
         spans = [
+            _span("span_0", "The car is a 2019 model with 45,000 miles."),
+            _span("span_1", "The car was serviced in 2023.", start=70),
         ]
+        result = check_contradictions(spans, "What is the car's mileage?")
+        assert len(result.forced) == 0
 class TestEdgeCases:
     def test_single_span(self):
+        result = check_contradictions([_span("span_0", "Test.")], "q?")
+        assert result.forced == []
+        assert result.possible == []
     def test_empty(self):
+        result = check_contradictions([], "q?")
+        assert result.forced == []
+        assert result.possible == []
+    def test_conflict_result_is_named_tuple(self):
+        """API returns ConflictResult with .forced and .possible."""
+        result = check_contradictions([], "q?")
+        assert isinstance(result, ConflictResult)
+        assert hasattr(result, "forced")
+        assert hasattr(result, "possible")