Sravanth18 committed on
Commit
da63ba8
·
verified ·
1 Parent(s): f7d0296

Upload tests/test_contradiction_checks.py

Browse files
Files changed (1) hide show
  1. tests/test_contradiction_checks.py +104 -125
tests/test_contradiction_checks.py CHANGED
@@ -1,11 +1,12 @@
1
- """Tests for contradiction_checks.py — frame-based, conservative.
2
 
3
- v0.1.4: no hardcoded slot expansion. Tests verify that frame-based
4
- comparison only flags when subject+attribute clearly match.
 
5
  """
6
 
7
  import pytest
8
- from src.contradiction_checks import check_contradictions
9
  from src.schemas import EvidenceSpan
10
 
11
 
@@ -13,174 +14,152 @@ def _span(sid, text, start=0):
13
  return EvidenceSpan(span_id=sid, text=text, start_char=start, end_char=start + len(text))
14
 
15
 
16
- class TestFalsePositivesPrevented:
17
- """These MUST NOT be flagged as contradictions."""
18
 
19
- def test_arr_vs_exit_value(self):
20
- """$2M ARR vs $50M founder exit are different financial slots."""
21
  spans = [
22
- _span("span_0", "The startup has $2M ARR and 30% month-over-month growth."),
23
- _span("span_1", "The founder previously exited a company for $50M.", start=80),
24
  ]
25
- results = check_contradictions(spans, "Should we invest in this startup?")
26
- assert len(results) == 0
 
27
 
28
- def test_construction_year_vs_award_year(self):
29
- """1985 construction vs 1987 award are different events."""
30
  spans = [
31
- _span("span_0", "The building was constructed in 1985 and has 12 floors."),
32
- _span("span_1", "It won an architecture award in 1987.", start=60),
33
  ]
34
- results = check_contradictions(spans, "How tall is the building in meters?")
35
- assert len(results) == 0
36
 
37
- def test_model_year_vs_mileage(self):
38
- """2019 model year vs 45000 miles are different attributes."""
39
  spans = [
40
- _span("span_0", "The car is a 2019 model with 45,000 miles on the odometer."),
41
- _span("span_1", "It has been serviced regularly at the dealership.", start=70),
42
  ]
43
- results = check_contradictions(spans, "What is the car's mileage and year?")
44
- assert len(results) == 0
45
 
46
- def test_year_vs_employee_count(self):
47
- """Q1 2024 year vs 47 employees are different slots."""
48
  spans = [
49
- _span("span_0", "HR records indicate 47 new employees were hired in Q1 2024."),
50
- _span("span_1", "All 47 passed background checks.", start=70),
51
  ]
52
- results = check_contradictions(spans, "How many employees were hired?")
53
- assert len(results) == 0
54
 
55
- def test_schools_vs_hospitals(self):
56
- """45 schools vs 3 hospitals are different count attributes."""
57
  spans = [
58
- _span("span_0", "The city has 45 public schools and 3 hospitals."),
59
- _span("span_1", "The mayor is Linda Chen.", start=50),
60
  ]
61
- results = check_contradictions(spans, "What is the population?")
62
- assert len(results) == 0
63
 
64
- def test_revenue_vs_growth_pct(self):
65
- """$4.2M revenue vs 12% growth are money vs percentage."""
66
  spans = [
67
- _span("span_0", "Q3 revenue was $4.2 million, up 12% from Q2."),
68
- _span("span_1", "The CFO confirmed the figures on October 5th.", start=50),
69
  ]
70
- results = check_contradictions(spans, "What was the revenue?")
71
- assert len(results) == 0
72
 
73
- def test_money_not_detected_by_frames(self):
74
- """Money/percentage conflicts are NOT detected by frame detector —
75
- too many false positives. Rely on verifier LLM or status-pair detection."""
76
- spans = [
77
- _span("span_0", "The company's annual revenue target is $100 million."),
78
- _span("span_1", "Through Q3, actual revenue is $82.4 million, representing 82.4% of the target with one quarter remaining.", start=80),
79
- ]
80
- results = check_contradictions(spans, "Will the company hit its revenue target this year?")
81
- assert len(results) == 0
82
 
83
- def test_money_different_scales_no_contradiction(self):
84
- """$200M company revenue vs $4.8B industry revenue — money never flagged by frames."""
85
- spans = [
86
- _span("span_0", "The company's revenue grew 25% to $200 million."),
87
- _span("span_1", "Total industry revenue is estimated at $4.8 billion.", start=60),
88
- ]
89
- results = check_contradictions(spans, "What is the company's market share?")
90
- assert len(results) == 0
91
 
92
- def test_percentage_different_contexts_no_contradiction(self):
93
- """25% revenue growth vs 15% employee growth percentage never flagged by frames."""
94
  spans = [
95
- _span("span_0", "The company's revenue grew 25% to $200 million."),
96
- _span("span_1", "The company has 15% more employees than last year.", start=60),
97
  ]
98
- results = check_contradictions(spans, "What is the company's market share?")
99
- assert len(results) == 0
 
 
100
 
101
- def test_temperature_different_time_periods_no_contradiction(self):
102
- """Yesterday's high vs forecast range vs historical average are not contradictory."""
103
  spans = [
104
- _span("span_0", "Yesterday's high temperature in Tokyo was 28°C."),
105
- _span("span_1", "The weekly forecast predicts temperatures between 24°C and 30°C.", start=60),
106
- _span("span_2", "The average temperature for this month historically is 26.5°C.", start=120),
107
  ]
108
- results = check_contradictions(spans, "What is the current temperature in Tokyo?")
109
- assert len(results) == 0
110
 
111
- def test_different_dates_different_events_no_contradiction(self):
112
- """Version 2.0 release date vs Version 2.1 release date are different events."""
113
  spans = [
114
- _span("span_0", "Version 2.0 was released on January 15, 2024."),
115
- _span("span_1", "Version 2.1 was a bug fix release on March 3, 2024.", start=60),
116
  ]
117
- results = check_contradictions(spans, "When will the next version be released?")
118
- assert len(results) == 0
119
-
120
 
121
- class TestTrueContradictions:
122
- """These MUST be detected."""
123
-
124
- def test_conflicting_dates_same_event(self):
125
- """Same event described with different words — frame detector may miss
126
- if subject words don't overlap. Verifier should catch this."""
127
- spans = [
128
- _span("span_0", "The project launched on January 10, 2024 per press release."),
129
- _span("span_1", "Internal logs show the first deployment completed on January 17, 2024.", start=80),
130
- ]
131
- results = check_contradictions(spans, "When did the project launch?")
132
- # Frame detector is conservative for dates (different words = different events).
133
- # The verifier (LLM) should catch this contradiction instead.
134
- # We accept either detection OR no detection here — the key is no false positives.
135
- if results:
136
- assert all(c.label == "CONTRADICTS_EVIDENCE" for c in results)
137
-
138
- def test_conflicting_dates_strong_subject_overlap(self):
139
- """Dates with ≥2 shared subject words should be caught."""
140
  spans = [
141
- _span("span_0", "The building was completed on February 15, 2024 per architect report."),
142
- _span("span_1", "The city certificate says the building was completed February 28, 2024.", start=80),
143
  ]
144
- results = check_contradictions(spans, "When was the building completed?")
145
- assert len(results) >= 1
146
- assert all(c.label == "CONTRADICTS_EVIDENCE" for c in results)
147
 
148
- def test_conflicting_counts_same_subject(self):
 
149
  spans = [
150
- _span("span_0", "The sales report states 1,200 units were sold in March."),
151
- _span("span_1", "Warehouse shipping log shows 980 units dispatched in March.", start=60),
152
  ]
153
- results = check_contradictions(spans, "How many units were sold?")
154
- assert len(results) >= 1
 
 
 
 
155
 
156
- def test_conflicting_temperatures(self):
157
  spans = [
158
- _span("span_0", "Sensor A recorded 23.5 degrees C at noon."),
159
- _span("span_1", "Sensor B recorded 28.1 degrees C at noon.", start=50),
160
  ]
161
- results = check_contradictions(spans, "What temperature was recorded?")
162
- assert len(results) >= 1
163
 
164
- def test_open_vs_closed(self):
165
  spans = [
166
- _span("span_0", "The bridge opened to traffic on June 1."),
167
- _span("span_1", "The bridge failed inspection and remains closed.", start=50),
168
  ]
169
- results = check_contradictions(spans, "Is the bridge open?")
170
- assert len(results) >= 1
171
 
172
- def test_approved_vs_not_approved(self):
173
  spans = [
174
- _span("span_0", "Budget was approved by the CFO on April 2."),
175
- _span("span_1", "No budget has been approved yet per CEO email.", start=50),
176
  ]
177
- results = check_contradictions(spans, "Who approved the budget?")
178
- assert len(results) >= 1
179
 
180
 
181
  class TestEdgeCases:
182
  def test_single_span(self):
183
- assert check_contradictions([_span("span_0", "Test.")], "q?") == []
 
 
184
 
185
  def test_empty(self):
186
- assert check_contradictions([], "q?") == []
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for contradiction_checks.py — v0.4 simplified.
2
 
3
+ Only status-pair contradictions are FORCED into gate decision.
4
+ Numeric/date/money conflicts are logged as possible_conflict but
5
+ MUST NOT force gate decision=contradiction.
6
  """
7
 
8
  import pytest
9
+ from src.contradiction_checks import check_contradictions, ConflictResult
10
  from src.schemas import EvidenceSpan
11
 
12
 
 
14
  return EvidenceSpan(span_id=sid, text=text, start_char=start, end_char=start + len(text))
15
 
16
 
17
+ class TestForcedStatusPairContradictions:
18
+ """These MUST be detected and forced into gate."""
19
 
20
+ def test_open_vs_closed(self):
 
21
  spans = [
22
+ _span("span_0", "The bridge opened to traffic on June 1."),
23
+ _span("span_1", "The bridge failed inspection and remains closed.", start=50),
24
  ]
25
+ result = check_contradictions(spans, "Is the bridge open?")
26
+ assert len(result.forced) >= 1
27
+ assert all(c.label == "CONTRADICTS_EVIDENCE" for c in result.forced)
28
 
29
+ def test_approved_vs_rejected(self):
 
30
  spans = [
31
+ _span("span_0", "Budget was approved by the CFO on April 2."),
32
+ _span("span_1", "Budget was rejected by the board on April 3.", start=50),
33
  ]
34
+ result = check_contradictions(spans, "Was the budget approved?")
35
+ assert len(result.forced) >= 1
36
 
37
+ def test_passed_vs_failed(self):
 
38
  spans = [
39
+ _span("span_0", "All candidates passed the exam."),
40
+ _span("span_1", "Three candidates failed the practical assessment.", start=50),
41
  ]
42
+ result = check_contradictions(spans, "Did everyone pass?")
43
+ assert len(result.forced) >= 1
44
 
45
+ def test_available_vs_unavailable(self):
 
46
  spans = [
47
+ _span("span_0", "The product is available for purchase."),
48
+ _span("span_1", "The product is unavailable due to supply issues.", start=50),
49
  ]
50
+ result = check_contradictions(spans, "Is the product available?")
51
+ assert len(result.forced) >= 1
52
 
53
+ def test_launched_vs_not_launched(self):
 
54
  spans = [
55
+ _span("span_0", "The satellite was launched on January 10."),
56
+ _span("span_1", "The satellite was not launched due to weather.", start=50),
57
  ]
58
+ result = check_contradictions(spans, "Was the satellite launched?")
59
+ assert len(result.forced) >= 1
60
 
61
+ def test_enabled_vs_disabled(self):
 
62
  spans = [
63
+ _span("span_0", "The feature is enabled for all users."),
64
+ _span("span_1", "The feature was disabled by the admin.", start=50),
65
  ]
66
+ result = check_contradictions(spans, "Is the feature enabled?")
67
+ assert len(result.forced) >= 1
68
 
 
 
 
 
 
 
 
 
 
69
 
70
+ class TestPossibleConflictsLoggedOnly:
71
+ """Numeric/date/money conflicts are logged, NOT forced into gate."""
 
 
 
 
 
 
72
 
73
+ def test_date_conflict_not_forced(self):
74
+ """Date conflicts may appear in possible, but NOT forced."""
75
  spans = [
76
+ _span("span_0", "The building was completed on February 15, 2024."),
77
+ _span("span_1", "The city certificate says completion was February 28, 2024.", start=80),
78
  ]
79
+ result = check_contradictions(spans, "When was the building completed?")
80
+ # v0.4: Date conflicts are NOT forced (too many false positives).
81
+ # They may appear in possible for audit.
82
+ assert len(result.forced) == 0
83
 
84
+ def test_numeric_conflict_not_forced(self):
85
+ """Numeric value conflicts are NOT forced."""
86
  spans = [
87
+ _span("span_0", "The sales report states 1,200 units were sold."),
88
+ _span("span_1", "Warehouse log shows 980 units dispatched.", start=60),
 
89
  ]
90
+ result = check_contradictions(spans, "How many units were sold?")
91
+ assert len(result.forced) == 0
92
 
93
+ def test_money_conflict_not_forced(self):
94
+ """Money conflicts are NOT forced."""
95
  spans = [
96
+ _span("span_0", "Revenue target is $100 million."),
97
+ _span("span_1", "Actual revenue through Q3 is $82.4 million.", start=60),
98
  ]
99
+ result = check_contradictions(spans, "What is the revenue?")
100
+ assert len(result.forced) == 0
 
101
 
102
+ def test_temperature_conflict_not_forced(self):
103
+ """Temperature conflicts are NOT forced."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  spans = [
105
+ _span("span_0", "Sensor A recorded 23.5 degrees C."),
106
+ _span("span_1", "Sensor B recorded 28.1 degrees C.", start=50),
107
  ]
108
+ result = check_contradictions(spans, "What is the temperature?")
109
+ assert len(result.forced) == 0
 
110
 
111
+ def test_percentage_conflict_not_forced(self):
112
+ """Percentage conflicts are NOT forced."""
113
  spans = [
114
+ _span("span_0", "Revenue grew 25% year over year."),
115
+ _span("span_1", "Employee count grew 15% year over year.", start=60),
116
  ]
117
+ result = check_contradictions(spans, "What is the growth rate?")
118
+ assert len(result.forced) == 0
119
+
120
+
121
+ class TestNoFalsePositives:
122
+ """Unrelated spans must not trigger either forced or possible."""
123
 
124
+ def test_unrelated_spans(self):
125
  spans = [
126
+ _span("span_0", "The startup has $2M ARR."),
127
+ _span("span_1", "The founder exited a company for $50M.", start=80),
128
  ]
129
+ result = check_contradictions(spans, "Should we invest?")
130
+ assert len(result.forced) == 0
131
 
132
+ def test_different_events_different_dates(self):
133
  spans = [
134
+ _span("span_0", "Version 2.0 was released on January 15, 2024."),
135
+ _span("span_1", "Version 2.1 was released on March 3, 2024.", start=60),
136
  ]
137
+ result = check_contradictions(spans, "When was the last release?")
138
+ assert len(result.forced) == 0
139
 
140
+ def test_different_attributes_same_entity(self):
141
  spans = [
142
+ _span("span_0", "The car is a 2019 model with 45,000 miles."),
143
+ _span("span_1", "The car was serviced in 2023.", start=70),
144
  ]
145
+ result = check_contradictions(spans, "What is the car's mileage?")
146
+ assert len(result.forced) == 0
147
 
148
 
149
  class TestEdgeCases:
150
  def test_single_span(self):
151
+ result = check_contradictions([_span("span_0", "Test.")], "q?")
152
+ assert result.forced == []
153
+ assert result.possible == []
154
 
155
  def test_empty(self):
156
+ result = check_contradictions([], "q?")
157
+ assert result.forced == []
158
+ assert result.possible == []
159
+
160
+ def test_conflict_result_is_named_tuple(self):
161
+ """API returns ConflictResult with .forced and .possible."""
162
+ result = check_contradictions([], "q?")
163
+ assert isinstance(result, ConflictResult)
164
+ assert hasattr(result, "forced")
165
+ assert hasattr(result, "possible")