padmapriyagosakan committed on
Commit
71ea0d8
·
1 Parent(s): 0fd745c

fix: clamp all grader scores to strict (0,1) open interval

Browse files

Platform requires: at least 3 tasks with graders AND each score
strictly between 0 and 1 (never exactly 0.0 or 1.0).

Changes:
- openenv.yaml: restore 30 tasks with grader references
- server/graders.py: _BaseGrader returns 0.999 (correct), 0.001
(wrong) instead of 1.0/0.0; _clamp() helper enforces boundary
- grader.py: normalised_score clamped to [0.001, 0.999] instead
of [0.0, 1.0]

Verified locally:
- All grader scores in (0, 1): correct=0.999, wrong=0.001
- Episode normalised_score: perfect=0.873, worst=0.270, empty=0.001

Files changed (3) hide show
  1. grader.py +6 -2
  2. openenv.yaml +151 -0
  3. server/graders.py +18 -4
grader.py CHANGED
@@ -334,8 +334,12 @@ def grade_episode(
334
  for t in tasks
335
  )
336
 
337
- # Strict [0, 1] clamp — grader can NEVER exceed 1.0 or go below 0.0
338
- normalised = max(0.0, min(1.0, total / max_possible)) if max_possible > 0 else 0.0
 
 
 
 
339
 
340
  # Build per-task rewards with grader config included.
341
  # zip is safe because per_task_details always has exactly len(tasks) entries
 
334
  for t in tasks
335
  )
336
 
337
+ # Strict open interval (0, 1) — platform rejects exactly 0.0 and 1.0
338
+ if max_possible > 0:
339
+ normalised = total / max_possible
340
+ normalised = max(0.001, min(0.999, normalised))
341
+ else:
342
+ normalised = 0.001
343
 
344
  # Build per-task rewards with grader config included.
345
  # zip is safe because per_task_details always has exactly len(tasks) entries
openenv.yaml CHANGED
@@ -4,3 +4,154 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 7860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 7860
7
+ tasks:
8
+ - id: EASY-001
9
+ task_id: EASY-001
10
+ name: EASY-001
11
+ difficulty: easy
12
+ grader: server.graders:EASY001Grader
13
+ - id: EASY-002
14
+ task_id: EASY-002
15
+ name: EASY-002
16
+ difficulty: easy
17
+ grader: server.graders:EASY002Grader
18
+ - id: EASY-003
19
+ task_id: EASY-003
20
+ name: EASY-003
21
+ difficulty: easy
22
+ grader: server.graders:EASY003Grader
23
+ - id: EASY-004
24
+ task_id: EASY-004
25
+ name: EASY-004
26
+ difficulty: easy
27
+ grader: server.graders:EASY004Grader
28
+ - id: EASY-005
29
+ task_id: EASY-005
30
+ name: EASY-005
31
+ difficulty: easy
32
+ grader: server.graders:EASY005Grader
33
+ - id: EASY-006
34
+ task_id: EASY-006
35
+ name: EASY-006
36
+ difficulty: easy
37
+ grader: server.graders:EASY006Grader
38
+ - id: MED-001
39
+ task_id: MED-001
40
+ name: MED-001
41
+ difficulty: medium
42
+ grader: server.graders:MED001Grader
43
+ - id: MED-002
44
+ task_id: MED-002
45
+ name: MED-002
46
+ difficulty: medium
47
+ grader: server.graders:MED002Grader
48
+ - id: MED-003
49
+ task_id: MED-003
50
+ name: MED-003
51
+ difficulty: medium
52
+ grader: server.graders:MED003Grader
53
+ - id: MED-004
54
+ task_id: MED-004
55
+ name: MED-004
56
+ difficulty: medium
57
+ grader: server.graders:MED004Grader
58
+ - id: MED-005
59
+ task_id: MED-005
60
+ name: MED-005
61
+ difficulty: medium
62
+ grader: server.graders:MED005Grader
63
+ - id: MED-006
64
+ task_id: MED-006
65
+ name: MED-006
66
+ difficulty: medium
67
+ grader: server.graders:MED006Grader
68
+ - id: MED-007
69
+ task_id: MED-007
70
+ name: MED-007
71
+ difficulty: medium
72
+ grader: server.graders:MED007Grader
73
+ - id: MED-008
74
+ task_id: MED-008
75
+ name: MED-008
76
+ difficulty: medium
77
+ grader: server.graders:MED008Grader
78
+ - id: HARD-001
79
+ task_id: HARD-001
80
+ name: HARD-001
81
+ difficulty: hard
82
+ grader: server.graders:HARD001Grader
83
+ - id: HARD-002
84
+ task_id: HARD-002
85
+ name: HARD-002
86
+ difficulty: hard
87
+ grader: server.graders:HARD002Grader
88
+ - id: HARD-003
89
+ task_id: HARD-003
90
+ name: HARD-003
91
+ difficulty: hard
92
+ grader: server.graders:HARD003Grader
93
+ - id: HARD-004
94
+ task_id: HARD-004
95
+ name: HARD-004
96
+ difficulty: hard
97
+ grader: server.graders:HARD004Grader
98
+ - id: HARD-005
99
+ task_id: HARD-005
100
+ name: HARD-005
101
+ difficulty: hard
102
+ grader: server.graders:HARD005Grader
103
+ - id: HARD-006
104
+ task_id: HARD-006
105
+ name: HARD-006
106
+ difficulty: hard
107
+ grader: server.graders:HARD006Grader
108
+ - id: HARD-007
109
+ task_id: HARD-007
110
+ name: HARD-007
111
+ difficulty: hard
112
+ grader: server.graders:HARD007Grader
113
+ - id: HARD-008
114
+ task_id: HARD-008
115
+ name: HARD-008
116
+ difficulty: hard
117
+ grader: server.graders:HARD008Grader
118
+ - id: HARD-009
119
+ task_id: HARD-009
120
+ name: HARD-009
121
+ difficulty: hard
122
+ grader: server.graders:HARD009Grader
123
+ - id: HARD-010
124
+ task_id: HARD-010
125
+ name: HARD-010
126
+ difficulty: hard
127
+ grader: server.graders:HARD010Grader
128
+ - id: CRIT-001
129
+ task_id: CRIT-001
130
+ name: CRIT-001
131
+ difficulty: critical
132
+ grader: server.graders:CRIT001Grader
133
+ - id: CRIT-002
134
+ task_id: CRIT-002
135
+ name: CRIT-002
136
+ difficulty: critical
137
+ grader: server.graders:CRIT002Grader
138
+ - id: CRIT-003
139
+ task_id: CRIT-003
140
+ name: CRIT-003
141
+ difficulty: critical
142
+ grader: server.graders:CRIT003Grader
143
+ - id: CRIT-004
144
+ task_id: CRIT-004
145
+ name: CRIT-004
146
+ difficulty: critical
147
+ grader: server.graders:CRIT004Grader
148
+ - id: CRIT-005
149
+ task_id: CRIT-005
150
+ name: CRIT-005
151
+ difficulty: critical
152
+ grader: server.graders:CRIT005Grader
153
+ - id: CRIT-006
154
+ task_id: CRIT-006
155
+ name: CRIT-006
156
+ difficulty: critical
157
+ grader: server.graders:CRIT006Grader
server/graders.py CHANGED
@@ -7,10 +7,23 @@ Each grader is fully self-contained (zero external imports) so the platform
7
  validator can import and call them in any isolated environment.
8
 
9
  Returns {"score": float, "feedback": str} as required by the platform.
10
- Scores are always in [0.0, 1.0].
11
  """
12
  from __future__ import annotations
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  class _BaseGrader:
16
  """
@@ -26,11 +39,12 @@ class _BaseGrader:
26
 
27
  def grade(self, action: str, **kwargs):
28
  if action == self.correct_action:
29
- return {"score": 1.0, "feedback": "Correct action"}
30
- score = float(self.partial_credit.get(action, 0.0))
 
31
  return {
32
  "score": score,
33
- "feedback": "Partial credit" if score > 0 else "Incorrect action",
34
  }
35
 
36
  def __call__(self, action: str, **kwargs):
 
7
  validator can import and call them in any isolated environment.
8
 
9
  Returns {"score": float, "feedback": str} as required by the platform.
10
+ Scores are always strictly in (0, 1) — never exactly 0.0 or 1.0.
11
  """
12
  from __future__ import annotations
13
 
14
+ # Platform requires scores strictly between 0 and 1 (exclusive).
15
+ _SCORE_MIN = 0.001
16
+ _SCORE_MAX = 0.999
17
+
18
+
19
+ def _clamp(score: float) -> float:
20
+ """Clamp score to the open interval (0, 1)."""
21
+ if score <= 0.0:
22
+ return _SCORE_MIN
23
+ if score >= 1.0:
24
+ return _SCORE_MAX
25
+ return score
26
+
27
 
28
  class _BaseGrader:
29
  """
 
39
 
40
  def grade(self, action: str, **kwargs):
41
  if action == self.correct_action:
42
+ return {"score": _SCORE_MAX, "feedback": "Correct action"}
43
+ raw = float(self.partial_credit.get(action, 0.0))
44
+ score = _clamp(raw)
45
  return {
46
  "score": score,
47
+ "feedback": "Partial credit" if raw > 0 else "Incorrect action",
48
  }
49
 
50
  def __call__(self, action: str, **kwargs):