ChibuUkachi committed on
Commit
61c152a
·
1 Parent(s): 9e39fcc

Example Every Eval Ever results for lm-eval and lighteval

Browse files
every_eval_ever_results/aime25.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "aime25/inference-optimization/MiniMax-M2.5.w4a16/1777302637.214952",
4
+ "evaluation_timestamp": "5872197",
5
+ "retrieved_timestamp": "1777302637.214952",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "unknown"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "8"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "aime25",
38
+ "source_data": {
39
+ "dataset_name": "aime25",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "yentinglin/aime_2025",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "5877463",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.8416666666666667,
54
+ "details": {
55
+ "seed_scores": "[0.9, 0.8, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8666666666666667, 0.8333333333333334, 0.8333333333333334]",
56
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.01044638617546681,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 8
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ },
82
+ {
83
+ "evaluation_name": "aime25",
84
+ "source_data": {
85
+ "dataset_name": "aime25",
86
+ "source_type": "hf_dataset",
87
+ "hf_repo": "yentinglin/aime_2025",
88
+ "hf_split": "train"
89
+ },
90
+ "evaluation_timestamp": "5877463",
91
+ "metric_config": {
92
+ "evaluation_description": "avg@n:n=1",
93
+ "lower_is_better": false,
94
+ "score_type": "continuous",
95
+ "min_score": 0.0,
96
+ "max_score": 1.0
97
+ },
98
+ "score_details": {
99
+ "score": 0.8416666666666667,
100
+ "details": {
101
+ "seed_scores": "[0.9, 0.8, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8666666666666667, 0.8333333333333334, 0.8333333333333334]",
102
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
103
+ },
104
+ "uncertainty": {
105
+ "standard_error": {
106
+ "value": 0.01044638617546681,
107
+ "method": "across_seeds"
108
+ },
109
+ "num_samples": 8
110
+ }
111
+ },
112
+ "generation_config": {
113
+ "generation_args": {
114
+ "temperature": 1.0,
115
+ "top_p": 0.95,
116
+ "top_k": 40.0,
117
+ "max_tokens": 64000,
118
+ "max_attempts": 1
119
+ },
120
+ "additional_details": {
121
+ "repetition_penalty": "1.0",
122
+ "presence_penalty": "1.5",
123
+ "seed": "1234",
124
+ "min_p": "0.0"
125
+ }
126
+ }
127
+ }
128
+ ]
129
+ }
every_eval_ever_results/gpqa_diamond.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gpqa:diamond/inference-optimization/MiniMax-M2.5.w4a16/1777302658.000854",
4
+ "evaluation_timestamp": "5965114",
5
+ "retrieved_timestamp": "1777302658.000854",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "unknown"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "gpqa:diamond",
38
+ "source_data": {
39
+ "dataset_name": "gpqa:diamond",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "Idavidrein/gpqa",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "5981739",
45
+ "metric_config": {
46
+ "evaluation_description": "gpqa_pass@k:k=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.845117845117845,
54
+ "details": {
55
+ "seed_scores": "[0.8535353535353535, 0.8737373737373737, 0.8080808080808081]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.01941508854321682,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }
every_eval_ever_results/gsm8k_platinum_cot_llama.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gsm8k_platinum_cot_llama/inference-optimization/MiniMax-M2.5.w4a16/1777302614.762325",
4
+ "evaluation_timestamp": "1776885795",
5
+ "retrieved_timestamp": "1777302614.762325",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5.w4a16', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "gsm8k_platinum_cot_llama/strict-match",
29
+ "source_data": {
30
+ "dataset_name": "gsm8k_platinum_cot_llama",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "madrylab/gsm8k-platinum",
33
+ "hf_split": "test"
34
+ },
35
+ "evaluation_timestamp": "1776887868",
36
+ "metric_config": {
37
+ "evaluation_description": "exact_match (filter: strict-match)",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.9636062861869313,
45
+ "details": {
46
+ "seed_scores": "[0.9602977667493796, 0.9669148056244831, 0.9636062861869313]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.001910174587889599,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "do_sample": "true",
67
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "gsm8k_platinum_cot_llama/flexible-extract",
78
+ "source_data": {
79
+ "dataset_name": "gsm8k_platinum_cot_llama",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "madrylab/gsm8k-platinum",
82
+ "hf_split": "test"
83
+ },
84
+ "evaluation_timestamp": "1776887868",
85
+ "metric_config": {
86
+ "evaluation_description": "exact_match (filter: flexible-extract)",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.9751861042183623,
94
+ "details": {
95
+ "seed_scores": "[0.9735318444995864, 0.9735318444995864, 0.978494623655914]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.0016542597187758634,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "do_sample": "true",
116
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ }
125
+ ]
126
+ }
every_eval_ever_results/ifeval.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "ifeval/inference-optimization/MiniMax-M2.5.w4a16/1777302591.528493",
4
+ "evaluation_timestamp": "1776886012",
5
+ "retrieved_timestamp": "1777302591.528493",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5.w4a16', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "ifeval",
29
+ "source_data": {
30
+ "dataset_name": "ifeval",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "google/IFEval",
33
+ "hf_split": "train"
34
+ },
35
+ "evaluation_timestamp": "1776888058",
36
+ "metric_config": {
37
+ "evaluation_description": "prompt_level_strict_acc",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.8558225508317929,
45
+ "details": {
46
+ "seed_scores": "[0.8595194085027726, 0.8576709796672828, 0.8502772643253235]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.0028235216851237458,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "until": "[]",
67
+ "do_sample": "true",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "ifeval",
78
+ "source_data": {
79
+ "dataset_name": "ifeval",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "google/IFEval",
82
+ "hf_split": "train"
83
+ },
84
+ "evaluation_timestamp": "1776888058",
85
+ "metric_config": {
86
+ "evaluation_description": "inst_level_strict_acc",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.904476418864908,
94
+ "details": {
95
+ "seed_scores": "[0.9088729016786571, 0.9040767386091128, 0.9004796163069544]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.0024311600840520617,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "until": "[]",
116
+ "do_sample": "true",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ },
125
+ {
126
+ "evaluation_name": "ifeval",
127
+ "source_data": {
128
+ "dataset_name": "ifeval",
129
+ "source_type": "hf_dataset",
130
+ "hf_repo": "google/IFEval",
131
+ "hf_split": "train"
132
+ },
133
+ "evaluation_timestamp": "1776888058",
134
+ "metric_config": {
135
+ "evaluation_description": "prompt_level_loose_acc",
136
+ "lower_is_better": false,
137
+ "score_type": "continuous",
138
+ "min_score": 0.0,
139
+ "max_score": 1.0
140
+ },
141
+ "score_details": {
142
+ "score": 0.88909426987061,
143
+ "details": {
144
+ "seed_scores": "[0.8909426987060998, 0.88909426987061, 0.8872458410351202]",
145
+ "seed_values": "[1234, 4158, 42]"
146
+ },
147
+ "uncertainty": {
148
+ "standard_error": {
149
+ "value": 0.0010671908857479034,
150
+ "method": "across_seeds"
151
+ },
152
+ "num_samples": 3
153
+ }
154
+ },
155
+ "generation_config": {
156
+ "generation_args": {
157
+ "temperature": 1.0,
158
+ "top_p": 0.95,
159
+ "top_k": 40.0,
160
+ "max_tokens": 64000,
161
+ "max_attempts": 1
162
+ },
163
+ "additional_details": {
164
+ "until": "[]",
165
+ "do_sample": "true",
166
+ "min_p": "0.0",
167
+ "presence_penalty": "1.5",
168
+ "repetition_penalty": "1.0",
169
+ "seed": "1234",
170
+ "num_fewshot": "0"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "evaluation_name": "ifeval",
176
+ "source_data": {
177
+ "dataset_name": "ifeval",
178
+ "source_type": "hf_dataset",
179
+ "hf_repo": "google/IFEval",
180
+ "hf_split": "train"
181
+ },
182
+ "evaluation_timestamp": "1776888058",
183
+ "metric_config": {
184
+ "evaluation_description": "inst_level_loose_acc",
185
+ "lower_is_better": false,
186
+ "score_type": "continuous",
187
+ "min_score": 0.0,
188
+ "max_score": 1.0
189
+ },
190
+ "score_details": {
191
+ "score": 0.9260591526778578,
192
+ "details": {
193
+ "seed_scores": "[0.9292565947242206, 0.9244604316546763, 0.9244604316546763]",
194
+ "seed_values": "[1234, 4158, 42]"
195
+ },
196
+ "uncertainty": {
197
+ "standard_error": {
198
+ "value": 0.001598721023181445,
199
+ "method": "across_seeds"
200
+ },
201
+ "num_samples": 3
202
+ }
203
+ },
204
+ "generation_config": {
205
+ "generation_args": {
206
+ "temperature": 1.0,
207
+ "top_p": 0.95,
208
+ "top_k": 40.0,
209
+ "max_tokens": 64000,
210
+ "max_attempts": 1
211
+ },
212
+ "additional_details": {
213
+ "until": "[]",
214
+ "do_sample": "true",
215
+ "min_p": "0.0",
216
+ "presence_penalty": "1.5",
217
+ "repetition_penalty": "1.0",
218
+ "seed": "1234",
219
+ "num_fewshot": "0"
220
+ }
221
+ }
222
+ }
223
+ ]
224
+ }
every_eval_ever_results/math_500.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "math_500/inference-optimization/MiniMax-M2.5.w4a16/1777302681.44404",
4
+ "evaluation_timestamp": "5922700",
5
+ "retrieved_timestamp": "1777302681.44404",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "unknown"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8000/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "math_500",
38
+ "source_data": {
39
+ "dataset_name": "math_500",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "HuggingFaceH4/MATH-500",
42
+ "hf_split": "test"
43
+ },
44
+ "evaluation_timestamp": "5936819",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.876,
54
+ "details": {
55
+ "seed_scores": "[0.878, 0.87, 0.88]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.0030550504633038962,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }
every_eval_ever_results/mmlu_pro_chat.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "mmlu_pro_chat/inference-optimization/MiniMax-M2.5.w4a16/1777302998.11639",
4
+ "evaluation_timestamp": "1777037460",
5
+ "retrieved_timestamp": "1777302998.11639",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "unknown",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5.w4a16",
18
+ "id": "inference-optimization/MiniMax-M2.5.w4a16",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5.w4a16', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 28, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "mmlu_pro_chat_biology/custom-extract",
29
+ "source_data": {
30
+ "dataset_name": "mmlu_pro_chat",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
33
+ "hf_split": "test"
34
+ },
35
+ "evaluation_timestamp": "1777088443",
36
+ "metric_config": {
37
+ "evaluation_description": "exact_match (filter: custom-extract)",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.9079497907949791,
45
+ "details": {
46
+ "seed_scores": "[0.9093444909344491, 0.9065550906555091, 0.9079497907949791]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.0008052305009618233,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "until": "[]",
67
+ "do_sample": "true",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "mmlu_pro_chat_business/custom-extract",
78
+ "source_data": {
79
+ "dataset_name": "mmlu_pro_chat",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
82
+ "hf_split": "test"
83
+ },
84
+ "evaluation_timestamp": "1777088443",
85
+ "metric_config": {
86
+ "evaluation_description": "exact_match (filter: custom-extract)",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.8719898605830165,
94
+ "details": {
95
+ "seed_scores": "[0.8694550063371356, 0.8719898605830165, 0.8745247148288974]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.0014634987812158046,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "until": "[]",
116
+ "do_sample": "true",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ },
125
+ {
126
+ "evaluation_name": "mmlu_pro_chat_chemistry/custom-extract",
127
+ "source_data": {
128
+ "dataset_name": "mmlu_pro_chat",
129
+ "source_type": "hf_dataset",
130
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
131
+ "hf_split": "test"
132
+ },
133
+ "evaluation_timestamp": "1777088443",
134
+ "metric_config": {
135
+ "evaluation_description": "exact_match (filter: custom-extract)",
136
+ "lower_is_better": false,
137
+ "score_type": "continuous",
138
+ "min_score": 0.0,
139
+ "max_score": 1.0
140
+ },
141
+ "score_details": {
142
+ "score": 0.8695524146054181,
143
+ "details": {
144
+ "seed_scores": "[0.8692579505300353, 0.8719081272084805, 0.8674911660777385]",
145
+ "seed_values": "[1234, 4158, 42]"
146
+ },
147
+ "uncertainty": {
148
+ "standard_error": {
149
+ "value": 0.0012835391470967918,
150
+ "method": "across_seeds"
151
+ },
152
+ "num_samples": 3
153
+ }
154
+ },
155
+ "generation_config": {
156
+ "generation_args": {
157
+ "temperature": 1.0,
158
+ "top_p": 0.95,
159
+ "top_k": 40.0,
160
+ "max_tokens": 64000,
161
+ "max_attempts": 1
162
+ },
163
+ "additional_details": {
164
+ "until": "[]",
165
+ "do_sample": "true",
166
+ "min_p": "0.0",
167
+ "presence_penalty": "1.5",
168
+ "repetition_penalty": "1.0",
169
+ "seed": "1234",
170
+ "num_fewshot": "0"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "evaluation_name": "mmlu_pro_chat_computer_science/custom-extract",
176
+ "source_data": {
177
+ "dataset_name": "mmlu_pro_chat",
178
+ "source_type": "hf_dataset",
179
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
180
+ "hf_split": "test"
181
+ },
182
+ "evaluation_timestamp": "1777088443",
183
+ "metric_config": {
184
+ "evaluation_description": "exact_match (filter: custom-extract)",
185
+ "lower_is_better": false,
186
+ "score_type": "continuous",
187
+ "min_score": 0.0,
188
+ "max_score": 1.0
189
+ },
190
+ "score_details": {
191
+ "score": 0.8601626016260162,
192
+ "details": {
193
+ "seed_scores": "[0.8658536585365854, 0.8609756097560975, 0.8536585365853658]",
194
+ "seed_values": "[1234, 4158, 42]"
195
+ },
196
+ "uncertainty": {
197
+ "standard_error": {
198
+ "value": 0.003543820279301364,
199
+ "method": "across_seeds"
200
+ },
201
+ "num_samples": 3
202
+ }
203
+ },
204
+ "generation_config": {
205
+ "generation_args": {
206
+ "temperature": 1.0,
207
+ "top_p": 0.95,
208
+ "top_k": 40.0,
209
+ "max_tokens": 64000,
210
+ "max_attempts": 1
211
+ },
212
+ "additional_details": {
213
+ "until": "[]",
214
+ "do_sample": "true",
215
+ "min_p": "0.0",
216
+ "presence_penalty": "1.5",
217
+ "repetition_penalty": "1.0",
218
+ "seed": "1234",
219
+ "num_fewshot": "0"
220
+ }
221
+ }
222
+ },
223
+ {
224
+ "evaluation_name": "mmlu_pro_chat_economics/custom-extract",
225
+ "source_data": {
226
+ "dataset_name": "mmlu_pro_chat",
227
+ "source_type": "hf_dataset",
228
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
229
+ "hf_split": "test"
230
+ },
231
+ "evaluation_timestamp": "1777088443",
232
+ "metric_config": {
233
+ "evaluation_description": "exact_match (filter: custom-extract)",
234
+ "lower_is_better": false,
235
+ "score_type": "continuous",
236
+ "min_score": 0.0,
237
+ "max_score": 1.0
238
+ },
239
+ "score_details": {
240
+ "score": 0.8499210110584517,
241
+ "details": {
242
+ "seed_scores": "[0.8542654028436019, 0.8483412322274881, 0.8471563981042654]",
243
+ "seed_values": "[1234, 4158, 42]"
244
+ },
245
+ "uncertainty": {
246
+ "standard_error": {
247
+ "value": 0.0021989590690481933,
248
+ "method": "across_seeds"
249
+ },
250
+ "num_samples": 3
251
+ }
252
+ },
253
+ "generation_config": {
254
+ "generation_args": {
255
+ "temperature": 1.0,
256
+ "top_p": 0.95,
257
+ "top_k": 40.0,
258
+ "max_tokens": 64000,
259
+ "max_attempts": 1
260
+ },
261
+ "additional_details": {
262
+ "until": "[]",
263
+ "do_sample": "true",
264
+ "min_p": "0.0",
265
+ "presence_penalty": "1.5",
266
+ "repetition_penalty": "1.0",
267
+ "seed": "1234",
268
+ "num_fewshot": "0"
269
+ }
270
+ }
271
+ },
272
+ {
273
+ "evaluation_name": "mmlu_pro_chat_engineering/custom-extract",
274
+ "source_data": {
275
+ "dataset_name": "mmlu_pro_chat",
276
+ "source_type": "hf_dataset",
277
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
278
+ "hf_split": "test"
279
+ },
280
+ "evaluation_timestamp": "1777088443",
281
+ "metric_config": {
282
+ "evaluation_description": "exact_match (filter: custom-extract)",
283
+ "lower_is_better": false,
284
+ "score_type": "continuous",
285
+ "min_score": 0.0,
286
+ "max_score": 1.0
287
+ },
288
+ "score_details": {
289
+ "score": 0.7282421740626075,
290
+ "details": {
291
+ "seed_scores": "[0.7347781217750258, 0.7213622291021672, 0.7285861713106295]",
292
+ "seed_values": "[1234, 4158, 42]"
293
+ },
294
+ "uncertainty": {
295
+ "standard_error": {
296
+ "value": 0.003876652105120269,
297
+ "method": "across_seeds"
298
+ },
299
+ "num_samples": 3
300
+ }
301
+ },
302
+ "generation_config": {
303
+ "generation_args": {
304
+ "temperature": 1.0,
305
+ "top_p": 0.95,
306
+ "top_k": 40.0,
307
+ "max_tokens": 64000,
308
+ "max_attempts": 1
309
+ },
310
+ "additional_details": {
311
+ "until": "[]",
312
+ "do_sample": "true",
313
+ "min_p": "0.0",
314
+ "presence_penalty": "1.5",
315
+ "repetition_penalty": "1.0",
316
+ "seed": "1234",
317
+ "num_fewshot": "0"
318
+ }
319
+ }
320
+ },
321
+ {
322
+ "evaluation_name": "mmlu_pro_chat_health/custom-extract",
323
+ "source_data": {
324
+ "dataset_name": "mmlu_pro_chat",
325
+ "source_type": "hf_dataset",
326
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
327
+ "hf_split": "test"
328
+ },
329
+ "evaluation_timestamp": "1777088443",
330
+ "metric_config": {
331
+ "evaluation_description": "exact_match (filter: custom-extract)",
332
+ "lower_is_better": false,
333
+ "score_type": "continuous",
334
+ "min_score": 0.0,
335
+ "max_score": 1.0
336
+ },
337
+ "score_details": {
338
+ "score": 0.7921760391198044,
339
+ "details": {
340
+ "seed_scores": "[0.7946210268948656, 0.7970660146699267, 0.784841075794621]",
341
+ "seed_values": "[1234, 4158, 42]"
342
+ },
343
+ "uncertainty": {
344
+ "standard_error": {
345
+ "value": 0.0037347805174864275,
346
+ "method": "across_seeds"
347
+ },
348
+ "num_samples": 3
349
+ }
350
+ },
351
+ "generation_config": {
352
+ "generation_args": {
353
+ "temperature": 1.0,
354
+ "top_p": 0.95,
355
+ "top_k": 40.0,
356
+ "max_tokens": 64000,
357
+ "max_attempts": 1
358
+ },
359
+ "additional_details": {
360
+ "until": "[]",
361
+ "do_sample": "true",
362
+ "min_p": "0.0",
363
+ "presence_penalty": "1.5",
364
+ "repetition_penalty": "1.0",
365
+ "seed": "1234",
366
+ "num_fewshot": "0"
367
+ }
368
+ }
369
+ },
370
+ {
371
+ "evaluation_name": "mmlu_pro_chat_history/custom-extract",
372
+ "source_data": {
373
+ "dataset_name": "mmlu_pro_chat",
374
+ "source_type": "hf_dataset",
375
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
376
+ "hf_split": "test"
377
+ },
378
+ "evaluation_timestamp": "1777088443",
379
+ "metric_config": {
380
+ "evaluation_description": "exact_match (filter: custom-extract)",
381
+ "lower_is_better": false,
382
+ "score_type": "continuous",
383
+ "min_score": 0.0,
384
+ "max_score": 1.0
385
+ },
386
+ "score_details": {
387
+ "score": 0.6815398075240595,
388
+ "details": {
389
+ "seed_scores": "[0.6850393700787402, 0.6797900262467191, 0.6797900262467191]",
390
+ "seed_values": "[1234, 4158, 42]"
391
+ },
392
+ "uncertainty": {
393
+ "standard_error": {
394
+ "value": 0.0017497812773403416,
395
+ "method": "across_seeds"
396
+ },
397
+ "num_samples": 3
398
+ }
399
+ },
400
+ "generation_config": {
401
+ "generation_args": {
402
+ "temperature": 1.0,
403
+ "top_p": 0.95,
404
+ "top_k": 40.0,
405
+ "max_tokens": 64000,
406
+ "max_attempts": 1
407
+ },
408
+ "additional_details": {
409
+ "until": "[]",
410
+ "do_sample": "true",
411
+ "min_p": "0.0",
412
+ "presence_penalty": "1.5",
413
+ "repetition_penalty": "1.0",
414
+ "seed": "1234",
415
+ "num_fewshot": "0"
416
+ }
417
+ }
418
+ },
419
+ {
420
+ "evaluation_name": "mmlu_pro_chat_law/custom-extract",
421
+ "source_data": {
422
+ "dataset_name": "mmlu_pro_chat",
423
+ "source_type": "hf_dataset",
424
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
425
+ "hf_split": "test"
426
+ },
427
+ "evaluation_timestamp": "1777088443",
428
+ "metric_config": {
429
+ "evaluation_description": "exact_match (filter: custom-extract)",
430
+ "lower_is_better": false,
431
+ "score_type": "continuous",
432
+ "min_score": 0.0,
433
+ "max_score": 1.0
434
+ },
435
+ "score_details": {
436
+ "score": 0.5812897366030881,
437
+ "details": {
438
+ "seed_scores": "[0.5876475930971844, 0.5767484105358764, 0.5794732061762035]",
439
+ "seed_values": "[1234, 4158, 42]"
440
+ },
441
+ "uncertainty": {
442
+ "standard_error": {
443
+ "value": 0.0032747967987865603,
444
+ "method": "across_seeds"
445
+ },
446
+ "num_samples": 3
447
+ }
448
+ },
449
+ "generation_config": {
450
+ "generation_args": {
451
+ "temperature": 1.0,
452
+ "top_p": 0.95,
453
+ "top_k": 40.0,
454
+ "max_tokens": 64000,
455
+ "max_attempts": 1
456
+ },
457
+ "additional_details": {
458
+ "until": "[]",
459
+ "do_sample": "true",
460
+ "min_p": "0.0",
461
+ "presence_penalty": "1.5",
462
+ "repetition_penalty": "1.0",
463
+ "seed": "1234",
464
+ "num_fewshot": "0"
465
+ }
466
+ }
467
+ },
468
+ {
469
+ "evaluation_name": "mmlu_pro_chat_math/custom-extract",
470
+ "source_data": {
471
+ "dataset_name": "mmlu_pro_chat",
472
+ "source_type": "hf_dataset",
473
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
474
+ "hf_split": "test"
475
+ },
476
+ "evaluation_timestamp": "1777088443",
477
+ "metric_config": {
478
+ "evaluation_description": "exact_match (filter: custom-extract)",
479
+ "lower_is_better": false,
480
+ "score_type": "continuous",
481
+ "min_score": 0.0,
482
+ "max_score": 1.0
483
+ },
484
+ "score_details": {
485
+ "score": 0.9346163335800641,
486
+ "details": {
487
+ "seed_scores": "[0.9333826794966691, 0.9370836417468542, 0.9333826794966691]",
488
+ "seed_values": "[1234, 4158, 42]"
489
+ },
490
+ "uncertainty": {
491
+ "standard_error": {
492
+ "value": 0.0012336540833950418,
493
+ "method": "across_seeds"
494
+ },
495
+ "num_samples": 3
496
+ }
497
+ },
498
+ "generation_config": {
499
+ "generation_args": {
500
+ "temperature": 1.0,
501
+ "top_p": 0.95,
502
+ "top_k": 40.0,
503
+ "max_tokens": 64000,
504
+ "max_attempts": 1
505
+ },
506
+ "additional_details": {
507
+ "until": "[]",
508
+ "do_sample": "true",
509
+ "min_p": "0.0",
510
+ "presence_penalty": "1.5",
511
+ "repetition_penalty": "1.0",
512
+ "seed": "1234",
513
+ "num_fewshot": "0"
514
+ }
515
+ }
516
+ },
517
+ {
518
+ "evaluation_name": "mmlu_pro_chat_other/custom-extract",
519
+ "source_data": {
520
+ "dataset_name": "mmlu_pro_chat",
521
+ "source_type": "hf_dataset",
522
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
523
+ "hf_split": "test"
524
+ },
525
+ "evaluation_timestamp": "1777088443",
526
+ "metric_config": {
527
+ "evaluation_description": "exact_match (filter: custom-extract)",
528
+ "lower_is_better": false,
529
+ "score_type": "continuous",
530
+ "min_score": 0.0,
531
+ "max_score": 1.0
532
+ },
533
+ "score_details": {
534
+ "score": 0.7853535353535354,
535
+ "details": {
536
+ "seed_scores": "[0.7954545454545454, 0.775974025974026, 0.7846320346320347]",
537
+ "seed_values": "[1234, 4158, 42]"
538
+ },
539
+ "uncertainty": {
540
+ "standard_error": {
541
+ "value": 0.005635100776267413,
542
+ "method": "across_seeds"
543
+ },
544
+ "num_samples": 3
545
+ }
546
+ },
547
+ "generation_config": {
548
+ "generation_args": {
549
+ "temperature": 1.0,
550
+ "top_p": 0.95,
551
+ "top_k": 40.0,
552
+ "max_tokens": 64000,
553
+ "max_attempts": 1
554
+ },
555
+ "additional_details": {
556
+ "until": "[]",
557
+ "do_sample": "true",
558
+ "min_p": "0.0",
559
+ "presence_penalty": "1.5",
560
+ "repetition_penalty": "1.0",
561
+ "seed": "1234",
562
+ "num_fewshot": "0"
563
+ }
564
+ }
565
+ },
566
+ {
567
+ "evaluation_name": "mmlu_pro_chat_philosophy/custom-extract",
568
+ "source_data": {
569
+ "dataset_name": "mmlu_pro_chat",
570
+ "source_type": "hf_dataset",
571
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
572
+ "hf_split": "test"
573
+ },
574
+ "evaluation_timestamp": "1777088443",
575
+ "metric_config": {
576
+ "evaluation_description": "exact_match (filter: custom-extract)",
577
+ "lower_is_better": false,
578
+ "score_type": "continuous",
579
+ "min_score": 0.0,
580
+ "max_score": 1.0
581
+ },
582
+ "score_details": {
583
+ "score": 0.7247828991315964,
584
+ "details": {
585
+ "seed_scores": "[0.7234468937875751, 0.7274549098196392, 0.7234468937875751]",
586
+ "seed_values": "[1234, 4158, 42]"
587
+ },
588
+ "uncertainty": {
589
+ "standard_error": {
590
+ "value": 0.0013360053440213775,
591
+ "method": "across_seeds"
592
+ },
593
+ "num_samples": 3
594
+ }
595
+ },
596
+ "generation_config": {
597
+ "generation_args": {
598
+ "temperature": 1.0,
599
+ "top_p": 0.95,
600
+ "top_k": 40.0,
601
+ "max_tokens": 64000,
602
+ "max_attempts": 1
603
+ },
604
+ "additional_details": {
605
+ "until": "[]",
606
+ "do_sample": "true",
607
+ "min_p": "0.0",
608
+ "presence_penalty": "1.5",
609
+ "repetition_penalty": "1.0",
610
+ "seed": "1234",
611
+ "num_fewshot": "0"
612
+ }
613
+ }
614
+ },
615
+ {
616
+ "evaluation_name": "mmlu_pro_chat_physics/custom-extract",
617
+ "source_data": {
618
+ "dataset_name": "mmlu_pro_chat",
619
+ "source_type": "hf_dataset",
620
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
621
+ "hf_split": "test"
622
+ },
623
+ "evaluation_timestamp": "1777088443",
624
+ "metric_config": {
625
+ "evaluation_description": "exact_match (filter: custom-extract)",
626
+ "lower_is_better": false,
627
+ "score_type": "continuous",
628
+ "min_score": 0.0,
629
+ "max_score": 1.0
630
+ },
631
+ "score_details": {
632
+ "score": 0.8734924300744162,
633
+ "details": {
634
+ "seed_scores": "[0.8775981524249422, 0.8729792147806005, 0.8698999230177059]",
635
+ "seed_values": "[1234, 4158, 42]"
636
+ },
637
+ "uncertainty": {
638
+ "standard_error": {
639
+ "value": 0.002237053602022402,
640
+ "method": "across_seeds"
641
+ },
642
+ "num_samples": 3
643
+ }
644
+ },
645
+ "generation_config": {
646
+ "generation_args": {
647
+ "temperature": 1.0,
648
+ "top_p": 0.95,
649
+ "top_k": 40.0,
650
+ "max_tokens": 64000,
651
+ "max_attempts": 1
652
+ },
653
+ "additional_details": {
654
+ "until": "[]",
655
+ "do_sample": "true",
656
+ "min_p": "0.0",
657
+ "presence_penalty": "1.5",
658
+ "repetition_penalty": "1.0",
659
+ "seed": "1234",
660
+ "num_fewshot": "0"
661
+ }
662
+ }
663
+ },
664
+ {
665
+ "evaluation_name": "mmlu_pro_chat_psychology/custom-extract",
666
+ "source_data": {
667
+ "dataset_name": "mmlu_pro_chat",
668
+ "source_type": "hf_dataset",
669
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
670
+ "hf_split": "test"
671
+ },
672
+ "evaluation_timestamp": "1777088443",
673
+ "metric_config": {
674
+ "evaluation_description": "exact_match (filter: custom-extract)",
675
+ "lower_is_better": false,
676
+ "score_type": "continuous",
677
+ "min_score": 0.0,
678
+ "max_score": 1.0
679
+ },
680
+ "score_details": {
681
+ "score": 0.8074352548036758,
682
+ "details": {
683
+ "seed_scores": "[0.7944862155388471, 0.8233082706766918, 0.8045112781954887]",
684
+ "seed_values": "[1234, 4158, 42]"
685
+ },
686
+ "uncertainty": {
687
+ "standard_error": {
688
+ "value": 0.008447681042671974,
689
+ "method": "across_seeds"
690
+ },
691
+ "num_samples": 3
692
+ }
693
+ },
694
+ "generation_config": {
695
+ "generation_args": {
696
+ "temperature": 1.0,
697
+ "top_p": 0.95,
698
+ "top_k": 40.0,
699
+ "max_tokens": 64000,
700
+ "max_attempts": 1
701
+ },
702
+ "additional_details": {
703
+ "until": "[]",
704
+ "do_sample": "true",
705
+ "min_p": "0.0",
706
+ "presence_penalty": "1.5",
707
+ "repetition_penalty": "1.0",
708
+ "seed": "1234",
709
+ "num_fewshot": "0"
710
+ }
711
+ }
712
+ },
713
+ {
714
+ "evaluation_name": "mmlu_pro_chat/custom-extract",
715
+ "source_data": {
716
+ "dataset_name": "mmlu_pro_chat",
717
+ "source_type": "other"
718
+ },
719
+ "evaluation_timestamp": "1777088443",
720
+ "metric_config": {
721
+ "evaluation_description": "exact_match (filter: custom-extract)",
722
+ "lower_is_better": false,
723
+ "score_type": "continuous",
724
+ "min_score": 0.0,
725
+ "max_score": 1.0
726
+ },
727
+ "score_details": {
728
+ "score": 0.8124722960992908,
729
+ "details": {
730
+ "seed_scores": "[0.8144115691489362, 0.8125, 0.8105053191489362]",
731
+ "seed_values": "[1234, 4158, 42]"
732
+ },
733
+ "uncertainty": {
734
+ "standard_error": {
735
+ "value": 0.0011277223203151384,
736
+ "method": "across_seeds"
737
+ },
738
+ "num_samples": 3
739
+ }
740
+ }
741
+ }
742
+ ]
743
+ }