Spaces:
Running
Running
timchen0618
Show incomplete runs as incorrect; fix missing questions via BrowseComp JSONL fallback
1eb493c | #!/usr/bin/env python3 | |
| """ | |
Patch the traj_summary_orig_ext (orig-analysis) HF dataset to add the
question/correct_answer/correct columns by joining with eval result files.
Questions missing from the eval files fall back to the BrowseComp-Plus JSONL
(BC_JSONL).

Dataset: timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1 (826 rows)
Eval dir: evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0 (832 eval files)
Python env: /scratch/hc3337/envs/raca-py312/bin/python
| """ | |
| from __future__ import annotations | |
| import argparse, json, sys, os | |
| from pathlib import Path | |
# Default the Hugging Face cache location; setdefault leaves an HF_HOME that is
# already present in the environment untouched.
# NOTE(review): presumably placed here so it takes effect before main() lazily
# imports `datasets` -- confirm against the Hugging Face cache documentation.
os.environ.setdefault("HF_HOME", "/scratch/hc3337/.cache/huggingface")
# Hub dataset that main() loads (split "train") and pushes back to.
REPO = "timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1"
# Directory scanned by load_eval_data() for "*_eval.json" files.
EVAL_DIR = Path("/scratch/hc3337/projects/BrowseComp-Plus/evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0")
# JSONL file read by load_browsecomp_questions() as the fallback question source.
BC_JSONL = Path("/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl")
def load_browsecomp_questions(jsonl_path: Path) -> dict:
    """Build a ``query_id -> question`` map from a BrowseComp JSONL file.

    Each non-blank line is parsed as a JSON object. The ID is read from
    ``query_id`` and the text from ``query`` (falling back to ``question``).
    All-digit IDs are stored under ``int`` keys, anything else under the
    stripped string. Records without an ID or without question text are
    ignored; lines that fail to parse are reported on stderr and skipped.

    Args:
        jsonl_path: Path to the JSONL file.

    Returns:
        The mapping; empty when ``jsonl_path`` does not exist.
    """
    qmap: dict = {}
    if not jsonl_path.exists():
        # Say so explicitly: an empty map silently disables the fallback.
        print(
            f"warning: {jsonl_path} not found; no fallback questions loaded",
            file=sys.stderr,
        )
        return qmap
    with jsonl_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                d = json.loads(line)
                qid_raw = str(d.get("query_id", "")).strip()
                q = d.get("query") or d.get("question") or ""
                # Test the raw ID string, not the normalised key: "0" turns
                # into int 0, which is falsy and would drop a valid record.
                if qid_raw and q:
                    qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
                    qmap[qid] = q
            except Exception as e:
                # Best effort: one malformed line must not abort the load.
                print(f"warning: skipping line: {e}", file=sys.stderr)
    print(f"Loaded {len(qmap)} questions from {jsonl_path}", file=sys.stderr)
    return qmap
def load_eval_data(eval_dir: Path) -> dict:
    """Collect per-query eval results from the ``*_eval.json`` files in a directory.

    Each file is parsed as a JSON object. Its ``query_id`` becomes the key
    (``int`` when all digits, otherwise the stripped string) and the value is
    a dict with:

    * ``question``: ``str`` of the file's ``question`` ("" when absent/empty)
    * ``correct_answer``: ``str`` of ``correct_answer`` ("" when absent/empty)
    * ``correct``: ``bool(judge_result["correct"])``, or ``None`` when the
      judge result or its ``correct`` field is missing

    Files that cannot be read or parsed are reported on stderr and skipped.

    Args:
        eval_dir: Directory to scan (non-recursively).

    Returns:
        Mapping of query ID to the dict described above.
    """
    eval_map: dict = {}
    # glob() order is unspecified; sorting makes it deterministic which file
    # wins when two eval files carry the same query_id (the later name wins).
    for p in sorted(eval_dir.glob("*_eval.json")):
        try:
            # Context manager so the handle is closed for every file.
            with p.open("r", encoding="utf-8") as f:
                d = json.load(f)
            qid_raw = str(d.get("query_id", "")).strip()
            qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
            jr = d.get("judge_result") or {}
            correct_val = jr.get("correct")
            # NOTE(review): bool() treats any non-empty string as True; this
            # assumes the judge stores a real boolean -- confirm.
            eval_map[qid] = {
                "question": str(d.get("question") or ""),
                "correct_answer": str(d.get("correct_answer") or ""),
                "correct": bool(correct_val) if correct_val is not None else None,
            }
        except Exception as e:
            # Best effort: one bad file must not abort the whole scan.
            print(f"warning: skipping {p.name}: {e}", file=sys.stderr)
    print(f"Loaded {len(eval_map)} eval entries from {eval_dir}", file=sys.stderr)
    return eval_map
def main():
    """Join eval results onto the Hub dataset and push the patched rows back.

    Loads the eval map from ``EVAL_DIR`` and the fallback questions from
    ``BC_JSONL``, then rebuilds every row of ``REPO`` (split ``train``) with
    three added/overwritten columns:

    * ``question``: from the eval entry, else from the JSONL fallback, else ""
    * ``correct_answer``: from the eval entry, else ""
    * ``correct``: from the eval entry, else ``None``

    Match, missing-question and accuracy statistics go to stderr. The patched
    rows are then pushed to ``REPO``.

    Raises:
        SystemExit: If no row matched any eval entry; nothing is pushed in
            that case, so existing Hub data is not overwritten with blanks.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset, Dataset

    eval_map = load_eval_data(EVAL_DIR)
    bc_questions = load_browsecomp_questions(BC_JSONL)

    print(f"Loading {REPO}...", file=sys.stderr)
    ds = load_dataset(REPO, split="train")
    print(f"Loaded {len(ds)} rows. Columns: {ds.column_names}", file=sys.stderr)

    rows = []
    matched = 0
    for row in ds:
        # Normalise the ID the same way the two loaders do so keys line up.
        qid_raw = str(row["query_id"]).strip()
        qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
        ev = eval_map.get(qid, {})
        if ev:
            matched += 1
        r = dict(row)
        # An empty eval question falls through to the JSONL fallback.
        r["question"] = ev.get("question", "") or bc_questions.get(qid, "")
        r["correct_answer"] = ev.get("correct_answer", "")
        r["correct"] = ev.get("correct", None)
        rows.append(r)

    print(f"Matched {matched}/{len(rows)} rows with eval data.", file=sys.stderr)
    no_question = sum(1 for r in rows if not r.get("question"))
    print(f"Rows missing question: {no_question}", file=sys.stderr)

    if not matched:
        # Zero matches means the eval data was not found (e.g. EVAL_DIR is
        # missing); pushing now would blank correct_answer/correct on every row.
        sys.exit(
            f"error: no rows matched eval data under {EVAL_DIR}; "
            f"not pushing to {REPO}."
        )

    correct_count = sum(1 for r in rows if r.get("correct") is True)
    print(f"Accuracy: {correct_count}/{matched} ({100*correct_count//matched}%)", file=sys.stderr)

    ds_new = Dataset.from_list(rows)
    ds_new.push_to_hub(REPO, split="train",
                       commit_message="Fix missing questions via BrowseComp JSONL fallback")
    print(f"Pushed {len(rows)} rows to {REPO}.")
# Run the patch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()