dashboard / patch_orig_analysis_with_eval.py
timchen0618
Show incomplete runs as incorrect; fix missing questions via BrowseComp JSONL fallback
1eb493c
#!/usr/bin/env python3
"""
Patch the traj_summary_orig_ext (orig-analysis) HF dataset to add
question/correct_answer/correct by joining with eval result files.
Dataset: timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1 (826 rows)
Eval dir: evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/
traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0 (832 eval files)
Python env: /scratch/hc3337/envs/raca-py312/bin/python
"""
from __future__ import annotations
import argparse, json, sys, os
from pathlib import Path
# Point the Hugging Face cache at the scratch path below unless HF_HOME is
# already set in the environment. This runs at module load, i.e. before main()
# performs its function-level `datasets` import.
os.environ.setdefault("HF_HOME", "/scratch/hc3337/.cache/huggingface")
# Hub dataset repo that main() loads from and pushes the patched rows back to.
REPO = "timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1"
# Directory scanned by load_eval_data() for per-query `*_eval.json` files.
EVAL_DIR = Path("/scratch/hc3337/projects/BrowseComp-Plus/evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0")
# JSONL file used as a fallback source of question text when an eval file
# provides none.
BC_JSONL = Path("/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl")
def load_browsecomp_questions(jsonl_path: Path) -> dict:
    """Build a ``query_id -> question`` map from a BrowseComp JSONL file.

    Each non-blank line is parsed as one JSON object. The id comes from
    ``query_id`` and the text from ``query`` (falling back to ``question``).
    Ids consisting only of digits are stored as ``int`` keys; every other id
    is stored as its stripped string form.

    Args:
        jsonl_path: Path to the JSONL file.

    Returns:
        The mapping. It is empty when the file does not exist. Lines with no
        id or no question text are skipped silently; lines that cannot be
        parsed are skipped with a warning on stderr.
    """
    qmap: dict = {}
    if not jsonl_path.exists():
        return qmap
    with jsonl_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                d = json.loads(line)
                qid_raw = str(d.get("query_id", "")).strip()
                qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
                q = d.get("query") or d.get("question") or ""
            except (ValueError, AttributeError) as e:
                # ValueError: malformed JSON (JSONDecodeError is a subclass)
                # or a digit-like id that int() rejects.
                # AttributeError: the line is valid JSON but not an object.
                print(f"warning: skipping line: {e}", file=sys.stderr)
                continue
            # Check the raw id string rather than the converted id: the
            # integer 0 is falsy, so testing `qid` would drop query_id 0.
            if qid_raw and q:
                qmap[qid] = q
    print(f"Loaded {len(qmap)} questions from {jsonl_path}", file=sys.stderr)
    return qmap
def load_eval_data(eval_dir: Path) -> dict:
    """Load per-query eval results from every ``*_eval.json`` in a directory.

    Args:
        eval_dir: Directory containing the eval result files.

    Returns:
        A dict keyed by query id (``int`` when the id is all digits,
        otherwise the stripped string). Each value is a dict with:

        * ``question``: ``str``, empty when absent.
        * ``correct_answer``: ``str``, empty when absent.
        * ``correct``: ``True``/``False`` from ``judge_result["correct"]``,
          or ``None`` when the judge result or its verdict is missing.

        Files that cannot be read or parsed are skipped with a warning on
        stderr.
    """
    eval_map: dict = {}
    # Sort the paths so that, if two files carry the same query_id, which one
    # wins does not depend on directory listing order.
    for p in sorted(eval_dir.glob("*_eval.json")):
        try:
            # Context manager closes the handle; one is opened per eval file.
            with p.open("r", encoding="utf-8") as f:
                d = json.load(f)
            qid_raw = str(d.get("query_id", "")).strip()
            qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
            jr = d.get("judge_result") or {}
            eval_map[qid] = {
                "question": str(d.get("question") or ""),
                "correct_answer": str(d.get("correct_answer") or ""),
                "correct": _coerce_correct(jr.get("correct")),
            }
        except (OSError, ValueError, AttributeError) as e:
            # OSError: unreadable file. ValueError: bad JSON / bad encoding /
            # id rejected by int(). AttributeError: unexpected JSON shape.
            print(f"warning: skipping {p.name}: {e}", file=sys.stderr)
    print(f"Loaded {len(eval_map)} eval entries from {eval_dir}", file=sys.stderr)
    return eval_map


def _coerce_correct(value):
    """Convert a judge verdict to ``True``/``False``, keeping ``None`` as-is.

    ``bool()`` on any non-empty string is ``True``, so a textual verdict such
    as ``"no"`` or ``"false"`` would otherwise be recorded as correct.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value.strip().lower() not in {"", "no", "n", "false", "0", "incorrect"}
    return bool(value)
def main():
    """Join the hub dataset with eval results and push the patched rows back.

    For every row of ``REPO`` (split ``train``) this sets three columns:

    * ``question``: from the matching eval entry, falling back to the
      BrowseComp JSONL question for the same query id, else ``""``.
    * ``correct_answer``: from the matching eval entry, else ``""``.
    * ``correct``: from the matching eval entry, else ``None``.

    Progress and summary statistics go to stderr. The script exits with an
    error, without pushing, when no row matches any eval entry.
    """
    # Imported here rather than at module top so that the module-level
    # HF_HOME default is already in the environment.
    from datasets import load_dataset, Dataset

    eval_map = load_eval_data(EVAL_DIR)
    bc_questions = load_browsecomp_questions(BC_JSONL)

    print(f"Loading {REPO}...", file=sys.stderr)
    ds = load_dataset(REPO, split="train")
    print(f"Loaded {len(ds)} rows. Columns: {ds.column_names}", file=sys.stderr)

    rows = []
    matched = 0
    for row in ds:
        # Normalise ids the same way the loaders do so dict lookups line up.
        qid_raw = str(row["query_id"]).strip()
        qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
        ev = eval_map.get(qid, {})
        if ev:
            matched += 1
        r = dict(row)
        r["question"] = ev.get("question", "") or bc_questions.get(qid, "")
        r["correct_answer"] = ev.get("correct_answer", "")
        r["correct"] = ev.get("correct", None)
        rows.append(r)

    print(f"Matched {matched}/{len(rows)} rows with eval data.", file=sys.stderr)
    no_question = sum(1 for r in rows if not r.get("question"))
    print(f"Rows missing question: {no_question}", file=sys.stderr)
    correct_count = sum(1 for r in rows if r.get("correct") is True)
    if matched:
        print(f"Accuracy: {correct_count}/{matched} ({100*correct_count//matched}%)", file=sys.stderr)
    else:
        # Nothing joined (e.g. EVAL_DIR is wrong or empty). Pushing now would
        # overwrite question/correct_answer/correct on the hub with blanks.
        sys.exit(f"error: no rows matched eval data in {EVAL_DIR}; refusing to push to {REPO}.")

    ds_new = Dataset.from_list(rows)
    ds_new.push_to_hub(REPO, split="train",
                       commit_message="Fix missing questions via BrowseComp JSONL fallback")
    print(f"Pushed {len(rows)} rows to {REPO}.")
# Run the patch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()