Spaces:

timchen0618
/

dashboard

Running

dashboard / patch_orig_analysis_with_eval.py

timchen0618

Show incomplete runs as incorrect; fix missing questions via BrowseComp JSONL fallback

1eb493c 5 days ago

4.09 kB

	#!/usr/bin/env python3
	"""
	Patch the traj_summary_orig_ext (orig-analysis) HF dataset to add
	question/correct_answer/correct by joining with eval result files.

	Dataset: timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1 (826 rows)
	Eval dir: evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/
	traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0 (832 eval files)

	Python env: /scratch/hc3337/envs/raca-py312/bin/python
	"""
	from __future__ import annotations
	import argparse, json, sys, os
	from pathlib import Path

	# Default the Hugging Face cache location to the /scratch directory below, but
	# only when HF_HOME is not already set in the environment. This runs at import
	# time, i.e. before main() performs its lazy `datasets` import.
	os.environ.setdefault("HF_HOME", "/scratch/hc3337/.cache/huggingface")

	# Hub dataset repository that main() loads (split "train") and pushes back to.
	REPO = "timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1"
	# Directory that load_eval_data() scans for `*_eval.json` result files.
	EVAL_DIR = Path("/scratch/hc3337/projects/BrowseComp-Plus/evals/bcp/Qwen3-Embedding-8B/full/gpt-oss-120b/traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0")
	# JSONL file main() uses as the fallback source of question text when an eval
	# entry has no question.
	BC_JSONL = Path("/scratch/hc3337/projects/BrowseComp-Plus/data/browsecomp_plus_decrypted_test300.jsonl")


	def load_browsecomp_questions(jsonl_path: Path) -> dict:
	    """Build a ``query_id -> question text`` map from a JSONL file.

	    Each non-blank line is parsed as a JSON object. The id is read from
	    ``query_id`` and the text from ``query`` (falling back to ``question``).
	    Ids made only of digits are stored as ``int`` keys; any other id is kept
	    as its stripped string form.

	    Args:
	        jsonl_path: Path of the JSONL file to read.

	    Returns:
	        The mapping described above. It is empty when the file does not
	        exist. Records with an empty id or empty question text are left out,
	        and lines that fail to parse are skipped with a warning on stderr.
	    """
	    qmap: dict = {}
	    if not jsonl_path.exists():
	        return qmap
	    with jsonl_path.open("r", encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            # Best-effort load: one malformed line must not abort the run,
	            # so any per-line failure is reported and the line is skipped.
	            try:
	                d = json.loads(line)
	                qid_raw = str(d.get("query_id", "")).strip()
	                qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
	                q = d.get("query") or d.get("question") or ""
	                # Test the raw id string, not the converted key: a query_id
	                # of 0 converts to the falsy int 0 and would otherwise be
	                # dropped even though it is a valid id.
	                if qid_raw and q:
	                    qmap[qid] = q
	            except Exception as e:
	                print(f"warning: skipping line: {e}", file=sys.stderr)
	    print(f"Loaded {len(qmap)} questions from {jsonl_path}", file=sys.stderr)
	    return qmap


	def load_eval_data(eval_dir: Path) -> dict:
	    """Collect per-query eval results from the ``*_eval.json`` files in a directory.

	    Every matching file is parsed as a JSON object. Its ``query_id`` becomes
	    the key (``int`` when the stripped id is all digits, otherwise the
	    stripped string) and the value is a dict with:

	    * ``question``: ``str`` of the file's ``question`` (``""`` when absent/falsy)
	    * ``correct_answer``: ``str`` of ``correct_answer`` (``""`` when absent/falsy)
	    * ``correct``: ``bool`` of ``judge_result["correct"]``, or ``None`` when
	      the judge result is missing or carries no ``correct`` value

	    Args:
	        eval_dir: Directory to scan (non-recursively) for ``*_eval.json``.

	    Returns:
	        The mapping described above. Files that cannot be read or parsed are
	        skipped with a warning on stderr.
	    """
	    eval_map: dict = {}
	    # glob() yields files in arbitrary order; sorting makes the result
	    # reproducible when two files share a query_id (the last one wins).
	    for p in sorted(eval_dir.glob("*_eval.json")):
	        # Best-effort load: a bad file is reported and skipped, not fatal.
	        try:
	            # Context manager so the handle is closed promptly instead of
	            # lingering until garbage collection for every eval file.
	            with p.open("r", encoding="utf-8") as f:
	                d = json.load(f)
	            qid_raw = str(d.get("query_id", "")).strip()
	            qid = int(qid_raw) if qid_raw.isdigit() else qid_raw
	            jr = d.get("judge_result") or {}
	            correct_val = jr.get("correct")
	            eval_map[qid] = {
	                "question": str(d.get("question") or ""),
	                "correct_answer": str(d.get("correct_answer") or ""),
	                # Keep None distinct from False so "no verdict" stays visible.
	                "correct": bool(correct_val) if correct_val is not None else None,
	            }
	        except Exception as e:
	            print(f"warning: skipping {p.name}: {e}", file=sys.stderr)
	    print(f"Loaded {len(eval_map)} eval entries from {eval_dir}", file=sys.stderr)
	    return eval_map


	def main():
	    """Patch the hub dataset with eval fields and push it back.

	    Loads the eval results and the fallback question map, reads the
	    ``train`` split of ``REPO``, and for every row sets ``question``,
	    ``correct_answer`` and ``correct`` from the eval entry with the same
	    ``query_id``. When the eval entry has no question text, the question is
	    taken from the fallback map instead. Summary counts are written to
	    stderr, then the patched rows are pushed to ``REPO`` as the ``train``
	    split.
	    """
	    from datasets import load_dataset, Dataset

	    eval_map = load_eval_data(EVAL_DIR)
	    bc_questions = load_browsecomp_questions(BC_JSONL)

	    print(f"Loading {REPO}...", file=sys.stderr)
	    ds = load_dataset(REPO, split="train")
	    print(f"Loaded {len(ds)} rows. Columns: {ds.column_names}", file=sys.stderr)

	    rows = []
	    matched = 0
	    for record in ds:
	        # Normalise the id the same way the loaders do: all-digit ids become
	        # ints, everything else stays a stripped string.
	        key = str(record["query_id"]).strip()
	        if key.isdigit():
	            key = int(key)

	        entry = eval_map.get(key, {})
	        matched += 1 if entry else 0

	        patched = dict(record)
	        patched.update(
	            # Eval question first; fall back to the JSONL map when it is empty.
	            question=entry.get("question", "") or bc_questions.get(key, ""),
	            correct_answer=entry.get("correct_answer", ""),
	            correct=entry.get("correct", None),
	        )
	        rows.append(patched)

	    print(f"Matched {matched}/{len(rows)} rows with eval data.", file=sys.stderr)

	    no_question = len([item for item in rows if not item.get("question")])
	    print(f"Rows missing question: {no_question}", file=sys.stderr)

	    # Only an explicit True counts; None (no verdict) and False do not.
	    correct_count = sum(item.get("correct") is True for item in rows)
	    if matched:
	        print(f"Accuracy: {correct_count}/{matched} ({100*correct_count//matched}%)", file=sys.stderr)

	    ds_new = Dataset.from_list(rows)
	    ds_new.push_to_hub(
	        REPO,
	        split="train",
	        commit_message="Fix missing questions via BrowseComp JSONL fallback",
	    )
	    print(f"Pushed {len(rows)} rows to {REPO}.")


	# Run the patch only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()