Spaces:
Running
Running
File size: 6,465 Bytes
fa6b40b bd51d10 fa6b40b 5b00900 fa6b40b cffb305 5b00900 cffb305 5b00900 cffb305 5b00900 cffb305 fa6b40b bd51d10 5b00900 bd51d10 fa6b40b bd51d10 fa6b40b bd51d10 58fe58d d14bce3 bd51d10 d0a0739 58fe58d bd51d10 d14bce3 bd51d10 d14bce3 d0a0739 bd51d10 fa6b40b bd51d10 fa6b40b bd51d10 fa6b40b bd51d10 fa6b40b bd51d10 45ce7ef fa6b40b 94039e3 bd51d10 45ce7ef fa6b40b bd51d10 8026e0e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | from flask import Blueprint, jsonify, request
from datasets import load_dataset
import json
# Blueprint grouping the selected-tools endpoints under /api/selected-tools.
bp = Blueprint("selected_tools", __name__, url_prefix="/api/selected-tools")
# Registry of dataset variants this API can serve.  Each key is a value
# accepted by the ``variant`` query parameter; each entry holds the dataset
# repo id passed to ``load_dataset`` plus a display label and description.
VARIANTS: dict[str, dict] = {
    "traj_summary_orig_ext": {
        "repo": "timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1",
        "label": "Summary of Trajectory",
        "description": "Selected tools (orig messages) · traj_summary_orig_ext conditioned",
    },
    # test300 variants — excerpt extracted from <trajectory_summary> in run files
    # NOTE(review): the two gpt-oss-120b keys below carry "Gemini 2.5 Pro"
    # labels — confirm that this is intended.
    "test300-gpt-oss-120b-less-chars": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
        "label": "Gemini 2.5 Pro 0",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0_less_chars",
    },
    "test300-gpt-oss-120b": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
        "label": "Gemini 2.5 Pro 1",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0",
    },
    "test300-gemini-2p5-pro": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
        "label": "Gemini 2.5 Pro 2",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_gemini-2.5-pro_1_seed0",
    },
    "test300-gemini-3p1-pro": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
        "label": "Gemini 3.1 Pro Preview",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_gemini_3.1-pro-preview_seed0",
    },
    # The random-seed variants all follow one naming pattern, so they are
    # generated rather than spelled out.  The seed list has no entry for 2.
    **{
        f"test300-random-seed{seed}": {
            "repo": f"timchen0618/browsecomp-plus-sel-tools-test300-random-seed{seed}-v1",
            "label": f"Random Seed {seed}",
            "description": f"test300 · traj_summary_orig_ext_selected_tools_random_seed{seed}_gpt-oss-120b_seed0",
        }
        for seed in (0, 1, 3, 4, 5, 6, 7)
    },
}

# Variant used when a request does not name one.
DEFAULT_VARIANT = "traj_summary_orig_ext"

# In-memory cache: variant key -> list of row dicts built by ``_load``.
_cache: dict[str, list] = {}
def _parse_tool_counts(raw) -> dict:
    """Return one row's per-tool call counts as a dict (best effort).

    Accepts a value that is already a dict as well as a JSON-encoded string.
    Anything missing, undecodable, or not a JSON object yields ``{}``.
    """
    if isinstance(raw, dict):
        # Already decoded: json.loads() would raise TypeError and lose the data.
        return raw
    try:
        parsed = json.loads(raw or "{}")
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        return {}
    # Valid JSON that is not an object (list, number, ...) has no .values().
    return parsed if isinstance(parsed, dict) else {}


def _parse_selected_indices(raw):
    """Decode ``selected_indices`` when it is a JSON string.

    Non-string values are returned unchanged; an undecodable string yields
    ``[]``.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except ValueError:
        return []


def _load(variant: str) -> list:
    """Load, normalise and cache the rows of one dataset variant.

    Args:
        variant: A key of ``VARIANTS``.

    Returns:
        A list with one plain dict per dataset row.  The same list object is
        returned on later calls for the same variant (served from ``_cache``).

    Raises:
        ValueError: If ``variant`` is not cached and is not a key of
            ``VARIANTS``.
    """
    if variant in _cache:
        return _cache[variant]
    if variant not in VARIANTS:
        raise ValueError(f"Unknown variant: {variant!r}")

    ds = load_dataset(VARIANTS[variant]["repo"], split="train")
    rows = []
    for row in ds:
        tool_counts = _parse_tool_counts(row.get("tool_call_counts"))
        # Skip non-numeric counts (e.g. None) so one odd value cannot make
        # the whole variant fail to load.
        total_tool_calls = sum(
            count for count in tool_counts.values()
            if isinstance(count, (int, float))
        )
        new_traj = row.get("new_trajectory") or ""
        # A trajectory counts as completed once it contains the final-answer marker.
        new_status = "completed" if "[Final Answer]" in new_traj else "incomplete"
        rows.append({
            "query_id": str(row["query_id"]),
            "rationale": row.get("rationale") or "",
            "selected_indices": _parse_selected_indices(row["selected_indices"]),
            "k_requested": int(row["k_requested"]),
            "k_effective": int(row["k_effective"]),
            "excerpt": row["excerpt"],
            "new_trajectory": new_traj,
            "direct_answer": bool(row["direct_answer"]),
            "tool_call_counts": tool_counts,
            "total_tool_calls": total_tool_calls,
            "status": row["status"],
            "new_status": new_status,
            "question": row.get("question") or "",
            "correct_answer": row.get("correct_answer") or "",
            "correct": row.get("correct"),  # None if not available
        })
    _cache[variant] = rows
    return rows
@bp.get("/")
def get_data():
    """Return the rows of the requested variant as JSON.

    The variant is read from the ``variant`` query parameter and falls back
    to ``DEFAULT_VARIANT``.

    Returns:
        200 with ``{"rows", "variant", "variants"}`` on success,
        400 with ``{"error"}`` when the variant is not a key of ``VARIANTS``,
        500 with ``{"error"}`` when loading or serialising fails.
    """
    variant = request.args.get("variant", DEFAULT_VARIANT)
    if variant not in VARIANTS:
        # An unknown variant is a client error, not a server failure; the
        # /reload endpoint answers the same condition with 400 as well.
        return jsonify({"error": f"Unknown variant: {variant!r}"}), 400
    try:
        rows = _load(variant)
        return jsonify({"rows": rows, "variant": variant, "variants": VARIANTS})
    except Exception as e:
        # Route boundary: report the failure as JSON instead of an HTML error page.
        return jsonify({"error": str(e)}), 500
@bp.get("/variants")
def get_variants():
    """Return the full variant registry and the default variant key as JSON."""
    payload = {"variants": VARIANTS, "default": DEFAULT_VARIANT}
    return jsonify(payload)
def _hub_dataset_cache_dir(repo: str) -> str:
    """Return the Hugging Face hub cache directory for dataset *repo*.

    The location is resolved from ``HF_HUB_CACHE``, then the legacy
    ``HUGGINGFACE_HUB_CACHE``, then ``$HF_HOME/hub``, and finally
    ``~/.cache/huggingface/hub`` (with ``XDG_CACHE_HOME`` replacing
    ``~/.cache`` when set).
    """
    import os

    hub_cache = os.environ.get("HF_HUB_CACHE") or os.environ.get("HUGGINGFACE_HUB_CACHE")
    if not hub_cache:
        hf_home = os.environ.get("HF_HOME")
        if not hf_home:
            cache_root = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
            hf_home = os.path.join(cache_root, "huggingface")
        hub_cache = os.path.join(hf_home, "hub")
    folder = "datasets--" + repo.replace("/", "--")
    return os.path.join(os.path.expanduser(hub_cache), folder)


@bp.post("/reload")
def reload_data():
    """Drop the cached copy of a variant and load it again.

    The variant is read from the ``variant`` query parameter and falls back
    to ``DEFAULT_VARIANT``.

    Returns:
        200 with ``{"status": "ok", "count", "variant"}`` on success,
        400 with ``{"error"}`` when the variant is not a key of ``VARIANTS``,
        500 with ``{"error"}`` when purging or reloading fails.
    """
    import shutil

    variant = request.args.get("variant", DEFAULT_VARIANT)
    if variant not in VARIANTS:
        return jsonify({"error": f"Unknown variant: {variant!r}"}), 400

    # Evict the in-memory rows so the next _load() goes back to the dataset.
    _cache.pop(variant, None)
    repo = VARIANTS[variant]["repo"]
    try:
        # Delete cached dataset dir so stale schema metadata doesn't block new columns
        try:
            shutil.rmtree(_hub_dataset_cache_dir(repo))
        except FileNotFoundError:
            pass  # nothing cached on disk yet
        rows = _load(variant)
        return jsonify({"status": "ok", "count": len(rows), "variant": variant})
    except Exception as e:
        # Route boundary: report the failure as JSON instead of an HTML error page.
        return jsonify({"error": str(e)}), 500
|