File size: 6,465 Bytes
fa6b40b
bd51d10
 
 
 
 
fa6b40b
 
 
5b00900
fa6b40b
 
cffb305
 
 
5b00900
cffb305
 
5b00900
 
 
 
 
cffb305
 
5b00900
cffb305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa6b40b
bd51d10
5b00900
bd51d10
fa6b40b
bd51d10
fa6b40b
 
 
 
 
 
 
 
bd51d10
 
 
 
 
 
 
 
58fe58d
 
 
 
 
 
d14bce3
 
bd51d10
 
d0a0739
58fe58d
bd51d10
 
 
d14bce3
bd51d10
 
 
 
d14bce3
d0a0739
 
 
bd51d10
fa6b40b
bd51d10
 
 
 
 
fa6b40b
bd51d10
fa6b40b
 
bd51d10
 
 
 
fa6b40b
 
 
 
 
bd51d10
 
45ce7ef
fa6b40b
 
 
94039e3
 
 
bd51d10
45ce7ef
 
 
 
 
 
 
fa6b40b
bd51d10
8026e0e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
from flask import Blueprint, jsonify, request
from datasets import load_dataset
import json

# Blueprint exposing the selected-tools endpoints under /api/selected-tools.
bp = Blueprint("selected_tools", __name__, url_prefix="/api/selected-tools")

# Registry of dataset variants that can be served.
# Each key is the value accepted by the `variant` query parameter; each entry has:
#   "repo"        - dataset repository id passed to `load_dataset` in `_load`
#   "label"       - short name returned to clients alongside the rows
#   "description" - longer text returned to clients alongside the rows
# NOTE(review): the two "test300-gpt-oss-120b*" keys carry "Gemini 2.5 Pro 0/1"
# labels, which do not match their keys/repos — confirm this is intentional.
# NOTE(review): the random-seed entries go 0, 1, 3..7; there is no seed2 entry here.
VARIANTS: dict[str, dict] = {
    "traj_summary_orig_ext": {
        "repo": "timchen0618/browsecomp-plus-selected-tools-orig-analysis-v1",
        "label": "Summary of Trajectory",
        "description": "Selected tools (orig messages) · traj_summary_orig_ext conditioned",
    },
    # test300 variants — excerpt extracted from <trajectory_summary> in run files
    "test300-gpt-oss-120b-less-chars": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-less-chars-v1",
        "label": "Gemini 2.5 Pro 0",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0_less_chars",
    },
    "test300-gpt-oss-120b": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gpt-oss-120b-v1",
        "label": "Gemini 2.5 Pro 1",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_seed0",
    },
    "test300-gemini-2p5-pro": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gemini-2p5-pro-v1",
        "label": "Gemini 2.5 Pro 2",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_gemini-2.5-pro_1_seed0",
    },
    "test300-gemini-3p1-pro": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-gemini-3p1-pro-v1",
        "label": "Gemini 3.1 Pro Preview",
        "description": "test300 · traj_summary_orig_ext_selected_tools_gpt-oss-120b_gemini_3.1-pro-preview_seed0",
    },
    "test300-random-seed0": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed0-v1",
        "label": "Random Seed 0",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed0_gpt-oss-120b_seed0",
    },
    "test300-random-seed1": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed1-v1",
        "label": "Random Seed 1",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed1_gpt-oss-120b_seed0",
    },
    "test300-random-seed3": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed3-v1",
        "label": "Random Seed 3",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed3_gpt-oss-120b_seed0",
    },
    "test300-random-seed4": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed4-v1",
        "label": "Random Seed 4",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed4_gpt-oss-120b_seed0",
    },
    "test300-random-seed5": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed5-v1",
        "label": "Random Seed 5",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed5_gpt-oss-120b_seed0",
    },
    "test300-random-seed6": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed6-v1",
        "label": "Random Seed 6",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed6_gpt-oss-120b_seed0",
    },
    "test300-random-seed7": {
        "repo": "timchen0618/browsecomp-plus-sel-tools-test300-random-seed7-v1",
        "label": "Random Seed 7",
        "description": "test300 · traj_summary_orig_ext_selected_tools_random_seed7_gpt-oss-120b_seed0",
    },
}

# Variant used when a request does not supply a `variant` query parameter.
DEFAULT_VARIANT = "traj_summary_orig_ext"

# In-process memo of normalized rows, keyed by variant name.
# Filled by `_load`; an entry is dropped by the /reload endpoint before re-reading.
_cache: dict[str, list] = {}


def _parse_tool_counts(raw) -> dict:
    """Return the per-tool call counts of a row as a dict.

    Accepts either a JSON-encoded string or an already-decoded mapping.
    Anything missing, malformed, or not an object yields an empty dict so a
    single bad row cannot abort loading the whole dataset.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw or "{}")
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
    return raw if isinstance(raw, dict) else {}


def _parse_selected_indices(raw):
    """Decode ``selected_indices`` when it is stored as a JSON string.

    Non-string values are returned unchanged; an undecodable string becomes
    an empty list.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except ValueError:
        return []


def _row_to_record(row) -> dict:
    """Normalize one dataset row into the JSON-ready dict served by the API."""
    tool_counts = _parse_tool_counts(row.get("tool_call_counts"))
    # Only numeric counts contribute; a null or otherwise odd value must not
    # make the whole load fail.
    total_tool_calls = sum(
        count for count in tool_counts.values() if isinstance(count, (int, float))
    )
    new_traj = row.get("new_trajectory") or ""
    # The new trajectory counts as completed once it contains the final-answer marker.
    new_status = "completed" if "[Final Answer]" in new_traj else "incomplete"
    return {
        "query_id": str(row["query_id"]),
        "rationale": row.get("rationale") or "",
        "selected_indices": _parse_selected_indices(row["selected_indices"]),
        "k_requested": int(row["k_requested"]),
        "k_effective": int(row["k_effective"]),
        "excerpt": row["excerpt"],
        "new_trajectory": new_traj,
        "direct_answer": bool(row["direct_answer"]),
        "tool_call_counts": tool_counts,
        "total_tool_calls": total_tool_calls,
        "status": row["status"],
        "new_status": new_status,
        "question": row.get("question") or "",
        "correct_answer": row.get("correct_answer") or "",
        "correct": row.get("correct"),  # None if not available
    }


def _load(variant: str) -> list:
    """Return the normalized rows for *variant*, loading them on first use.

    Results are memoized in ``_cache``; later calls for the same variant
    return the cached list without touching the dataset again.

    Args:
        variant: Key into ``VARIANTS``.

    Returns:
        A list with one normalized dict per row of the dataset's ``train`` split.

    Raises:
        ValueError: If *variant* is not a key of ``VARIANTS``.
    """
    if variant in _cache:
        return _cache[variant]
    if variant not in VARIANTS:
        raise ValueError(f"Unknown variant: {variant!r}")
    ds = load_dataset(VARIANTS[variant]["repo"], split="train")
    rows = [_row_to_record(row) for row in ds]
    _cache[variant] = rows
    return rows


@bp.get("/")
def get_data():
    """Serve the rows of one variant together with the variant registry.

    Query parameters:
        variant: Key into ``VARIANTS``; defaults to ``DEFAULT_VARIANT``.

    Returns:
        200 with ``{"rows", "variant", "variants"}`` on success,
        400 with ``{"error"}`` when the variant is unknown (same status the
        reload endpoint uses for that condition),
        500 with ``{"error"}`` when loading or serializing the data fails.
    """
    variant = request.args.get("variant", DEFAULT_VARIANT)
    # An unknown variant is a client mistake, not a server failure.
    if variant not in VARIANTS:
        return jsonify({"error": f"Unknown variant: {variant!r}"}), 400
    try:
        rows = _load(variant)
        return jsonify({"rows": rows, "variant": variant, "variants": VARIANTS})
    except Exception as e:
        # Top-level boundary: report the failure as JSON rather than an HTML error page.
        return jsonify({"error": str(e)}), 500


@bp.get("/variants")
def get_variants():
    """Return the variant registry and the name of the default variant as JSON."""
    payload = {
        "variants": VARIANTS,
        "default": DEFAULT_VARIANT,
    }
    return jsonify(payload)


def _hf_hub_cache_dir() -> str:
    """Locate the Hugging Face hub cache directory.

    Honors the environment overrides documented by huggingface_hub
    (``HF_HUB_CACHE``, the legacy ``HUGGINGFACE_HUB_CACHE``, ``HF_HOME`` and
    ``XDG_CACHE_HOME``) and falls back to ``~/.cache/huggingface/hub``.
    """
    import os

    explicit = os.environ.get("HF_HUB_CACHE") or os.environ.get("HUGGINGFACE_HUB_CACHE")
    if explicit:
        return os.path.expanduser(explicit)
    hf_home = os.environ.get("HF_HOME")
    if not hf_home:
        cache_root = os.environ.get("XDG_CACHE_HOME") or "~/.cache"
        hf_home = os.path.join(cache_root, "huggingface")
    return os.path.join(os.path.expanduser(hf_home), "hub")


@bp.post("/reload")
def reload_data():
    """Drop cached data for one variant and load it again.

    Query parameters:
        variant: Key into ``VARIANTS``; defaults to ``DEFAULT_VARIANT``.

    Returns:
        200 with ``{"status": "ok", "count", "variant"}`` on success,
        400 with ``{"error"}`` when the variant is unknown,
        500 with ``{"error"}`` when purging the cache or reloading fails.
    """
    import os
    import shutil

    variant = request.args.get("variant", DEFAULT_VARIANT)
    # Validate before touching any state.
    if variant not in VARIANTS:
        return jsonify({"error": f"Unknown variant: {variant!r}"}), 400
    _cache.pop(variant, None)
    repo = VARIANTS[variant]["repo"]
    try:
        # Delete cached dataset dir so stale schema metadata doesn't block new columns
        dataset_cache_name = "datasets--" + repo.replace("/", "--")
        dataset_cache_path = os.path.join(_hf_hub_cache_dir(), dataset_cache_name)
        try:
            shutil.rmtree(dataset_cache_path)
        except FileNotFoundError:
            # Nothing cached on disk (or removed concurrently) — nothing to purge.
            pass
        rows = _load(variant)
        return jsonify({"status": "ok", "count": len(rows), "variant": variant})
    except Exception as e:
        # Top-level boundary: report the failure as JSON rather than an HTML error page.
        return jsonify({"error": str(e)}), 500