moccaram's picture
Replace v1 demo with v2 XGBoost-backed Gradio app (reference-backed rebuild)
8ba081b verified
"""CV-aware training driver — one harness for all five models.
The driver expects each model to expose ``fit(X, y, sample_weight=None)``,
``predict(X)``, and (optionally) ``predict_proba(X)``. The triple-barrier label
``{-1, 0, +1}`` is shared across all of them.
Sample weights come from AFML Ch.4 — observations whose label intervals overlap
contribute less unique information, so they should count less in the loss. The
simplest implementation is to weight inversely by the number of overlapping
labels (Snippet 4.1); for now the driver supports passing pre-computed weights
or falling back to uniform.
"""
from __future__ import annotations
from collections.abc import Callable
from typing import Any
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from .cv import PurgedKFold
from .eval import fold_metrics
def fit_predict_one_fold(
    model_builder: Callable[[], Any],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    sample_weight_train: np.ndarray | None = None,
    standardize: bool = True,
) -> tuple[np.ndarray, Any]:
    """Train a freshly built model on one fold and predict the held-out rows.

    Args:
        model_builder: Zero-argument factory; called once to get the model.
        X_train: Training features.
        y_train: Training labels; passed to ``fit`` as ``y_train.values``.
        X_test: Features to predict on.
        sample_weight_train: Forwarded to ``fit`` as ``sample_weight``
            (``None`` is forwarded as-is).
        standardize: When true, a ``StandardScaler`` is fitted on the training
            features only and applied to both frames, so no test-fold
            statistics leak into the scaling.

    Returns:
        ``(y_pred, fitted_model)`` where ``y_pred`` is whatever the model's
        ``predict`` returns for the test features.
    """
    # Default to passing the frames through untouched.
    train_features, test_features = X_train, X_test
    if standardize:
        scaler = StandardScaler().fit(X_train.values)
        # Re-wrap the scaled arrays so index and column labels survive.
        train_features, test_features = (
            pd.DataFrame(
                scaler.transform(frame.values),
                index=frame.index,
                columns=frame.columns,
            )
            for frame in (X_train, X_test)
        )

    fitted = model_builder()
    fitted.fit(train_features, y_train.values, sample_weight=sample_weight_train)
    predictions = fitted.predict(test_features)
    return predictions, fitted
def run_cv(
    model_name: str,
    model_builder: Callable[[], Any],
    X: pd.DataFrame,
    y: pd.Series,
    cv: PurgedKFold,
    sample_weight: pd.Series | None = None,
    standardize: bool = True,
    extra_columns: dict | None = None,
) -> pd.DataFrame:
    """Evaluate one model over every split produced by ``cv``.

    Args:
        model_name: Stored in the ``"model"`` column of every row.
        model_builder: Zero-argument factory; a new model is built per fold.
        X: Feature frame, sliced positionally by the split indices.
        y: Labels, sliced positionally by the same indices.
        cv: Splitter whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.
        sample_weight: Optional per-row weights; only the training slice is
            used. ``None`` means no weights are passed to the model.
        standardize: Forwarded to ``fit_predict_one_fold``.
        extra_columns: Optional constant columns merged into every row; on a
            key clash these values replace the built-in ones.

    Returns:
        A DataFrame with one row per fold holding ``"model"``, ``"fold"``,
        the entries returned by ``fold_metrics`` and any extra columns.
    """
    extras = extra_columns or {}
    has_weights = sample_weight is not None

    records = []
    for fold_no, (fit_rows, eval_rows) in enumerate(cv.split(X)):
        X_fit, X_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        y_fit, y_eval = y.iloc[fit_rows], y.iloc[eval_rows]
        fold_weights = sample_weight.iloc[fit_rows].values if has_weights else None

        # The fitted model is not needed here, only its predictions.
        predictions, _ = fit_predict_one_fold(
            model_builder=model_builder,
            X_train=X_fit,
            y_train=y_fit,
            X_test=X_eval,
            sample_weight_train=fold_weights,
            standardize=standardize,
        )

        scores = fold_metrics(y_eval.values, predictions)
        records.append({"model": model_name, "fold": fold_no, **scores, **extras})

    return pd.DataFrame(records)
def uniqueness_weights(t1: pd.Series) -> pd.Series:
    """Approximate AFML Ch.4 sample-uniqueness weights.

    Each event is treated as the closed interval ``[start, t1]``, where
    ``start`` is the event's index label and ``t1`` its value. An event's raw
    weight is the reciprocal of the number of events whose interval intersects
    its own; that count includes the event itself whenever ``t1 >= start``.
    Not the rigorous Snippet 4.1 (which counts overlap proportionally), but
    the right order of magnitude and much faster.

    Cost is one vectorised comparison over all events per event, i.e.
    quadratic in ``len(t1)``.

    Args:
        t1: Interval end for each event, indexed by the interval start.

    Returns:
        Float weights on ``t1.index``, rescaled so they sum to ``len(t1)``
        (mean weight 1). An empty ``t1`` yields an empty Series.
    """
    n = len(t1)
    if n == 0:
        # Nothing to weight; also avoids the 0 / 0 normalisation factor.
        return pd.Series(1.0, index=t1.index)

    ends = t1.values
    starts = t1.index.values

    # Intervals i and j intersect iff start_j <= end_i and end_j >= start_i.
    overlap_counts = np.fromiter(
        (
            np.count_nonzero((starts <= end) & (ends >= start))
            for start, end in zip(starts, ends)
        ),
        dtype=np.int64,
        count=n,
    )

    # Floor at 1 so an event that matches nothing (e.g. a missing or inverted
    # interval, whose comparisons are all false) cannot divide by zero.
    weights = pd.Series(1.0 / np.maximum(overlap_counts, 1), index=t1.index)

    # normalize so the weights sum to n (mean weight = 1)
    weights *= n / weights.sum()
    return weights