moccaram's picture
Replace v1 demo with v2 XGBoost-backed Gradio app (reference-backed rebuild)
8ba081b verified
"""Evaluation metrics with statistical significance — triple-barrier era.
The original notebook reported directional accuracy without binomial p-values;
49.9% over 499 days is statistically indistinguishable from 50%. This module
makes that explicit by attaching a p-value to every accuracy figure.
Metric conventions
------------------
- For 3-class labels ``{-1, 0, +1}``, the null is uniform random: ``p_null=1/3``.
- For *directional accuracy when acting*, restrict to predictions ``in {-1, +1}``
(i.e. ignore "no-action" 0 predictions), compare to ``p_null=1/2``.
- Both metrics use a one-sided binomial test (we only care if it beats chance).
"""
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from .cv import binomial_pvalue
def directional_accuracy_when_acting(
y_true: np.ndarray, y_pred: np.ndarray
) -> tuple[float, int, int]:
"""Accuracy conditioned on the model predicting a non-zero direction.
Returns ``(accuracy, n_correct, n_acting)``. If ``n_acting`` is 0, returns
``(nan, 0, 0)``.
"""
acting_mask = y_pred != 0
n_acting = int(acting_mask.sum())
if n_acting == 0:
return float("nan"), 0, 0
correct = int(((y_pred == y_true) & acting_mask).sum())
return correct / n_acting, correct, n_acting
def fold_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
"""Per-fold metric bundle. Designed to be one row in the comparison CSV."""
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
n = len(y_true)
acc = accuracy_score(y_true, y_pred)
n_acc_correct = int((y_true == y_pred).sum())
dir_acc, n_dir_correct, n_acting = directional_accuracy_when_acting(y_true, y_pred)
return {
"n_test": n,
"accuracy": acc,
"binom_p_acc": binomial_pvalue(n_acc_correct, n, p_null=1 / 3),
"n_acting": n_acting,
"dir_acc_when_acting": dir_acc,
"binom_p_dir": (
binomial_pvalue(n_dir_correct, n_acting, p_null=0.5) if n_acting > 0 else float("nan")
),
}
def summarize_results(results: pd.DataFrame) -> pd.DataFrame:
"""Aggregate per-fold rows to per-model summary with mean ± std."""
keep = ["accuracy", "binom_p_acc", "dir_acc_when_acting", "binom_p_dir"]
grouped = results.groupby("model")[keep]
summary = grouped.agg(["mean", "std"])
summary.columns = [f"{c}_{stat}" for c, stat in summary.columns]
summary["n_folds"] = results.groupby("model").size()
return summary.reset_index()
def confusion_table(y_true: np.ndarray, y_pred: np.ndarray, labels=(-1, 0, 1)) -> pd.DataFrame:
"""Confusion matrix as a labeled DataFrame (rows=true, cols=pred)."""
cm = confusion_matrix(y_true, y_pred, labels=list(labels))
return pd.DataFrame(
cm, index=[f"true_{c}" for c in labels], columns=[f"pred_{c}" for c in labels]
)