Spaces:

moccaram
/

DataSynthis_ML_JobTask

Sleeping

App Files Files Community

DataSynthis_ML_JobTask / src /cv.py

moccaram

Replace v1 demo with v2 XGBoost-backed Gradio app (reference-backed rebuild)

8ba081b verified 7 days ago

raw

history blame contribute delete

3.71 kB

	"""Purged k-fold cross-validation — AFML Ch.7 (BonusPDF pp.62-67).

	Standard k-fold leaks information in finance because labels span time intervals.
	If a training label's interval ``[t_i, t1_i]`` overlaps a test label's interval
	``[t_j, t1_j]``, the two share underlying price information and the train/test
	boundary is fictitious. ``PurgedKFold`` drops the offending training samples;
	an additional ``pctEmbargo`` buffer drops samples immediately after each test
	fold to prevent reverse leakage from the test set into a later train fold.

	This is a port of AFML Snippets 7.2-7.3 (BonusPDF pp.65-66). The canonical class
	inherits from sklearn's ``_BaseKFold`` so it works as a drop-in replacement.
	"""

	from __future__ import annotations

	import numpy as np
	import pandas as pd
	from scipy import stats
	from sklearn.model_selection._split import _BaseKFold


	class PurgedKFold(_BaseKFold):
	    """K-fold CV with purging + optional embargo. AFML Snippet 7.3 (BonusPDF p.66).

	    Test folds are contiguous, unshuffled blocks of rows. For each test fold
	    the training set is built from two pieces:

	    * left train  -- rows whose label end time ``t1`` is at or before the
	      timestamp of the first test row;
	    * right train -- rows positioned at or after the latest label end time
	      found in the test fold, shifted further right by the embargo.

	    Parameters
	    ----------
	    n_splits : int
	        Number of folds.
	    t1 : pd.Series
	        Label end timestamps; its index holds the label start timestamps and
	        must equal the index of the ``X`` later passed to ``split``.
	        NOTE(review): ``searchsorted`` is used on this index, so it is
	        presumably expected to be sorted ascending -- confirm with callers.
	    pct_embargo : float
	        Fraction of the number of rows to skip after each test fold.

	    Raises
	    ------
	    ValueError
	        If ``t1`` is not a ``pd.Series`` or ``pct_embargo`` is negative.
	    """

	    def __init__(self, n_splits: int = 5, t1: pd.Series | None = None, pct_embargo: float = 0.0):
	        if not isinstance(t1, pd.Series):
	            raise ValueError("`t1` must be a pd.Series of label-end timestamps")
	        # A negative embargo would move the start of the right-hand training
	        # block backwards, into the span covered by the test labels.
	        if pct_embargo < 0:
	            raise ValueError("`pct_embargo` must be non-negative")
	        super().__init__(n_splits, shuffle=False, random_state=None)
	        self.t1 = t1
	        self.pct_embargo = pct_embargo

	    def split(self, X, y=None, groups=None):
	        """Yield ``(train_indices, test_indices)`` positional index arrays per fold.

	        ``y`` and ``groups`` are accepted for signature compatibility and are
	        not used.

	        Raises
	        ------
	        ValueError
	            If ``X.index`` differs from ``t1.index`` or there are more splits
	            than rows in ``X``.
	        """
	        if not X.index.equals(self.t1.index):
	            raise ValueError("X.index must equal t1.index")
	        n_samples = X.shape[0]
	        # np.array_split would hand back empty chunks here and indexing them
	        # fails with an unhelpful IndexError, so fail early and clearly.
	        if self.n_splits > n_samples:
	            raise ValueError(
	                f"Cannot have number of splits n_splits={self.n_splits} greater "
	                f"than the number of samples: n_samples={n_samples}."
	            )
	        indices = np.arange(n_samples)
	        embargo_size = int(n_samples * self.pct_embargo)
	        # Half-open positional [start, stop) range of every test fold.
	        test_ranges = [(arr[0], arr[-1] + 1) for arr in np.array_split(indices, self.n_splits)]

	        for i, j in test_ranges:
	            t0 = self.t1.index[i]  # timestamp of the first test row
	            test_indices = indices[i:j]
	            max_t1_in_test = self.t1.iloc[test_indices].max()
	            # Position of the first row whose start time is >= the latest
	            # label end time inside the test fold.
	            max_t1_pos = self.t1.index.searchsorted(max_t1_in_test)
	            # left train: rows whose label ended at or before the test start,
	            # converted from timestamps to positions
	            left_train = self.t1.index.searchsorted(self.t1[self.t1 <= t0].index)
	            # right train: rows starting after max-t1 + embargo
	            if max_t1_pos < n_samples:
	                right_train = indices[max_t1_pos + embargo_size :]
	            else:
	                right_train = np.array([], dtype=int)
	            train_indices = np.concatenate([left_train, right_train])
	            yield train_indices, test_indices


	def get_embargo_times(times: pd.DatetimeIndex, pct_embargo: float) -> pd.Series:
	    """AFML Snippet 7.2 (BonusPDF p.65). Map each timestamp to its embargo end.

	    The embargo length is ``step = int(len(times) * pct_embargo)`` rows. Each
	    timestamp is mapped to the timestamp ``step`` rows later; the final
	    ``step`` timestamps, which have no such successor, map to the last
	    timestamp. When ``step`` is 0 every timestamp maps to itself.

	    Parameters
	    ----------
	    times : pd.DatetimeIndex
	        Timestamps to map; they become the index of the result.
	    pct_embargo : float
	        Embargo size as a fraction of ``len(times)``; must be non-negative.

	    Returns
	    -------
	    pd.Series
	        Embargo end timestamps indexed by ``times``.

	    Raises
	    ------
	    ValueError
	        If ``pct_embargo`` is negative.
	    """
	    # A negative fraction yields a negative step, and the slices below would
	    # then silently pair the wrong timestamps instead of failing.
	    if pct_embargo < 0:
	        raise ValueError("`pct_embargo` must be non-negative")
	    step = int(times.shape[0] * pct_embargo)
	    if step == 0:
	        return pd.Series(times, index=times)
	    # Shift by `step` rows: row k maps to row k + step.
	    embargo = pd.Series(times[step:], index=times[:-step])
	    # The trailing `step` rows have nothing after them; pin them to the end.
	    tail = pd.Series(times[-1], index=times[-step:])
	    return pd.concat([embargo, tail])


	def binomial_pvalue(n_correct: int, n_total: int, p_null: float = 0.5) -> float:
	    """One-sided binomial p-value: ``P(X >= n_correct | n=n_total, p=p_null)``.

	    Used to test whether observed accuracy or directional accuracy exceeds the
	    null. For three-class targets, pass ``p_null=1/3``; for binary direction
	    after dropping 0-labels, pass ``p_null=0.5``.

	    Parameters
	    ----------
	    n_correct : int
	        Number of successes observed.
	    n_total : int
	        Number of trials.
	    p_null : float
	        Success probability under the null hypothesis.

	    Returns
	    -------
	    float
	        The one-sided ("greater") p-value, or NaN when ``n_total`` is 0.
	    """
	    # With no trials there is nothing to test; report NaN rather than
	    # forwarding n=0 to binomtest.
	    if n_total == 0:
	        return float("nan")
	    return float(stats.binomtest(n_correct, n_total, p=p_null, alternative="greater").pvalue)


	def proportion_ci(n_correct: int, n_total: int, alpha: float = 0.05) -> tuple[float, float]:
	    """Wilson score confidence interval for an accuracy proportion.

	    The confidence level is ``1 - alpha`` (95% with the default ``alpha``).

	    Parameters
	    ----------
	    n_correct : int
	        Number of successes observed.
	    n_total : int
	        Number of trials.
	    alpha : float
	        Significance level; the interval covers ``1 - alpha``.

	    Returns
	    -------
	    tuple[float, float]
	        ``(low, high)`` bounds, or ``(nan, nan)`` when ``n_total`` is 0.
	    """
	    if n_total != 0:
	        test_result = stats.binomtest(n_correct, n_total)
	        interval = test_result.proportion_ci(confidence_level=1 - alpha, method="wilson")
	        lower, upper = float(interval.low), float(interval.high)
	        return lower, upper
	    # No trials: the interval is undefined.
	    return (np.nan, np.nan)