Spaces:

moccaram
/

DataSynthis_ML_JobTask

Sleeping

App Files Files Community

DataSynthis_ML_JobTask / src /app.py

moccaram

Replace v1 demo with v2 XGBoost-backed Gradio app (reference-backed rebuild)

8ba081b verified 7 days ago

raw

history blame contribute delete

7.73 kB

	"""Gradio demo — AAPL triple-barrier direction classifier (educational).

	Loads the XGBoost model (the headline winner in this study, mean test accuracy
	~38% vs 33% random) and lets the user pick any date in the available range to
	inspect the next-10-day direction prediction with class probabilities.

	This is a portfolio artifact. The directional accuracy when the model
	actually picks a side is ~36% — worse than random. Do not trade on this.
	"""

	from __future__ import annotations

	import io
	import sys
	import warnings
	from pathlib import Path

	warnings.filterwarnings("ignore")

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd

	ROOT = Path(__file__).resolve().parent.parent
	sys.path.insert(0, str(ROOT))

	from src.data import load_aapl_with_spy, get_daily_vol
	from src.features import frac_diff_ffd
	from src.labeling import cusum_filter, get_events, get_bins, drop_labels
	from src.models.xgb_model import XGBTripleBarrier


	CLASS_LABELS = {-1: "DOWN (stop-loss first)", 0: "FLAT (time-out, no signal)", 1: "UP (profit-taking first)"}


	def build_features_and_labels():
	"""Rebuild the full feature matrix + triple-barrier labels at startup."""
	df = load_aapl_with_spy()
	close = df["Adj Close"]
	log_returns = np.log(close).diff().dropna()
	daily_vol = get_daily_vol(close, span=100)

	features = pd.DataFrame(index=df.index)
	features["frac_diff_close"] = frac_diff_ffd(np.log(close).to_frame("c"), 0.4, thres=1e-5)["c"]
	features["frac_diff_volume"] = frac_diff_ffd(
	np.log(df["Volume"].replace(0, np.nan)).to_frame("v"), 0.4, thres=1e-5
	)["v"]
	features["hl_range"] = (df["High"] - df["Low"]) / df["Close"]
	features["spy_return"] = np.log(df["SPY_Close"]).diff()
	features["volatility_20d"] = log_returns.rolling(20).std()
	features["rolling_beta"] = (
	log_returns.rolling(30).cov(features["spy_return"])
	/ features["spy_return"].rolling(30).var()
	)
	features["day_of_week"] = df.index.dayofweek
	features["vol_regime"] = daily_vol / daily_vol.rolling(252, min_periods=60).median()
	features = features.dropna()

	t_events = cusum_filter(np.log(close), threshold=float(daily_vol.median()))
	events = get_events(
	close=close, t_events=t_events, pt_sl=(2.0, 2.0),
	target=daily_vol, min_ret=0.005, num_days=10,
	)
	labels = get_bins(events, close)
	events_with_labels = events.join(labels[["bin"]])
	events_with_labels = drop_labels(events_with_labels, min_pct=0.05)
	labels = labels.loc[events_with_labels.index]

	aligned = features.index.intersection(labels.index)
	return df, close, features, labels.loc[aligned, "bin"].astype(int), features.loc[aligned]


	print("Loading data and training XGBoost (one-time, ~10 sec)...")
	DF, CLOSE, FEATURES_FULL, Y_TRAIN, X_TRAIN_ALIGNED = build_features_and_labels()

	from sklearn.preprocessing import StandardScaler
	SCALER = StandardScaler().fit(X_TRAIN_ALIGNED.values)
	MODEL = XGBTripleBarrier(random_state=42)
	MODEL.fit(
	pd.DataFrame(SCALER.transform(X_TRAIN_ALIGNED.values), index=X_TRAIN_ALIGNED.index, columns=X_TRAIN_ALIGNED.columns),
	Y_TRAIN.values,
	)
	print(f"Model trained on {len(X_TRAIN_ALIGNED)} labeled events. Ready.")

	VALID_DATES = FEATURES_FULL.index
	DEFAULT_DATE = VALID_DATES[-1]


	def predict(date_str: str):
	try:
	date = pd.Timestamp(date_str)
	except Exception:
	return "Invalid date format. Use YYYY-MM-DD.", None, None

	available = FEATURES_FULL.index[FEATURES_FULL.index <= date]
	if len(available) == 0:
	return f"No features available on or before {date.date()}. Try a later date.", None, None
	use_date = available[-1]

	x_row = FEATURES_FULL.loc[[use_date]]
	x_scaled = pd.DataFrame(SCALER.transform(x_row.values), index=x_row.index, columns=x_row.columns)
	proba = MODEL.predict_proba(x_scaled)[0]
	pred_class = int(MODEL.classes_[np.argmax(proba)])

	proba_df = pd.DataFrame(
	{"class": [CLASS_LABELS[c] for c in MODEL.classes_], "probability": [f"{p:.1%}" for p in proba]}
	)

	end_idx = DF.index.get_loc(use_date)
	start_idx = max(0, end_idx - 59)
	chart_data = DF["Adj Close"].iloc[start_idx : end_idx + 1]

	fig, ax = plt.subplots(figsize=(8, 3.5))
	ax.plot(chart_data.index, chart_data.values, color="black", lw=1.0)
	ax.scatter([chart_data.index[-1]], [chart_data.iloc[-1]], color="red", s=40, zorder=3, label=f"As-of: {use_date.date()}")
	ax.set_title(f"AAPL adjusted close — 60 days ending {use_date.date()}")
	ax.set_ylabel("Price ($)")
	ax.legend(loc="best")
	ax.grid(alpha=0.3)
	plt.tight_layout()

	summary = (
	f"As-of date: {use_date.date()} \n"
	f"Last close: ${chart_data.iloc[-1]:.2f} \n"
	f"Prediction (next 10 trading days): {CLASS_LABELS[pred_class]} \n"
	f"Confidence (max class probability): {proba.max():.1%}"
	)
	return summary, proba_df, fig


	def build_interface():
	import gradio as gr

	caveat = """
	> ⚠️ This is an educational portfolio artifact, NOT a trading signal.
	>
	> Under 5-fold purged k-fold cross-validation (López de Prado, AFML, Ch.7), this XGBoost
	> classifier reaches mean accuracy ~38% on a 3-class triple-barrier label set (random baseline
	> = 33%, p<0.05 in 3 of 5 folds). However, *directional accuracy when the model picks a side*
	> is ~36% — worse than coin flip**. The model is mildly informative about "will something
	> happen vs nothing" but uninformative about "up vs down." Do not trade real money on this.
	"""

	with gr.Blocks(title="AAPL Triple-Barrier Direction Classifier") as demo:
	gr.Markdown("# AAPL Triple-Barrier Direction Classifier (educational)")
	gr.Markdown(caveat)
	gr.Markdown(
	"Reference-backed financial-ML pipeline: triple-barrier labeling "
	"(AFML Ch.3), fractional differentiation (Ch.5), purged k-fold CV (Ch.7), "
	"XGBoost classifier. Repo: this folder."
	)

	with gr.Row():
	with gr.Column(scale=1):
	date_input = gr.Textbox(
	label="As-of date (YYYY-MM-DD)",
	value=str(DEFAULT_DATE.date()),
	info=f"Valid range: {VALID_DATES[0].date()} → {VALID_DATES[-1].date()}",
	)
	predict_btn = gr.Button("Predict next 10-day direction", variant="primary")
	summary_md = gr.Markdown()
	proba_table = gr.Dataframe(headers=["class", "probability"], label="Class probabilities")

	with gr.Column(scale=2):
	chart = gr.Plot(label="60-day price context")

	predict_btn.click(
	fn=predict, inputs=[date_input], outputs=[summary_md, proba_table, chart]
	)

	gr.Markdown(
	"---\n"
	"Headline result table (mean over 5 purged folds):\n\n"
	"\| Model \| Accuracy \| Beat random (p<0.05) \| Dir.acc when acting \|\n"
	"\|-----------\|----------\|----------------------\|---------------------\|\n"
	"\| Majority \| 35.0% \| 0/5 folds \| N/A \|\n"
	"\| SES \| 36.8% \| 2/5 folds \| always abstains \|\n"
	"\| ARIMA \| 36.8% \| 2/5 folds \| always abstains \|\n"
	"\| LSTM \| 35.8% \| 2/5 folds \| 33% (worse than 50%) \|\n"
	"\| XGBoost \| 37.8% \| 3/5 folds \| 36% (worse than 50%) \|\n"
	)

	return demo


	if __name__ == "__main__":
	app = build_interface()
	app.launch(server_name="127.0.0.1", server_port=7860, inbrowser=False, share=False)