moccaram's picture
Replace v1 demo with v2 XGBoost-backed Gradio app (reference-backed rebuild)
8ba081b verified
"""Data loaders for the AAPL/SPY pipeline + EWM daily volatility (AFML Snippet 3.1).
The CSVs under ``data/raw/`` have a column-header bug: the header reads
``Open,High,Low,Close,Adj Close,Volume`` but the underlying yfinance frame was
saved after a ``sort_index(axis=1)`` so the actual column order is alphabetical:
``Adj Close, Close, High, Low, Open, Volume``. We override the headers on load.
"""
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
# Raw CSV directory: ``data/raw`` under the parent of this module's directory.
DATA_DIR = Path(__file__).resolve().parent.parent.joinpath("data", "raw")

# Labels applied positionally to the CSV columns on load, replacing the
# mislabeled header stored in the files (see the module docstring).
ACTUAL_COLUMN_ORDER = [
    "Date",
    "Adj Close",
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "company_name",
]
def load_ohlcv(ticker: str, data_dir: Path | None = None) -> pd.DataFrame:
"""Load a single-ticker OHLCV CSV from ``data/raw/``, fixing the column order."""
data_dir = data_dir or DATA_DIR
path = data_dir / f"{ticker}_stock_data_2010_2024.csv"
df = pd.read_csv(path, header=0, names=ACTUAL_COLUMN_ORDER, skiprows=1)
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date").sort_index()
return df[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
def load_aapl_with_spy() -> pd.DataFrame:
    """Return AAPL OHLCV joined with SPY's adjusted close and volume.

    The SPY columns are renamed to ``SPY_Close`` (taken from ``Adj Close``)
    and ``SPY_Volume``. The inner join keeps only dates present in both
    frames.
    """
    spy_renames = {"Adj Close": "SPY_Close", "Volume": "SPY_Volume"}

    aapl_frame = load_ohlcv("AAPL")
    spy_frame = load_ohlcv("SPY")

    # Keep only the SPY columns needed for market-relative features.
    spy_subset = spy_frame[list(spy_renames)].rename(columns=spy_renames)
    return aapl_frame.join(spy_subset, how="inner")
def get_daily_vol(close: pd.Series, span: int = 100) -> pd.Series:
    """Exponentially weighted std of daily returns (AFML Snippet 3.1, BonusPDF p.26).

    The result is used to size the horizontal barriers in triple-barrier
    labeling. NaNs are expected only at the leading edge, before the EWM
    window has enough observations.

    Args:
        close: Price series to compute period-over-period returns from.
        span: EWM span passed to ``Series.ewm``.

    Returns:
        Series aligned with ``close`` holding the EWM standard deviation.
    """
    ewm_window = close.pct_change().ewm(span=span)
    return ewm_window.std()
def cumulative_returns_path(close: pd.Series, t0, t1) -> pd.Series:
    """Return the cumulative-return path between ``t0`` and ``t1``.

    Each value is ``close / close[t0] - 1`` over the label slice
    ``close.loc[t0:t1]``.
    """
    window = close.loc[t0:t1]
    base_price = close.loc[t0]
    return window / base_price - 1