# Page-status residue captured with this file ("Spaces: Sleeping"); not part of the module.
"""Data loaders for the AAPL/SPY pipeline + EWM daily volatility (AFML Snippet 3.1).

The CSVs under ``data/raw/`` have a column-header bug: the header reads
``Open,High,Low,Close,Adj Close,Volume`` but the underlying yfinance frame was
saved after a ``sort_index(axis=1)``, so the actual column order is alphabetical:
``Adj Close, Close, High, Low, Open, Volume``. We override the headers on load.
"""
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
# Default location of the raw per-ticker CSVs: ``data/raw`` under the parent of
# the directory that contains this file.
DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "raw"

# Column names applied positionally when a raw CSV is read. The header stored in
# the file is not trusted; the price fields sit in alphabetical order between
# "Date" and "company_name".
ACTUAL_COLUMN_ORDER = [
    "Date",
    "Adj Close",
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "company_name",
]
def load_ohlcv(ticker: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Load a single-ticker OHLCV CSV, fixing the column order.

    Args:
        ticker: Symbol used to build the file name
            ``{ticker}_stock_data_2010_2024.csv``.
        data_dir: Directory that contains the CSV. ``None`` (the default)
            selects the module-level ``DATA_DIR``.

    Returns:
        A frame indexed by ``Date`` (parsed with ``pd.to_datetime`` and sorted
        ascending) whose columns are, in order,
        ``Open, High, Low, Close, Adj Close, Volume``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    path = data_dir / f"{ticker}_stock_data_2010_2024.csv"

    # The header text stored in the file does not match the column positions,
    # so the names are supplied explicitly rather than read from the file.
    # NOTE(review): ``skiprows=1`` drops the first physical line, and
    # ``header=0`` then consumes the following line as the header that
    # ``names`` replaces -- two lines are discarded in total. Confirm the CSVs
    # really have two non-data lines at the top; otherwise the first data row
    # is silently lost.
    df = pd.read_csv(path, header=0, names=ACTUAL_COLUMN_ORDER, skiprows=1)

    df["Date"] = pd.to_datetime(df["Date"])
    df = df.set_index("Date").sort_index()

    # Return the conventional OHLCV ordering; "company_name" is dropped here.
    return df[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
def load_aapl_with_spy() -> pd.DataFrame:
    """Merged AAPL + SPY frame for market-relative features.

    Returns:
        The AAPL frame produced by ``load_ohlcv("AAPL")`` with two extra
        columns taken from SPY: ``SPY_Close`` (SPY's ``Adj Close`` column,
        renamed) and ``SPY_Volume``. The join is an inner join on the index,
        so only dates present in both frames are kept.
    """
    aapl = load_ohlcv("AAPL")
    # Only the adjusted close and the volume are carried over from SPY; they
    # are renamed so they cannot collide with the AAPL columns.
    spy = load_ohlcv("SPY")[["Adj Close", "Volume"]].rename(
        columns={"Adj Close": "SPY_Close", "Volume": "SPY_Volume"}
    )
    return aapl.join(spy, how="inner")
def get_daily_vol(close: pd.Series, span: int = 100) -> pd.Series:
    """EWM daily-return volatility -- AFML Snippet 3.1 (BonusPDF p.26).

    Intended for sizing the horizontal barriers in triple-barrier labeling.

    Args:
        close: Price series; returns are taken between consecutive rows.
        span: Span of the exponentially weighted window passed to
            ``Series.ewm``.

    Returns:
        A series aligned with ``close`` holding the exponentially weighted
        standard deviation of the row-over-row percentage returns. The leading
        values are NaN: the first row has no return, and a standard deviation
        needs at least two returns.
    """
    returns = close.pct_change()
    return returns.ewm(span=span).std()
def cumulative_returns_path(close: pd.Series, t0, t1) -> pd.Series:
    """Return path from t0 to t1 expressed as ``close/close[t0] - 1``.

    Args:
        close: Price series.
        t0: Index label of the starting observation; it is looked up directly
            with ``close.loc[t0]`` and used as the base price.
        t1: Index label of the final observation.

    Returns:
        The slice ``close.loc[t0:t1]`` (label-based, so both endpoints are
        included) rebased so that the value at ``t0`` is ``0``.
    """
    base_price = close.loc[t0]
    return close.loc[t0:t1] / base_price - 1