"""Evaluation metrics with statistical significance — triple-barrier era. The original notebook reported directional accuracy without binomial p-values; 49.9% over 499 days is statistically indistinguishable from 50%. This module makes that explicit by attaching a p-value to every accuracy figure. Metric conventions ------------------ - For 3-class labels ``{-1, 0, +1}``, the null is uniform random: ``p_null=1/3``. - For *directional accuracy when acting*, restrict to predictions ``in {-1, +1}`` (i.e. ignore "no-action" 0 predictions), compare to ``p_null=1/2``. - Both metrics use a one-sided binomial test (we only care if it beats chance). """ from __future__ import annotations import numpy as np import pandas as pd from sklearn.metrics import accuracy_score, confusion_matrix from .cv import binomial_pvalue def directional_accuracy_when_acting( y_true: np.ndarray, y_pred: np.ndarray ) -> tuple[float, int, int]: """Accuracy conditioned on the model predicting a non-zero direction. Returns ``(accuracy, n_correct, n_acting)``. If ``n_acting`` is 0, returns ``(nan, 0, 0)``. """ acting_mask = y_pred != 0 n_acting = int(acting_mask.sum()) if n_acting == 0: return float("nan"), 0, 0 correct = int(((y_pred == y_true) & acting_mask).sum()) return correct / n_acting, correct, n_acting def fold_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict: """Per-fold metric bundle. Designed to be one row in the comparison CSV.""" y_true = np.asarray(y_true) y_pred = np.asarray(y_pred) n = len(y_true) acc = accuracy_score(y_true, y_pred) n_acc_correct = int((y_true == y_pred).sum()) dir_acc, n_dir_correct, n_acting = directional_accuracy_when_acting(y_true, y_pred) return { "n_test": n, "accuracy": acc, "binom_p_acc": binomial_pvalue(n_acc_correct, n, p_null=1 / 3), "n_acting": n_acting, "dir_acc_when_acting": dir_acc, "binom_p_dir": ( binomial_pvalue(n_dir_correct, n_acting, p_null=0.5) if n_acting > 0 else float("nan") ), } def summarize_results(results: pd.DataFrame) -> pd.DataFrame: """Aggregate per-fold rows to per-model summary with mean ± std.""" keep = ["accuracy", "binom_p_acc", "dir_acc_when_acting", "binom_p_dir"] grouped = results.groupby("model")[keep] summary = grouped.agg(["mean", "std"]) summary.columns = [f"{c}_{stat}" for c, stat in summary.columns] summary["n_folds"] = results.groupby("model").size() return summary.reset_index() def confusion_table(y_true: np.ndarray, y_pred: np.ndarray, labels=(-1, 0, 1)) -> pd.DataFrame: """Confusion matrix as a labeled DataFrame (rows=true, cols=pred).""" cm = confusion_matrix(y_true, y_pred, labels=list(labels)) return pd.DataFrame( cm, index=[f"true_{c}" for c in labels], columns=[f"pred_{c}" for c in labels] )