Chapter 11: Scoring, Validation & Explanations¶
End-to-end scoring pipeline with holdout validation, model comparison, adversarial validation, SHAP explanations, and error analysis.
Sections:
- Run Scoring
- Summary Metrics
- Model Comparison Grid
- Adversarial Pipeline Validation
- Transformation Validation
- Model Explanations (SHAP)
- Customer Browser
- Error Analysis
- Export Results
Show/Hide Code
# Record pipeline progress and export the previously-run notebook's HTML
# documentation before doing anything else in this notebook.
from customer_retention.analysis.notebook_progress import track_and_export_previous

track_and_export_previous("11_scoring_validation.ipynb")

import sys
from pathlib import Path

# Re-exported names kept available for downstream cells (hence the noqa).
from customer_retention.core.config.experiments import (  # noqa: F401
    FINDINGS_DIR,
    OUTPUT_DIR,
    setup_experiments_structure,
)
Show/Hide Code
# Resolve the ScoringConfig for the current platform (Databricks vs local)
# and expose the notebook-wide constants later cells rely on.
from customer_retention.core.compat.detection import is_databricks
from customer_retention.stages.scoring import ScoringConfig, ScoringDataLoader

if is_databricks():
    try:
        config = ScoringConfig.from_databricks()
    except ValueError as e:
        import os
        # Print the environment variables the Databricks path depends on so
        # the failure is actionable, then re-raise the original error.
        print(f"ERROR: {e}")
        print("\nDiagnostic info:")
        print(f" CR_EXPERIMENT_NAME = {os.environ.get('CR_EXPERIMENT_NAME', '(not set)')}")
        print(f" CR_CATALOG = {os.environ.get('CR_CATALOG', '(not set)')}")
        print(f" CR_SCHEMA = {os.environ.get('CR_SCHEMA', '(not set)')}")
        print("\nOn Databricks, experiments are created under /Users/{username}/.")
        print("Set CR_EXPERIMENT_NAME to the full path, e.g.:")
        print(' os.environ["CR_EXPERIMENT_NAME"] = "/Users/you@example.com/customer_churn"')
        raise
else:
    # Local: use the most recent generated pipeline (lexicographically last
    # directory containing a config.py).
    generated_dir = Path("../generated_pipelines/local")
    pipeline_dirs = sorted(generated_dir.glob("*/config.py"))
    if not pipeline_dirs:
        raise FileNotFoundError(
            f"No generated pipeline found under {generated_dir}. Run notebook 10 first."
        )
    config = ScoringConfig.from_local_config(pipeline_dirs[-1].parent)

loader = ScoringDataLoader(config)

# Notebook-wide constants used by every subsequent section.
PIPELINE_NAME = config.pipeline_name
TARGET_COLUMN = config.target_column
ENTITY_KEY = config.entity_key
RECOMMENDATIONS_HASH = config.recommendations_hash
ORIGINAL_COLUMN = config.original_column

print(f"Pipeline: {PIPELINE_NAME}")
print(f"Platform: {'Databricks' if config.is_databricks else 'Local'}")
print(f"Experiments dir: {config.experiments_dir}")
print(f"Recommendations hash: {RECOMMENDATIONS_HASH}")
Pipeline: customer_churn Platform: Local Experiments dir: /Users/Vital/python/CustomerRetention/experiments Recommendations hash: ddd6956b
11.1 Run Scoring¶
Show/Hide Code
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import pandas as pd
import xgboost as xgb

from customer_retention.transforms import ArtifactStore, TransformExecutor

# Load the transform artifacts saved at training time; re-applying (not
# re-fitting) them is what keeps scoring consistent with training.
_registry = ArtifactStore.from_manifest(Path(config.artifacts_path) / "manifest.yaml")
_executor = TransformExecutor()
ENCODINGS, SCALINGS = loader.load_transforms()

PREDICTIONS_PATH = config.production_dir / "data" / "scoring" / "predictions.parquet"
mlflow.set_tracking_uri(config.mlflow_tracking_uri)

features_df = loader.load_gold_features()
if ORIGINAL_COLUMN not in features_df.columns:
    raise ValueError(
        f"No holdout found (column '{ORIGINAL_COLUMN}' missing). "
        "Holdout must be created in silver layer BEFORE gold layer feature computation."
    )

# Holdout rows: the target was masked (NaN) but the original label preserved.
scoring_mask = features_df[TARGET_COLUMN].isna() & features_df[ORIGINAL_COLUMN].notna()
scoring_df = features_df[scoring_mask].copy()
print(f"Found {len(scoring_df):,} holdout records for scoring")

scoring_features = loader.load_scoring_features(scoring_df)
model, model_uri = loader.load_model()
print(f"Loading model: {model_uri}")
# Gate flag consumed by every later section; flipped off on degenerate input.
_SCORING_AVAILABLE = True


def prepare_features(df):
    # Apply the saved encoding + scaling transforms. Reused by sections
    # 11.3-11.6, so its name and signature must stay stable.
    return loader.prepare_features(df, ENCODINGS + SCALINGS, _executor, _registry)


X = prepare_features(scoring_features)
y_true = scoring_features[ORIGINAL_COLUMN].values

if X.shape[1] == 0:
    # Degenerate case: stale model artifacts produced an empty feature matrix.
    print(
        "WARNING: Feature matrix has 0 columns after preparation.\n"
        "The model was likely trained before feature selection was fixed.\n"
        "Re-run notebooks 08 and 10 to retrain, then re-run this notebook.\n"
        "Skipping scoring validation."
    )
    _SCORING_AVAILABLE = False
    # Empty frame so later cells that reference predictions_df still run.
    predictions_df = pd.DataFrame(columns=[ENTITY_KEY, "prediction", "probability", "actual", "correct"])

if _SCORING_AVAILABLE:
    # Reorder/fill/drop columns so X matches the model's training features.
    X, _missing_feats, _extra_feats = loader.align_features_to_model(X, model)
    if _missing_feats:
        print(f"WARNING: {len(_missing_feats)} features missing from scoring data (filled with 0):")
        for f in _missing_feats[:10]:
            print(f" - {f}")
        if len(_missing_feats) > 10:
            print(f" ... and {len(_missing_feats) - 10} more")
    if _extra_feats:
        print(f"INFO: {len(_extra_feats)} extra features in scoring data (dropped)")

if _SCORING_AVAILABLE:
    print("Generating predictions...")
    # sklearn-style models expose predict_proba; a raw xgboost Booster
    # needs a DMatrix and returns positive-class probabilities directly.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X)[:, 1]
    else:
        y_proba = model.predict(xgb.DMatrix(X, feature_names=list(X.columns)))
    # Classification threshold used throughout the notebook.
    y_pred = (y_proba >= 0.5).astype(int)
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        # ROC-AUC is undefined when the holdout has a single class; report 0.0.
        "roc_auc": roc_auc_score(y_true, y_proba) if len(np.unique(y_true)) > 1 else 0.0,
    }
    print("\nValidation Metrics (vs original values):")
    for name, value in metrics.items():
        print(f" {name}: {value:.4f}")
    # Persist per-entity predictions for sections 11.2-11.9 and notebook 12.
    predictions_df = pd.DataFrame({
        ENTITY_KEY: scoring_df[ENTITY_KEY].values,
        "prediction": y_pred,
        "probability": y_proba,
        "actual": y_true,
        "correct": (y_pred == y_true).astype(int),
    })
    PREDICTIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    predictions_df.to_parquet(PREDICTIONS_PATH, index=False)
    print(f"\nPredictions saved: {PREDICTIONS_PATH}")
    print(f"Correct: {predictions_df['correct'].sum():,}/{len(predictions_df):,} ({predictions_df['correct'].mean():.1%})")
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/15 09:14:40 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
Found 3,076 holdout records for scoring
2026/02/15 09:14:40 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/15 09:14:40 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Loading model: runs:/e887a61ceea7436a9caaaabee28c516a/model_xgboost_ddd6956b WARNING: 14 features missing from scoring data (filled with 0): - favday_mode_24h - city_mode_24h - favday_mode_7d - city_mode_7d - favday_mode_30d - city_mode_30d - favday_mode_90d - city_mode_90d - favday_mode_180d - city_mode_180d ... and 4 more Generating predictions... Validation Metrics (vs original values): accuracy: 0.9519 precision: 0.9479 recall: 0.9947 f1: 0.9707 roc_auc: 0.9687 Predictions saved: /Users/Vital/python/CustomerRetention/experiments/data/scoring/predictions.parquet Correct: 2,928/3,076 (95.2%)
11.2 Summary Metrics¶
Show/Hide Code
if _SCORING_AVAILABLE:
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Recompute display metrics from predictions_df (the frame built in 11.1).
    y_true = predictions_df["actual"]
    y_pred = predictions_df["prediction"]
    y_proba = predictions_df["probability"]
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, zero_division=0),
        # Guard: ROC-AUC requires both classes to be present.
        "ROC-AUC": roc_auc_score(y_true, y_proba) if len(np.unique(y_true)) > 1 else 0.0,
    }
    print("\n=== Scoring Validation Metrics ===")
    for name, value in metrics.items():
        print(f" {name}: {value:.4f}")
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(f" TN={cm[0,0]:,} FP={cm[0,1]:,}")
    print(f" FN={cm[1,0]:,} TP={cm[1,1]:,}")
else:
    print("Sections 11.2-11.9 skipped: retrain model via notebooks 08 + 10")
=== Scoring Validation Metrics === Accuracy: 0.9519 Precision: 0.9479 Recall: 0.9947 F1 Score: 0.9707 ROC-AUC: 0.9687 Confusion Matrix: TN=474 FP=135 FN=13 TP=2,454
Show/Hide Code
if _SCORING_AVAILABLE:
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve

    # Left panel: ROC curve. Right panel: predicted-probability histogram
    # split by actual class, with the 0.5 decision threshold marked.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    axes[0].plot(fpr, tpr, "b-", lw=2, label=f"ROC (AUC={metrics['ROC-AUC']:.3f})")
    axes[0].plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
    axes[0].set_xlabel("False Positive Rate")
    axes[0].set_ylabel("True Positive Rate")
    axes[0].set_title("ROC Curve")
    axes[0].legend()
    axes[1].hist(y_proba[y_true == 0], bins=30, alpha=0.5, label="Actual=0", color="blue")
    axes[1].hist(y_proba[y_true == 1], bins=30, alpha=0.5, label="Actual=1", color="red")
    axes[1].axvline(x=0.5, color="black", linestyle="--", label="Threshold")
    axes[1].set_xlabel("Predicted Probability")
    axes[1].set_ylabel("Count")
    axes[1].set_title("Probability Distribution")
    axes[1].legend()
    plt.tight_layout()
    plt.show()
11.3 Model Comparison Grid¶
Compare all trained models (Logistic Regression, Random Forest, XGBoost) on the holdout set.
Grid Layout:
- Row 1: Confusion matrices (counts and percentages)
- Row 2: ROC curves with AUC scores
- Row 3: Precision-Recall curves with PR-AUC scores
Show/Hide Code
if _SCORING_AVAILABLE:
    from IPython.display import display
    from sklearn.metrics import (
        accuracy_score,
        average_precision_score,
        confusion_matrix,
        f1_score,
        precision_recall_curve,
        precision_score,
        recall_score,
        roc_auc_score,
        roc_curve,
    )

    # Reload every trained model by logged-model name and score the same
    # holdout features, so the comparison grid is apples-to-apples.
    mlflow.set_tracking_uri(config.mlflow_tracking_uri)
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(PIPELINE_NAME)
    X_holdout = prepare_features(scoring_features)
    y_actual = predictions_df["actual"].values
    logged_models = client.search_logged_models(experiment_ids=[experiment.experiment_id])

    model_types = ["logistic_regression", "random_forest", "xgboost"]
    model_display_names = ["Logistic Regression", "Random Forest", "XGBoost"]
    loaded_models = {}
    model_predictions = {}
    for model_type, display_name in zip(model_types, model_display_names):
        model_name_pattern = f"model_{model_type}"
        if RECOMMENDATIONS_HASH:
            model_name_pattern = f"{model_name_pattern}_{RECOMMENDATIONS_HASH}"
        # Keep only the most recently created logged model with this name.
        matching_model = None
        for lm in logged_models:
            if lm.name == model_name_pattern:
                if matching_model is None or lm.creation_timestamp > matching_model.creation_timestamp:
                    matching_model = lm
        if matching_model:
            try:
                if "xgboost" in model_type:
                    m = mlflow.xgboost.load_model(matching_model.model_uri)
                else:
                    m = mlflow.sklearn.load_model(matching_model.model_uri)
                # Each model may have been trained on a slightly different
                # feature set, so align the holdout matrix per model.
                X_aligned, _, _ = loader.align_features_to_model(X_holdout, m)
                if "xgboost" in model_type:
                    dmatrix = xgb.DMatrix(X_aligned, feature_names=list(X_aligned.columns))
                    yp = m.predict(dmatrix)
                else:
                    yp = m.predict_proba(X_aligned)[:, 1]
                # Fix: use >= 0.5 to match the threshold in section 11.1
                # (previously > 0.5, which classified p == 0.5 differently).
                y_p = (yp >= 0.5).astype(int)
                loaded_models[display_name] = m
                model_predictions[display_name] = {"y_pred": y_p, "y_proba": yp}
                print(f"Loaded {display_name}: ROC-AUC = {roc_auc_score(y_actual, yp):.4f}")
            except Exception as e:
                # Best-effort: a model type may legitimately be absent.
                print(f"Could not load {display_name}: {e}")
    print(f"\nLoaded {len(loaded_models)} models for comparison")
Loaded Logistic Regression: ROC-AUC = 0.9675 Loaded Random Forest: ROC-AUC = 0.9653 Loaded XGBoost: ROC-AUC = 0.9687 Loaded 3 models for comparison
Show/Hide Code
if _SCORING_AVAILABLE:
    n_models = len(model_predictions)
    if n_models > 0:
        # Grid: 3 rows (confusion matrix / ROC / PR), one column per model.
        fig, axes = plt.subplots(3, n_models, figsize=(5 * n_models, 12))
        if n_models == 1:
            # subplots() returns a 1-D array for a single column; force 2-D
            # so axes[row, col] indexing works uniformly.
            axes = axes.reshape(-1, 1)
        colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]
        for col_idx, (name, preds) in enumerate(model_predictions.items()):
            y_p = preds["y_pred"]
            yp = preds["y_proba"]
            color = colors[col_idx % len(colors)]
            # Row 1: confusion matrix annotated with counts and percentages.
            cm = confusion_matrix(y_actual, y_p)
            ax = axes[0, col_idx]
            ax.imshow(cm, cmap="Blues")
            ax.set_xticks([0, 1])
            ax.set_yticks([0, 1])
            ax.set_xticklabels(["Pred 0", "Pred 1"])
            ax.set_yticklabels(["Actual 0", "Actual 1"])
            for i in range(2):
                for j in range(2):
                    pct = cm[i, j] / cm.sum() * 100
                    # White text on dark cells, black on light, for contrast.
                    ax.text(j, i, f"{cm[i, j]}\n({pct:.1f}%)", ha="center", va="center",
                            color="white" if cm[i, j] > cm.max() / 2 else "black", fontsize=10)
            acc = accuracy_score(y_actual, y_p)
            ax.set_title(f"{name}\nAccuracy: {acc:.3f}", fontsize=11, fontweight="bold")
            # Row 2: ROC curve with chance diagonal.
            ax = axes[1, col_idx]
            fpr, tpr, _ = roc_curve(y_actual, yp)
            auc = roc_auc_score(y_actual, yp)
            ax.plot(fpr, tpr, color=color, lw=2, label=f"AUC = {auc:.4f}")
            ax.plot([0, 1], [0, 1], "k--", lw=1, alpha=0.5)
            ax.fill_between(fpr, tpr, alpha=0.2, color=color)
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.set_title("ROC Curve", fontsize=10)
            ax.legend(loc="lower right")
            ax.grid(True, alpha=0.3)
            # Row 3: precision-recall curve; baseline = positive-class rate.
            ax = axes[2, col_idx]
            precision_vals, recall_vals, _ = precision_recall_curve(y_actual, yp)
            pr_auc = average_precision_score(y_actual, yp)
            ax.plot(recall_vals, precision_vals, color=color, lw=2, label=f"PR-AUC = {pr_auc:.4f}")
            baseline = y_actual.sum() / len(y_actual)
            ax.axhline(y=baseline, color="gray", linestyle="--", lw=1, label=f"Baseline = {baseline:.2f}")
            ax.fill_between(recall_vals, precision_vals, alpha=0.2, color=color)
            ax.set_xlabel("Recall")
            ax.set_ylabel("Precision")
            ax.set_title("Precision-Recall Curve", fontsize=10)
            ax.legend(loc="lower left")
            ax.grid(True, alpha=0.3)
        plt.suptitle("Model Comparison Grid: Holdout Set Performance",
                     fontsize=14, fontweight="bold", y=1.02)
        plt.tight_layout()
        plt.show()
    else:
        print("No models loaded for comparison")
Show/Hide Code
if _SCORING_AVAILABLE and model_predictions:
    # Tabular summary of the per-model metrics plotted above.
    comparison_results = []
    for name, preds in model_predictions.items():
        y_p = preds["y_pred"]
        yp = preds["y_proba"]
        comparison_results.append({
            "Model": name,
            "ROC-AUC": roc_auc_score(y_actual, yp),
            "PR-AUC": average_precision_score(y_actual, yp),
            "F1-Score": f1_score(y_actual, y_p),
            "Precision": precision_score(y_actual, y_p, zero_division=0),
            "Recall": recall_score(y_actual, y_p, zero_division=0),
            "Accuracy": accuracy_score(y_actual, y_p),
        })
    comparison_df = pd.DataFrame(comparison_results).set_index("Model")
    print("\n" + "=" * 70)
    print("MODEL COMPARISON SUMMARY (Holdout Set)")
    print("=" * 70)
    # Highlight the best value per metric (column-wise max).
    display(
        comparison_df.style
        .highlight_max(axis=0, props="background-color: #2e7d32; color: white")
        .format("{:.4f}")
    )
    # "Best" is decided by ROC-AUC alone.
    best_model_name = comparison_df["ROC-AUC"].idxmax()
    best_auc = comparison_df.loc[best_model_name, "ROC-AUC"]
    print(f"\nBest Model: {best_model_name} (ROC-AUC = {best_auc:.4f})")
====================================================================== MODEL COMPARISON SUMMARY (Holdout Set) ======================================================================
| ROC-AUC | PR-AUC | F1-Score | Precision | Recall | Accuracy | |
|---|---|---|---|---|---|---|
| Model | ||||||
| Logistic Regression | 0.9675 | 0.9891 | 0.9521 | 0.9093 | 0.9992 | 0.9194 |
| Random Forest | 0.9653 | 0.9868 | 0.9700 | 0.9570 | 0.9834 | 0.9512 |
| XGBoost | 0.9687 | 0.9887 | 0.9707 | 0.9479 | 0.9947 | 0.9519 |
Best Model: XGBoost (ROC-AUC = 0.9687)
11.4 Adversarial Pipeline Validation¶
Validate that scoring pipeline produces identical features to training for holdout entities. This catches transformation inconsistencies (e.g., scalers re-fit, encoders handling unseen values differently).
Show/Hide Code
if _SCORING_AVAILABLE:
    # Re-derive the holdout rows straight from the gold layer and compare them,
    # feature by feature, against the frame produced by the scoring path. Any
    # numeric drift indicates the scoring pipeline transformed data differently
    # than training did (e.g. a re-fit scaler or encoder).
    gold_features = loader.load_gold_features()
    holdout_mask = gold_features[ORIGINAL_COLUMN].notna()
    holdout_gold = gold_features[holdout_mask].copy()
    print(f"Holdout entities for validation: {holdout_mask.sum():,}")
    scoring_entity_ids = set(scoring_features[ENTITY_KEY].values)
    gold_holdout = holdout_gold[holdout_gold[ENTITY_KEY].isin(scoring_entity_ids)]
    # Skip identifier/label columns and preserved original_* columns.
    exclude_cols = {ENTITY_KEY, config.timestamp_column, TARGET_COLUMN, ORIGINAL_COLUMN}
    compare_cols = [
        c for c in gold_holdout.columns
        if c not in exclude_cols and not c.startswith("original_")
    ]
    print("\n" + "=" * 60)
    print("ADVERSARIAL PIPELINE VALIDATION")
    print("=" * 60)
    mismatches = []
    for col in compare_cols:
        if col in scoring_features.columns and col in gold_holdout.columns:
            g_vals = gold_holdout[col].values
            # NOTE(review): reindex assumes scoring_features shares gold's row
            # index for these entities — confirm both frames come from the same
            # gold load, otherwise rows may not line up.
            s_vals = scoring_features.reindex(gold_holdout.index)[col].values
            if pd.api.types.is_numeric_dtype(gold_holdout[col]):
                delta = np.abs(g_vals.astype(float) - s_vals.astype(float))
                max_delta = np.nanmax(delta) if len(delta) > 0 else 0
                if max_delta > 1e-6:  # tolerance for float round-trip noise
                    mismatches.append({"feature": col, "max_delta": max_delta})
    if not mismatches:
        print("\nPASSED: Scoring features match training features")
    else:
        print(f"\nFAILED: {len(mismatches)} features with drift")
        display(pd.DataFrame(mismatches).sort_values("max_delta", ascending=False))
Holdout entities for validation: 3,076 ============================================================ ADVERSARIAL PIPELINE VALIDATION ============================================================
PASSED: Scoring features match training features
11.5 Transformation Validation¶
Use validate_feature_transformation() from the validation module to verify
encoding/scaling consistency between training and scoring.
Show/Hide Code
if _SCORING_AVAILABLE:
    from customer_retention.stages.validation import validate_feature_transformation

    # Split gold into training rows (target intact, original label NaN) vs
    # holdout rows, and verify prepare_features transforms both consistently.
    training_mask = gold_features[ORIGINAL_COLUMN].isna()
    training_subset = gold_features[training_mask].copy()
    scoring_subset = gold_features[~training_mask].copy()
    report = validate_feature_transformation(
        training_df=training_subset,
        scoring_df=scoring_subset,
        transform_fn=prepare_features,
        entity_column=ENTITY_KEY,
        verbose=True,
    )
    if report.passed:
        print("Transformation validation PASSED")
    else:
        print(f"Transformation validation FAILED: {len(report.feature_mismatches)} mismatches")
Validating transformation consistency...
Transformation validation: PASSED Transformation validation PASSED
11.6 Model Explanations (SHAP)¶
Show/Hide Code
if _SCORING_AVAILABLE:
    import shap

    # Locate the best run (highest best_roc_auc, filtered to this
    # recommendations hash when set) and load its winning model for SHAP.
    mlflow.set_tracking_uri(config.mlflow_tracking_uri)
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(PIPELINE_NAME)
    _hash_filter = f"tags.recommendations_hash = '{RECOMMENDATIONS_HASH}'" if RECOMMENDATIONS_HASH else ""
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=_hash_filter,
        order_by=["metrics.best_roc_auc DESC"],
        max_results=1,
    )
    parent_run = runs[0]
    best_model_tag = parent_run.data.tags.get("best_model", "random_forest")
    model_name = f"model_{best_model_tag}"
    if RECOMMENDATIONS_HASH:
        model_name = f"{model_name}_{RECOMMENDATIONS_HASH}"
    # The model artifact is expected on the child run named after the model
    # type; fall back to the parent run if no such child exists.
    child_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'",
    )
    model_run = next((c for c in child_runs if c.info.run_name == best_model_tag), parent_run)
    loader_module = mlflow.xgboost if best_model_tag == "xgboost" else mlflow.sklearn
    model_uri = f"runs:/{model_run.info.run_id}/{model_name}"
    print(f"Loading model: {model_uri}")
    try:
        model = loader_module.load_model(model_uri)
    except Exception:
        # Fallback: the runs:/ artifact path may not resolve; look the model
        # up in the logged-model registry by name + source run instead.
        logged_models = client.search_logged_models(experiment_ids=[experiment.experiment_id])
        _match = next((lm for lm in logged_models if lm.name == model_name and lm.source_run_id == model_run.info.run_id), None)
        if _match:
            model_uri = _match.model_uri
            model = loader_module.load_model(model_uri)
        else:
            raise
    print(f"Model type: {type(model).__name__}")
Loading model: runs:/e887a61ceea7436a9caaaabee28c516a/model_xgboost_ddd6956b
Model type: Booster
Show/Hide Code
if _SCORING_AVAILABLE:
    # Rebuild the aligned feature matrix for the freshly loaded best model;
    # X and feature_names are reused by all SHAP cells below.
    X = prepare_features(scoring_features)
    X, _, _ = loader.align_features_to_model(X, model)
    feature_names = list(X.columns)
    print(f"Prepared {len(feature_names)} features for SHAP analysis")
Prepared 431 features for SHAP analysis
Show/Hide Code
if _SCORING_AVAILABLE:
    print("Creating SHAP explainer (may take a moment)...")
    # Tree models (sklearn ensembles via estimators_, xgboost sklearn wrapper
    # via get_booster, raw xgboost Booster by class name) get the fast
    # TreeExplainer; everything else uses a model-agnostic explainer over a
    # sampled background set.
    _is_tree_model = (
        hasattr(model, "estimators_")
        or hasattr(model, "get_booster")
        or type(model).__name__ == "Booster"
    )
    if _is_tree_model:
        explainer = shap.TreeExplainer(model)
        print(f"Using TreeExplainer ({type(model).__name__})")
    else:
        background_size = min(100, len(X))
        background = shap.sample(X, background_size)
        # 2 * n_features + 1 evaluations — the minimum SHAP's permutation
        # explainer accepts for a full forward/backward pass.
        _max_evals = 2 * len(feature_names) + 1
        if hasattr(model, "predict_proba"):
            explainer = shap.Explainer(
                model.predict_proba, background,
                feature_names=feature_names, max_evals=_max_evals,
            )
        else:
            explainer = shap.Explainer(
                model, background,
                feature_names=feature_names, max_evals=_max_evals,
            )
        print(f"Using PermutationExplainer (max_evals={_max_evals})")
    print("Computing SHAP values...")
    shap_values = explainer(X)
    print(f"SHAP values computed for {len(shap_values)} records")
Creating SHAP explainer (may take a moment)... Using TreeExplainer (Booster) Computing SHAP values...
SHAP values computed for 3076 records
Show/Hide Code
if _SCORING_AVAILABLE:
    # Some explainers return per-class values shaped (n, features, classes);
    # slice out the positive class (index 1) for plotting.
    if len(shap_values.shape) == 3:
        shap_vals = shap_values[:, :, 1]
    else:
        shap_vals = shap_values
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_vals, X, feature_names=feature_names, show=False, max_display=20)
    plt.title("Feature Importance (SHAP Summary)")
    plt.tight_layout()
    plt.show()
Show/Hide Code
if _SCORING_AVAILABLE:
    # Global importance = mean |SHAP value| per feature across all records;
    # importance_df is also exported to CSV in section 11.9.
    mean_shap = np.abs(shap_vals.values).mean(axis=0)
    importance_df = pd.DataFrame({
        "feature": feature_names,
        "importance": mean_shap,
    }).sort_values("importance", ascending=False)
    print("Top 15 Most Important Features:")
    display(importance_df.head(15))
Top 15 Most Important Features:
| feature | importance | |
|---|---|---|
| 378 | esent_sum_all_time | 2.140893 |
| 379 | esent_mean_all_time | 1.227238 |
| 404 | city_mode_all_time | 0.326609 |
| 187 | city_mode_30d | 0.290312 |
| 406 | event_count_all_time_x_esent_sum_all_time | 0.286193 |
| 412 | eclickrate_sum_all_time_is_zero | 0.213538 |
| 402 | favday_mode_all_time | 0.180940 |
| 396 | paperless_mode_all_time | 0.130447 |
| 418 | firstorder_delta_hours_sum_all_time_is_zero | 0.123825 |
| 313 | city_mode_180d | 0.123719 |
| 415 | ordfreq_sum_all_time_is_zero | 0.105782 |
| 425 | lastorder_delta_hours_sum_all_time_is_zero | 0.065654 |
| 390 | lastorder_dow_max_all_time | 0.062946 |
| 319 | eopenrate_sum_365d | 0.056250 |
| 376 | city_mode_365d | 0.055627 |
11.7 Customer Browser¶
Show/Hide Code
if _SCORING_AVAILABLE:
    # Join predictions with raw feature values for per-customer inspection.
    # NOTE(review): assumes every name in feature_names exists as a column of
    # scoring_features (align_features_to_model may have filled some features
    # into X only) — confirm if this cell ever raises a KeyError.
    browser_df = predictions_df.merge(
        scoring_features[[ENTITY_KEY] + feature_names],
        on=ENTITY_KEY,
        how="left",
    )
    print(f"Customer browser ready with {len(browser_df):,} records")
    print("\nPrediction Distribution:")
    print(f" Predicted Positive: {(browser_df['prediction'] == 1).sum():,}")
    print(f" Predicted Negative: {(browser_df['prediction'] == 0).sum():,}")
    print(f"\nCorrect Predictions: {browser_df['correct'].sum():,}/{len(browser_df):,} ({browser_df['correct'].mean():.1%})")
Customer browser ready with 3,076 records Prediction Distribution: Predicted Positive: 2,589 Predicted Negative: 487 Correct Predictions: 2,928/3,076 (95.2%)
Show/Hide Code
if _SCORING_AVAILABLE:
    def show_customer(idx: int):
        """Print prediction details, top SHAP contributors, and a waterfall
        plot for the customer at positional index ``idx`` in browser_df."""
        rec = browser_df.iloc[idx]
        cust_id = rec[ENTITY_KEY]
        print(f"=== Customer {cust_id} ===")
        print(f"Prediction: {int(rec['prediction'])} (probability: {rec['probability']:.3f})")
        print(f"Actual: {int(rec['actual'])}")
        print(f"Correct: {'Yes' if rec['correct'] else 'No'}")
        print()
        # Per-class SHAP output has shape (n, features, classes); pick the
        # positive class's explanation, otherwise use the row as-is.
        multiclass = len(shap_values.shape) == 3
        explanation = shap_values[idx, :, 1] if multiclass else shap_values[idx]
        impact = pd.DataFrame({
            "feature": feature_names,
            "value": X.iloc[idx].values,
            "shap_impact": explanation.values,
        }).sort_values("shap_impact", key=abs, ascending=False)
        print("Top Contributing Features:")
        display(impact.head(10))
        plt.figure(figsize=(10, 6))
        shap.plots.waterfall(explanation, max_display=10, show=False)
        plt.title(f"SHAP Explanation for Customer {cust_id}")
        plt.tight_layout()
        plt.show()
Show/Hide Code
if _SCORING_AVAILABLE:
    # Walk the first few scored customers through the explainer above.
    print("Showing first 3 customers:\n")
    for i in range(min(3, len(browser_df))):
        show_customer(i)
        print("\n" + "=" * 60 + "\n")
Showing first 3 customers: === Customer 2264XM === Prediction: 1 (probability: 0.997) Actual: 1 Correct: Yes Top Contributing Features:
| feature | value | shap_impact | |
|---|---|---|---|
| 378 | esent_sum_all_time | 45.0 | 2.369076 |
| 406 | event_count_all_time_x_esent_sum_all_time | 45.0 | 1.400224 |
| 379 | esent_mean_all_time | 45.0 | 1.127664 |
| 313 | city_mode_180d | 0.0 | -0.296199 |
| 187 | city_mode_30d | 0.0 | 0.294507 |
| 404 | city_mode_all_time | 0.0 | 0.253583 |
| 396 | paperless_mode_all_time | 0.0 | -0.234938 |
| 384 | firstorder_is_weekend_sum_all_time | 1.0 | 0.190879 |
| 412 | eclickrate_sum_all_time_is_zero | 1.0 | -0.183380 |
| 250 | city_mode_90d | 0.0 | -0.148264 |
============================================================ === Customer 22V484 === Prediction: 1 (probability: 0.960) Actual: 1 Correct: Yes Top Contributing Features:
| feature | value | shap_impact | |
|---|---|---|---|
| 379 | esent_mean_all_time | 16.0 | 0.808921 |
| 404 | city_mode_all_time | 0.0 | 0.536407 |
| 187 | city_mode_30d | 0.0 | 0.331545 |
| 418 | firstorder_delta_hours_sum_all_time_is_zero | 1.0 | 0.201956 |
| 402 | favday_mode_all_time | 0.0 | 0.197204 |
| 396 | paperless_mode_all_time | 1.0 | -0.178479 |
| 412 | eclickrate_sum_all_time_is_zero | 1.0 | -0.127180 |
| 378 | esent_sum_all_time | 16.0 | 0.122382 |
| 319 | eopenrate_sum_365d | 0.0 | 0.058322 |
| 376 | city_mode_365d | 0.0 | -0.047637 |
============================================================ === Customer 22W4K4 === Prediction: 1 (probability: 0.997) Actual: 1 Correct: Yes Top Contributing Features:
| feature | value | shap_impact | |
|---|---|---|---|
| 378 | esent_sum_all_time | 69.0 | 2.429544 |
| 406 | event_count_all_time_x_esent_sum_all_time | 69.0 | 1.208919 |
| 379 | esent_mean_all_time | 69.0 | 0.905038 |
| 313 | city_mode_180d | 0.0 | -0.296172 |
| 187 | city_mode_30d | 0.0 | 0.294507 |
| 396 | paperless_mode_all_time | 0.0 | -0.204989 |
| 412 | eclickrate_sum_all_time_is_zero | 0.0 | 0.190957 |
| 404 | city_mode_all_time | 0.0 | 0.190272 |
| 250 | city_mode_90d | 0.0 | -0.150387 |
| 415 | ordfreq_sum_all_time_is_zero | 0.0 | -0.123107 |
============================================================
Show/Hide Code
if _SCORING_AVAILABLE:
    def lookup_customer(entity_id):
        """Show the SHAP explanation for a customer by entity id, if scored."""
        matches = browser_df[ENTITY_KEY] == entity_id
        if not matches.any():
            print(f"Customer {entity_id} not found in scoring set")
            return
        # Translate the first matching index label into the positional
        # offset show_customer expects.
        label = browser_df[matches].index[0]
        show_customer(browser_df.index.get_loc(label))

    print("Available entity IDs (first 10):")
    print(browser_df[ENTITY_KEY].head(10).tolist())
Available entity IDs (first 10): ['2264XM', '22V484', '22W4K4', '23C49G', '23HEFR', '24QMLK', '24U5J6', '254JFD', '27J3QL', '28BN8W']
11.8 Error Analysis¶
Show/Hide Code
if _SCORING_AVAILABLE:
    # Split misclassifications into false positives and false negatives;
    # fp / fn are reused by the example cells below.
    incorrect = browser_df[browser_df["correct"] == 0]
    print(f"Misclassified customers: {len(incorrect):,}")
    fp = incorrect[incorrect["prediction"] == 1]
    print(f" False Positives: {len(fp):,}")
    fn = incorrect[incorrect["prediction"] == 0]
    print(f" False Negatives: {len(fn):,}")
Misclassified customers: 148 False Positives: 135 False Negatives: 13
Show/Hide Code
if _SCORING_AVAILABLE and len(fp) > 0:
    # Explain the first false positive (index label -> positional offset).
    print("\n=== Example False Positive ===")
    fp_idx = browser_df.index.get_loc(fp.index[0])
    show_customer(fp_idx)
=== Example False Positive === === Customer 29QUW2 === Prediction: 1 (probability: 0.935) Actual: 0 Correct: No Top Contributing Features:
| feature | value | shap_impact | |
|---|---|---|---|
| 379 | esent_mean_all_time | 20.0 | 0.918219 |
| 412 | eclickrate_sum_all_time_is_zero | 1.0 | -0.430392 |
| 404 | city_mode_all_time | 0.0 | 0.423524 |
| 187 | city_mode_30d | 0.0 | 0.331835 |
| 415 | ordfreq_sum_all_time_is_zero | 0.0 | -0.282130 |
| 418 | firstorder_delta_hours_sum_all_time_is_zero | 1.0 | 0.233514 |
| 400 | doorstep_mode_all_time | 1.0 | 0.215912 |
| 402 | favday_mode_all_time | 0.0 | 0.185288 |
| 378 | esent_sum_all_time | 20.0 | -0.117532 |
| 319 | eopenrate_sum_365d | 0.0 | 0.056919 |
Show/Hide Code
if _SCORING_AVAILABLE and len(fn) > 0:
    # Explain the first false negative (index label -> positional offset).
    print("\n=== Example False Negative ===")
    fn_idx = browser_df.index.get_loc(fn.index[0])
    show_customer(fn_idx)
=== Example False Negative === === Customer 2HDAYS === Prediction: 0 (probability: 0.436) Actual: 1 Correct: No Top Contributing Features:
| feature | value | shap_impact | |
|---|---|---|---|
| 378 | esent_sum_all_time | 10.0 | -1.841491 |
| 415 | ordfreq_sum_all_time_is_zero | 0.0 | -0.621209 |
| 418 | firstorder_delta_hours_sum_all_time_is_zero | 0.0 | -0.505629 |
| 379 | esent_mean_all_time | 10.0 | 0.438855 |
| 404 | city_mode_all_time | 0.0 | 0.374753 |
| 187 | city_mode_30d | 0.0 | 0.365399 |
| 402 | favday_mode_all_time | 0.0 | 0.354211 |
| 412 | eclickrate_sum_all_time_is_zero | 0.0 | 0.239357 |
| 425 | lastorder_delta_hours_sum_all_time_is_zero | 0.0 | -0.191343 |
| 396 | paperless_mode_all_time | 1.0 | 0.079916 |
11.9 Export Results¶
Show/Hide Code
if _SCORING_AVAILABLE:
    if is_databricks():
        from customer_retention.core.compat.detection import get_spark_session
        spark = get_spark_session()
    output_dir = config.scoring_output_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    # 1) Global feature importance as CSV.
    importance_df.to_csv(output_dir / "feature_importance.csv", index=False)
    print(f"Feature importance saved to {output_dir / 'feature_importance.csv'}")
    # 2) Per-entity SHAP values for the top-10 features, joined to predictions.
    # NOTE(review): assumes shap_values rows are in the same order as
    # scoring_features rows — both derive from the same frame; confirm no
    # reordering happens in between.
    top_features = importance_df.head(10)["feature"].tolist()
    shap_by_entity = pd.DataFrame({ENTITY_KEY: scoring_features[ENTITY_KEY].values})
    for feat in top_features:
        feat_idx = feature_names.index(feat)
        if len(shap_values.shape) == 3:
            shap_by_entity[f"shap_{feat}"] = shap_values[:, feat_idx, 1].values
        else:
            shap_by_entity[f"shap_{feat}"] = shap_values[:, feat_idx].values
    detailed_df = predictions_df.merge(shap_by_entity, on=ENTITY_KEY, how="left")
    detailed_df.to_parquet(output_dir / "predictions_with_shap.parquet", index=False)
    print(f"Detailed predictions with SHAP saved to {output_dir / 'predictions_with_shap.parquet'}")
    # 3) On Databricks, also persist the detailed frame as a Delta table.
    if is_databricks():
        table_name = f"{config.catalog}.{config.schema}.scoring_results"
        spark.createDataFrame(detailed_df).write.format("delta") \
            .mode("overwrite").saveAsTable(table_name)
        print(f"Results saved to Delta table: {table_name}")
Feature importance saved to /Users/Vital/python/CustomerRetention/experiments/data/scoring/feature_importance.csv
Detailed predictions with SHAP saved to /Users/Vital/python/CustomerRetention/experiments/data/scoring/predictions_with_shap.parquet
Save Reminder: Save this notebook (Ctrl+S / Cmd+S) before running the next one. The next notebook will automatically export this notebook's HTML documentation from the saved file.