Chapter 11: Scoring, Validation & Explanations¶

End-to-end scoring pipeline with holdout validation, model comparison, adversarial validation, SHAP explanations, and error analysis.

Sections:

  1. Run Scoring
  2. Summary Metrics
  3. Model Comparison Grid
  4. Adversarial Pipeline Validation
  5. Transformation Validation
  6. Model Explanations (SHAP)
  7. Customer Browser
  8. Error Analysis
  9. Export Results
In [1]:
Show/Hide Code
# Record completion of the previous notebook and export its findings before
# this one runs, keeping the cross-notebook progress ledger current.
from customer_retention.analysis.notebook_progress import track_and_export_previous

track_and_export_previous("11_scoring_validation.ipynb")

import sys
from pathlib import Path

# FINDINGS_DIR / OUTPUT_DIR / setup_experiments_structure are re-exported for
# later cells; the noqa keeps linters from flagging the apparently-unused names.
from customer_retention.core.config.experiments import (  # noqa: F401
    FINDINGS_DIR,
    OUTPUT_DIR,
    setup_experiments_structure,
)
In [2]:
Show/Hide Code
from customer_retention.core.compat.detection import is_databricks
from customer_retention.stages.scoring import ScoringConfig, ScoringDataLoader

# Resolve the scoring configuration for the current platform.
if is_databricks():
    try:
        config = ScoringConfig.from_databricks()
    except ValueError as e:
        import os
        # Surface the environment variables the Databricks config reads,
        # then re-raise so the failure is still visible to the scheduler.
        print(f"ERROR: {e}")
        print("\nDiagnostic info:")
        print(f"  CR_EXPERIMENT_NAME = {os.environ.get('CR_EXPERIMENT_NAME', '(not set)')}")
        print(f"  CR_CATALOG = {os.environ.get('CR_CATALOG', '(not set)')}")
        print(f"  CR_SCHEMA = {os.environ.get('CR_SCHEMA', '(not set)')}")
        print("\nOn Databricks, experiments are created under /Users/{username}/.")
        print("Set CR_EXPERIMENT_NAME to the full path, e.g.:")
        print('  os.environ["CR_EXPERIMENT_NAME"] = "/Users/you@example.com/customer_churn"')
        raise
else:
    # Locally, use the lexicographically-last generated pipeline directory
    # (sorted glob → last entry) produced by notebook 10.
    generated_dir = Path("../generated_pipelines/local")
    pipeline_dirs = sorted(generated_dir.glob("*/config.py"))
    if not pipeline_dirs:
        raise FileNotFoundError(
            f"No generated pipeline found under {generated_dir}. Run notebook 10 first."
        )
    config = ScoringConfig.from_local_config(pipeline_dirs[-1].parent)

loader = ScoringDataLoader(config)

# Hoist frequently-used config values into notebook-level constants for later cells.
PIPELINE_NAME = config.pipeline_name
TARGET_COLUMN = config.target_column
ENTITY_KEY = config.entity_key
RECOMMENDATIONS_HASH = config.recommendations_hash
ORIGINAL_COLUMN = config.original_column

print(f"Pipeline: {PIPELINE_NAME}")
print(f"Platform: {'Databricks' if config.is_databricks else 'Local'}")
print(f"Experiments dir: {config.experiments_dir}")
print(f"Recommendations hash: {RECOMMENDATIONS_HASH}")
Pipeline: customer_churn
Platform: Local
Experiments dir: /Users/Vital/python/CustomerRetention/experiments
Recommendations hash: e8df49d0

11.1 Run Scoring¶

In [3]:
Show/Hide Code
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from customer_retention.transforms import ArtifactStore, TransformExecutor

# Registry + executor replay the exact transform artifacts saved at training time.
_registry = ArtifactStore.from_manifest(Path(config.artifacts_path) / "manifest.yaml")
_executor = TransformExecutor()

ENCODINGS, SCALINGS = loader.load_transforms()

PREDICTIONS_PATH = config.production_dir / "data" / "scoring" / "predictions.parquet"

mlflow.set_tracking_uri(config.mlflow_tracking_uri)

features_df = loader.load_gold_features()

if ORIGINAL_COLUMN not in features_df.columns:
    raise ValueError(
        f"No holdout found (column '{ORIGINAL_COLUMN}' missing). "
        "Holdout must be created in silver layer BEFORE gold layer feature computation."
    )

# Holdout rows: the target was masked to NaN for scoring, but the original
# label was preserved so predictions can be validated against ground truth.
scoring_mask = features_df[TARGET_COLUMN].isna() & features_df[ORIGINAL_COLUMN].notna()
scoring_df = features_df[scoring_mask].copy()
print(f"Found {len(scoring_df):,} holdout records for scoring")

scoring_features = loader.load_scoring_features(scoring_df)

model, model_uri = loader.load_model()
# Message fixed: the model has already been loaded at this point.
print(f"Loaded model: {model_uri}")

_SCORING_AVAILABLE = True


def prepare_features(df):
    """Apply the training-time encodings and scalings to a raw feature frame."""
    return loader.prepare_features(df, ENCODINGS + SCALINGS, _executor, _registry)


X = prepare_features(scoring_features)
y_true = scoring_features[ORIGINAL_COLUMN].values

if X.shape[1] == 0:
    print(
        "WARNING: Feature matrix has 0 columns after preparation.\n"
        "The model was likely trained before feature selection was fixed.\n"
        "Re-run notebooks 08 and 10 to retrain, then re-run this notebook.\n"
        "Skipping scoring validation."
    )
    _SCORING_AVAILABLE = False
    # Empty frame keeps the (guarded) downstream cells from raising NameError.
    predictions_df = pd.DataFrame(columns=[ENTITY_KEY, "prediction", "probability", "actual", "correct"])

if _SCORING_AVAILABLE:
    # Fill features the model expects but the scoring data lacks; drop extras.
    X, _missing_feats, _extra_feats = loader.align_features_to_model(X, model)
    if _missing_feats:
        print(f"WARNING: {len(_missing_feats)} features missing from scoring data (filled with 0):")
        for f in _missing_feats[:10]:
            print(f"  - {f}")
        if len(_missing_feats) > 10:
            print(f"  ... and {len(_missing_feats) - 10} more")
    if _extra_feats:
        print(f"INFO: {len(_extra_feats)} extra features in scoring data (dropped)")

if _SCORING_AVAILABLE:
    print("Generating predictions...")
    # sklearn-style models expose predict_proba; a raw xgboost Booster needs a DMatrix.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X)[:, 1]
    else:
        y_proba = model.predict(xgb.DMatrix(X, feature_names=list(X.columns)))
    y_pred = (y_proba >= 0.5).astype(int)

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        # ROC-AUC is undefined for a single-class holdout; report 0.0 instead.
        "roc_auc": roc_auc_score(y_true, y_proba) if len(np.unique(y_true)) > 1 else 0.0,
    }
    print("\nValidation Metrics (vs original values):")
    for name, value in metrics.items():
        print(f"  {name}: {value:.4f}")

    predictions_df = pd.DataFrame({
        ENTITY_KEY: scoring_df[ENTITY_KEY].values,
        "prediction": y_pred,
        "probability": y_proba,
        "actual": y_true,
        "correct": (y_pred == y_true).astype(int),
    })
    PREDICTIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    predictions_df.to_parquet(PREDICTIONS_PATH, index=False)
    print(f"\nPredictions saved: {PREDICTIONS_PATH}")
    print(f"Correct: {predictions_df['correct'].sum():,}/{len(predictions_df):,} ({predictions_df['correct'].mean():.1%})")
Found 499 holdout records for scoring
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/14 22:20:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/14 22:20:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/14 22:20:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Loading model: runs:/bcdd94c57a5342e1bb147779ed6f3f65/model_logistic_regression_e8df49d0
WARNING: 21 features missing from scoring data (filled with 0):
  - campaign_type_mode_24h
  - subject_line_category_mode_24h
  - device_type_mode_24h
  - campaign_type_mode_7d
  - subject_line_category_mode_7d
  - device_type_mode_7d
  - campaign_type_mode_30d
  - subject_line_category_mode_30d
  - device_type_mode_30d
  - campaign_type_mode_90d
  ... and 11 more
Generating predictions...

Validation Metrics (vs original values):
  accuracy: 0.9940
  precision: 0.0000
  recall: 0.0000
  f1: 0.0000
  roc_auc: 1.0000

Predictions saved: /Users/Vital/python/CustomerRetention/experiments/data/scoring/predictions.parquet
Correct: 496/499 (99.4%)

11.2 Summary Metrics¶

In [4]:
Show/Hide Code
if _SCORING_AVAILABLE:
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Re-derive labels and scores from the persisted predictions frame so this
    # section stands alone from the saved artifact.
    y_true = predictions_df["actual"]
    y_pred = predictions_df["prediction"]
    y_proba = predictions_df["probability"]

    # Label → scorer table; ROC-AUC falls back to 0.0 on a single-class holdout.
    _scorers = [
        ("Accuracy", lambda t, p, s: accuracy_score(t, p)),
        ("Precision", lambda t, p, s: precision_score(t, p, zero_division=0)),
        ("Recall", lambda t, p, s: recall_score(t, p, zero_division=0)),
        ("F1 Score", lambda t, p, s: f1_score(t, p, zero_division=0)),
        ("ROC-AUC", lambda t, p, s: roc_auc_score(t, s) if len(np.unique(t)) > 1 else 0.0),
    ]
    metrics = {label: scorer(y_true, y_pred, y_proba) for label, scorer in _scorers}

    print("\n=== Scoring Validation Metrics ===")
    for name, value in metrics.items():
        print(f"  {name}: {value:.4f}")

    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(f"  TN={cm[0,0]:,}  FP={cm[0,1]:,}")
    print(f"  FN={cm[1,0]:,}  TP={cm[1,1]:,}")
else:
    print("Sections 11.2-11.9 skipped: retrain model via notebooks 08 + 10")
=== Scoring Validation Metrics ===
  Accuracy: 0.9940
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000
  ROC-AUC: 1.0000

Confusion Matrix:
  TN=496  FP=0
  FN=3  TP=0
In [5]:
Show/Hide Code
if _SCORING_AVAILABLE:
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve

    # Left: ROC curve; right: score distribution split by actual class.
    fig, (ax_roc, ax_hist) = plt.subplots(1, 2, figsize=(12, 4))

    fpr, tpr, _ = roc_curve(y_true, y_proba)
    ax_roc.plot(fpr, tpr, "b-", lw=2, label=f"ROC (AUC={metrics['ROC-AUC']:.3f})")
    ax_roc.plot([0, 1], [0, 1], "k--", lw=1)
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("ROC Curve")
    ax_roc.legend()

    for cls, shade in ((0, "blue"), (1, "red")):
        ax_hist.hist(y_proba[y_true == cls], bins=30, alpha=0.5,
                     label=f"Actual={cls}", color=shade)
    ax_hist.axvline(x=0.5, color="black", linestyle="--", label="Threshold")
    ax_hist.set_xlabel("Predicted Probability")
    ax_hist.set_ylabel("Count")
    ax_hist.set_title("Probability Distribution")
    ax_hist.legend()

    plt.tight_layout()
    plt.show()
No description has been provided for this image

11.3 Model Comparison Grid¶

Compare all trained models (Logistic Regression, Random Forest, XGBoost) on the holdout set.

Grid Layout:

  • Row 1: Confusion matrices (counts and percentages)
  • Row 2: ROC curves with AUC scores
  • Row 3: Precision-Recall curves with PR-AUC scores
In [6]:
Show/Hide Code
if _SCORING_AVAILABLE:
    from IPython.display import display
    from sklearn.metrics import (
        accuracy_score,
        average_precision_score,
        confusion_matrix,
        f1_score,
        precision_recall_curve,
        precision_score,
        recall_score,
        roc_auc_score,
        roc_curve,
    )

    mlflow.set_tracking_uri(config.mlflow_tracking_uri)
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(PIPELINE_NAME)
    # get_experiment_by_name returns None when the experiment is absent; fail
    # with a clear message instead of an AttributeError below.
    if experiment is None:
        raise ValueError(f"MLflow experiment '{PIPELINE_NAME}' not found")

    X_holdout = prepare_features(scoring_features)
    y_actual = predictions_df["actual"].values

    logged_models = client.search_logged_models(experiment_ids=[experiment.experiment_id])

    model_types = ["logistic_regression", "random_forest", "xgboost"]
    model_display_names = ["Logistic Regression", "Random Forest", "XGBoost"]
    loaded_models = {}
    model_predictions = {}

    for model_type, display_name in zip(model_types, model_display_names):
        model_name_pattern = f"model_{model_type}"
        if RECOMMENDATIONS_HASH:
            model_name_pattern = f"{model_name_pattern}_{RECOMMENDATIONS_HASH}"

        # Most recently created logged model with the expected name wins.
        candidates = [lm for lm in logged_models if lm.name == model_name_pattern]
        matching_model = max(candidates, key=lambda lm: lm.creation_timestamp, default=None)

        if matching_model:
            try:
                if "xgboost" in model_type:
                    m = mlflow.xgboost.load_model(matching_model.model_uri)
                else:
                    m = mlflow.sklearn.load_model(matching_model.model_uri)

                X_aligned, _, _ = loader.align_features_to_model(X_holdout, m)

                if "xgboost" in model_type:
                    dmatrix = xgb.DMatrix(X_aligned, feature_names=list(X_aligned.columns))
                    yp = m.predict(dmatrix)
                else:
                    yp = m.predict_proba(X_aligned)[:, 1]

                # >= matches the 0.5 decision threshold used in section 11.1.
                y_p = (yp >= 0.5).astype(int)
                loaded_models[display_name] = m
                model_predictions[display_name] = {"y_pred": y_p, "y_proba": yp}
                print(f"Loaded {display_name}: ROC-AUC = {roc_auc_score(y_actual, yp):.4f}")
            except Exception as e:
                # Best-effort: a model missing from the registry should not
                # abort the whole comparison.
                print(f"Could not load {display_name}: {e}")

    print(f"\nLoaded {len(loaded_models)} models for comparison")
Loaded Logistic Regression: ROC-AUC = 1.0000
Loaded Random Forest: ROC-AUC = 1.0000
Loaded XGBoost: ROC-AUC = 1.0000

Loaded 3 models for comparison
In [7]:
Show/Hide Code
if _SCORING_AVAILABLE:
    n_models = len(model_predictions)
    if n_models > 0:
        # 3 rows (confusion matrix / ROC / PR) x one column per loaded model.
        fig, axes = plt.subplots(3, n_models, figsize=(5 * n_models, 12))
        if n_models == 1:
            # With a single column, plt.subplots returns a 1-D axes array;
            # reshape so axes[row, col] indexing below still works.
            axes = axes.reshape(-1, 1)

        colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]

        for col_idx, (name, preds) in enumerate(model_predictions.items()):
            y_p = preds["y_pred"]
            yp = preds["y_proba"]
            color = colors[col_idx % len(colors)]

            # Row 1: confusion matrix annotated with counts and percentages.
            cm = confusion_matrix(y_actual, y_p)
            ax = axes[0, col_idx]
            ax.imshow(cm, cmap="Blues")
            ax.set_xticks([0, 1])
            ax.set_yticks([0, 1])
            ax.set_xticklabels(["Pred 0", "Pred 1"])
            ax.set_yticklabels(["Actual 0", "Actual 1"])
            for i in range(2):
                for j in range(2):
                    pct = cm[i, j] / cm.sum() * 100
                    # White text on dark cells, black on light, for readability.
                    ax.text(j, i, f"{cm[i, j]}\n({pct:.1f}%)", ha="center", va="center",
                            color="white" if cm[i, j] > cm.max() / 2 else "black", fontsize=10)
            acc = accuracy_score(y_actual, y_p)
            ax.set_title(f"{name}\nAccuracy: {acc:.3f}", fontsize=11, fontweight="bold")

            # Row 2: ROC curve with diagonal chance line and shaded AUC.
            ax = axes[1, col_idx]
            fpr, tpr, _ = roc_curve(y_actual, yp)
            auc = roc_auc_score(y_actual, yp)
            ax.plot(fpr, tpr, color=color, lw=2, label=f"AUC = {auc:.4f}")
            ax.plot([0, 1], [0, 1], "k--", lw=1, alpha=0.5)
            ax.fill_between(fpr, tpr, alpha=0.2, color=color)
            ax.set_xlabel("False Positive Rate")
            ax.set_ylabel("True Positive Rate")
            ax.set_title("ROC Curve", fontsize=10)
            ax.legend(loc="lower right")
            ax.grid(True, alpha=0.3)

            # Row 3: precision-recall curve; baseline = positive-class rate.
            ax = axes[2, col_idx]
            precision_vals, recall_vals, _ = precision_recall_curve(y_actual, yp)
            pr_auc = average_precision_score(y_actual, yp)
            ax.plot(recall_vals, precision_vals, color=color, lw=2, label=f"PR-AUC = {pr_auc:.4f}")
            baseline = y_actual.sum() / len(y_actual)
            ax.axhline(y=baseline, color="gray", linestyle="--", lw=1, label=f"Baseline = {baseline:.2f}")
            ax.fill_between(recall_vals, precision_vals, alpha=0.2, color=color)
            ax.set_xlabel("Recall")
            ax.set_ylabel("Precision")
            ax.set_title("Precision-Recall Curve", fontsize=10)
            ax.legend(loc="lower left")
            ax.grid(True, alpha=0.3)

        plt.suptitle("Model Comparison Grid: Holdout Set Performance",
                     fontsize=14, fontweight="bold", y=1.02)
        plt.tight_layout()
        plt.show()
    else:
        print("No models loaded for comparison")
No description has been provided for this image
In [8]:
Show/Hide Code
if _SCORING_AVAILABLE and model_predictions:
    # Tabulate holdout metrics for every model loaded in the previous cell.
    comparison_results = []
    for name, preds in model_predictions.items():
        y_p = preds["y_pred"]
        yp = preds["y_proba"]
        comparison_results.append({
            "Model": name,
            "ROC-AUC": roc_auc_score(y_actual, yp),
            "PR-AUC": average_precision_score(y_actual, yp),
            # zero_division=0 silences the degenerate all-negative case,
            # consistent with Precision/Recall below.
            "F1-Score": f1_score(y_actual, y_p, zero_division=0),
            "Precision": precision_score(y_actual, y_p, zero_division=0),
            "Recall": recall_score(y_actual, y_p, zero_division=0),
            "Accuracy": accuracy_score(y_actual, y_p),
        })

    comparison_df = pd.DataFrame(comparison_results).set_index("Model")
    print("\n" + "=" * 70)
    print("MODEL COMPARISON SUMMARY (Holdout Set)")
    print("=" * 70)
    display(
        comparison_df.style
        .highlight_max(axis=0, props="background-color: #2e7d32; color: white")
        .format("{:.4f}")
    )

    # Break ROC-AUC ties with PR-AUC then F1, so a model that never predicts
    # the positive class cannot be declared "best" over one that does.
    ranked = comparison_df.sort_values(["ROC-AUC", "PR-AUC", "F1-Score"], ascending=False)
    best_model_name = ranked.index[0]
    best_auc = ranked.iloc[0]["ROC-AUC"]
    print(f"\nBest Model: {best_model_name} (ROC-AUC = {best_auc:.4f})")
======================================================================
MODEL COMPARISON SUMMARY (Holdout Set)
======================================================================
  ROC-AUC PR-AUC F1-Score Precision Recall Accuracy
Model            
Logistic Regression 1.0000 1.0000 0.0000 0.0000 0.0000 0.9940
Random Forest 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
XGBoost 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
Best Model: Logistic Regression (ROC-AUC = 1.0000)

11.4 Adversarial Pipeline Validation¶

Validate that scoring pipeline produces identical features to training for holdout entities. This catches transformation inconsistencies (e.g., scalers re-fit, encoders handling unseen values differently).

In [9]:
Show/Hide Code
if _SCORING_AVAILABLE:
    gold_features = loader.load_gold_features()

    # Holdout = rows whose original (pre-mask) target survived into gold.
    holdout_mask = gold_features[ORIGINAL_COLUMN].notna()
    holdout_gold = gold_features[holdout_mask].copy()
    print(f"Holdout entities for validation: {holdout_mask.sum():,}")

    # Restrict the check to entities that were actually scored in section 11.1.
    scoring_entity_ids = set(scoring_features[ENTITY_KEY].values)
    gold_holdout = holdout_gold[holdout_gold[ENTITY_KEY].isin(scoring_entity_ids)]

    # Compare only model features: drop keys, timestamps, targets, and any
    # original_* bookkeeping columns.
    exclude_cols = {ENTITY_KEY, config.timestamp_column, TARGET_COLUMN, ORIGINAL_COLUMN}
    compare_cols = [
        c for c in gold_holdout.columns
        if c not in exclude_cols and not c.startswith("original_")
    ]

    print("\n" + "=" * 60)
    print("ADVERSARIAL PIPELINE VALIDATION")
    print("=" * 60)

    mismatches = []
    for col in compare_cols:
        if col in scoring_features.columns and col in gold_holdout.columns:
            g_vals = gold_holdout[col].values
            # NOTE(review): reindex aligns by index label — this assumes
            # scoring_features shares gold_features' index; confirm upstream.
            s_vals = scoring_features.reindex(gold_holdout.index)[col].values
            if pd.api.types.is_numeric_dtype(gold_holdout[col]):
                delta = np.abs(g_vals.astype(float) - s_vals.astype(float))
                # NOTE(review): nanmax ignores positions where either side is
                # NaN, so a NaN-vs-value mismatch would not be flagged here.
                max_delta = np.nanmax(delta) if len(delta) > 0 else 0
                if max_delta > 1e-6:
                    mismatches.append({"feature": col, "max_delta": max_delta})

    if not mismatches:
        print("\nPASSED: Scoring features match training features")
    else:
        print(f"\nFAILED: {len(mismatches)} features with drift")
        display(pd.DataFrame(mismatches).sort_values("max_delta", ascending=False))
Holdout entities for validation: 499

============================================================
ADVERSARIAL PIPELINE VALIDATION
============================================================

PASSED: Scoring features match training features

11.5 Transformation Validation¶

Use validate_feature_transformation() from the validation module to verify encoding/scaling consistency between training and scoring.

In [10]:
Show/Hide Code
if _SCORING_AVAILABLE:
    from customer_retention.stages.validation import validate_feature_transformation

    # Training rows are those whose original target was never masked out.
    is_training = gold_features[ORIGINAL_COLUMN].isna()
    training_subset = gold_features.loc[is_training].copy()
    scoring_subset = gold_features.loc[~is_training].copy()

    report = validate_feature_transformation(
        training_df=training_subset,
        scoring_df=scoring_subset,
        transform_fn=prepare_features,
        entity_column=ENTITY_KEY,
        verbose=True,
    )

    outcome = (
        "Transformation validation PASSED"
        if report.passed
        else f"Transformation validation FAILED: {len(report.feature_mismatches)} mismatches"
    )
    print(outcome)
Validating transformation consistency...
Transformation validation: PASSED
Transformation validation PASSED

11.6 Model Explanations (SHAP)¶

In [11]:
Show/Hide Code
if _SCORING_AVAILABLE:
    import shap

    mlflow.set_tracking_uri(config.mlflow_tracking_uri)
    client = mlflow.tracking.MlflowClient()

    experiment = client.get_experiment_by_name(PIPELINE_NAME)
    # Best parent run = highest recorded best_roc_auc metric in the experiment.
    parent_run = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.best_roc_auc DESC"],
        max_results=1,
    )[0]

    best_model_tag = parent_run.data.tags.get("best_model", "random_forest")
    model_name = f"model_{best_model_tag}"
    if RECOMMENDATIONS_HASH:
        model_name = f"{model_name}_{RECOMMENDATIONS_HASH}"

    # Prefer the child run named after the winning model; fall back to the parent.
    child_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'",
    )
    model_run = next((c for c in child_runs if c.info.run_name == best_model_tag), parent_run)

    model_uri = f"runs:/{model_run.info.run_id}/{model_name}"
    print(f"Loading model: {model_uri}")
    _load = mlflow.xgboost.load_model if best_model_tag == "xgboost" else mlflow.sklearn.load_model
    model = _load(model_uri)
    print(f"Model type: {type(model).__name__}")
Loading model: runs:/bcdd94c57a5342e1bb147779ed6f3f65/model_logistic_regression_e8df49d0
Model type: LogisticRegression
In [12]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # Rebuild the feature matrix and align its columns to the best model's
    # schema so SHAP sees exactly the inputs the model expects.
    X = prepare_features(scoring_features)
    X, _, _ = loader.align_features_to_model(X, model)
    feature_names = list(X.columns)
    print(f"Prepared {len(feature_names)} features for SHAP analysis")
Prepared 263 features for SHAP analysis
In [13]:
Show/Hide Code
if _SCORING_AVAILABLE:
    print("Creating SHAP explainer (may take a moment)...")

    # Tree ensembles (sklearn exposes estimators_, xgboost exposes get_booster)
    # get the fast TreeExplainer; everything else uses a model-agnostic explainer.
    _is_tree_model = hasattr(model, "estimators_") or hasattr(model, "get_booster")

    if _is_tree_model:
        explainer = shap.TreeExplainer(model)
        print(f"Using TreeExplainer ({type(model).__name__})")
    else:
        # Subsample a background dataset to keep the explainer tractable.
        background_size = min(100, len(X))
        background = shap.sample(X, background_size)
        # 2*n_features+1 evaluations — presumably SHAP's minimum budget for the
        # permutation explainer; TODO confirm against the installed shap version.
        _max_evals = 2 * len(feature_names) + 1
        if hasattr(model, "predict_proba"):
            explainer = shap.Explainer(
                model.predict_proba, background,
                feature_names=feature_names, max_evals=_max_evals,
            )
        else:
            explainer = shap.Explainer(
                model, background,
                feature_names=feature_names, max_evals=_max_evals,
            )
        print(f"Using PermutationExplainer (max_evals={_max_evals})")

    print("Computing SHAP values...")
    shap_values = explainer(X)
    print(f"SHAP values computed for {len(shap_values)} records")
Creating SHAP explainer (may take a moment)...
Using PermutationExplainer (max_evals=527)
Computing SHAP values...
PermutationExplainer explainer:  51%|█████     | 252/499 [00:00<?, ?it/s]
PermutationExplainer explainer:  52%|█████▏    | 257/499 [00:10<00:05, 41.76it/s]
PermutationExplainer explainer:  53%|█████▎    | 262/499 [00:10<00:06, 35.43it/s]
PermutationExplainer explainer:  53%|█████▎    | 266/499 [00:10<00:07, 32.67it/s]
PermutationExplainer explainer:  54%|█████▍    | 270/499 [00:10<00:07, 30.37it/s]
PermutationExplainer explainer:  55%|█████▍    | 274/499 [00:10<00:07, 31.61it/s]
PermutationExplainer explainer:  56%|█████▌    | 278/499 [00:10<00:07, 31.11it/s]
PermutationExplainer explainer:  57%|█████▋    | 282/499 [00:10<00:06, 32.03it/s]
PermutationExplainer explainer:  57%|█████▋    | 286/499 [00:11<00:06, 32.23it/s]
PermutationExplainer explainer:  58%|█████▊    | 290/499 [00:11<00:06, 32.60it/s]
PermutationExplainer explainer:  59%|█████▉    | 294/499 [00:11<00:06, 32.55it/s]
PermutationExplainer explainer:  60%|█████▉    | 298/499 [00:11<00:06, 33.41it/s]
PermutationExplainer explainer:  61%|██████    | 302/499 [00:11<00:05, 33.26it/s]
PermutationExplainer explainer:  61%|██████▏   | 306/499 [00:11<00:06, 28.50it/s]
PermutationExplainer explainer:  62%|██████▏   | 310/499 [00:11<00:06, 29.15it/s]
PermutationExplainer explainer:  63%|██████▎   | 314/499 [00:11<00:06, 29.49it/s]
PermutationExplainer explainer:  64%|██████▎   | 318/499 [00:12<00:06, 28.07it/s]
PermutationExplainer explainer:  64%|██████▍   | 321/499 [00:12<00:06, 28.34it/s]
PermutationExplainer explainer:  65%|██████▌   | 325/499 [00:12<00:05, 29.77it/s]
PermutationExplainer explainer:  66%|██████▌   | 329/499 [00:12<00:05, 30.58it/s]
PermutationExplainer explainer:  67%|██████▋   | 333/499 [00:12<00:05, 30.23it/s]
PermutationExplainer explainer:  68%|██████▊   | 337/499 [00:12<00:05, 30.96it/s]
PermutationExplainer explainer:  68%|██████▊   | 341/499 [00:12<00:05, 29.95it/s]
PermutationExplainer explainer:  69%|██████▉   | 345/499 [00:13<00:05, 29.63it/s]
PermutationExplainer explainer:  70%|██████▉   | 348/499 [00:13<00:05, 29.23it/s]
PermutationExplainer explainer:  71%|███████   | 352/499 [00:13<00:04, 30.34it/s]
PermutationExplainer explainer:  71%|███████▏  | 356/499 [00:13<00:04, 31.52it/s]
PermutationExplainer explainer:  72%|███████▏  | 360/499 [00:13<00:04, 30.87it/s]
PermutationExplainer explainer:  73%|███████▎  | 364/499 [00:13<00:04, 31.68it/s]
PermutationExplainer explainer:  74%|███████▎  | 368/499 [00:13<00:04, 31.70it/s]
PermutationExplainer explainer:  75%|███████▍  | 372/499 [00:13<00:03, 32.10it/s]
PermutationExplainer explainer:  75%|███████▌  | 376/499 [00:13<00:03, 32.05it/s]
PermutationExplainer explainer:  76%|███████▌  | 380/499 [00:14<00:03, 32.13it/s]
PermutationExplainer explainer:  77%|███████▋  | 384/499 [00:14<00:03, 32.51it/s]
PermutationExplainer explainer:  78%|███████▊  | 388/499 [00:14<00:03, 33.10it/s]
PermutationExplainer explainer:  79%|███████▊  | 392/499 [00:14<00:03, 32.07it/s]
PermutationExplainer explainer:  79%|███████▉  | 396/499 [00:14<00:03, 32.44it/s]
PermutationExplainer explainer:  80%|████████  | 400/499 [00:14<00:03, 32.46it/s]
PermutationExplainer explainer:  81%|████████  | 404/499 [00:14<00:02, 32.42it/s]
PermutationExplainer explainer:  82%|████████▏ | 408/499 [00:14<00:02, 33.33it/s]
PermutationExplainer explainer:  83%|████████▎ | 412/499 [00:15<00:02, 34.18it/s]
PermutationExplainer explainer:  83%|████████▎ | 416/499 [00:15<00:02, 34.37it/s]
PermutationExplainer explainer:  84%|████████▍ | 420/499 [00:15<00:02, 34.35it/s]
PermutationExplainer explainer:  85%|████████▍ | 424/499 [00:15<00:02, 33.56it/s]
PermutationExplainer explainer:  86%|████████▌ | 428/499 [00:15<00:02, 33.38it/s]
PermutationExplainer explainer:  87%|████████▋ | 432/499 [00:15<00:02, 33.36it/s]
PermutationExplainer explainer:  87%|████████▋ | 436/499 [00:15<00:01, 32.45it/s]
PermutationExplainer explainer:  88%|████████▊ | 440/499 [00:15<00:01, 32.89it/s]
PermutationExplainer explainer:  89%|████████▉ | 444/499 [00:16<00:01, 31.82it/s]
PermutationExplainer explainer:  90%|████████▉ | 448/499 [00:16<00:01, 33.13it/s]
PermutationExplainer explainer:  91%|█████████ | 452/499 [00:16<00:01, 32.89it/s]
PermutationExplainer explainer:  91%|█████████▏| 456/499 [00:16<00:01, 32.72it/s]
PermutationExplainer explainer:  92%|█████████▏| 460/499 [00:16<00:01, 33.59it/s]
PermutationExplainer explainer:  93%|█████████▎| 464/499 [00:16<00:01, 33.69it/s]
PermutationExplainer explainer:  94%|█████████▍| 468/499 [00:16<00:00, 34.09it/s]
PermutationExplainer explainer:  95%|█████████▍| 472/499 [00:16<00:00, 33.90it/s]
PermutationExplainer explainer:  95%|█████████▌| 476/499 [00:17<00:00, 33.60it/s]
PermutationExplainer explainer:  96%|█████████▌| 480/499 [00:17<00:00, 33.24it/s]
PermutationExplainer explainer:  97%|█████████▋| 484/499 [00:17<00:00, 32.86it/s]
PermutationExplainer explainer:  98%|█████████▊| 488/499 [00:17<00:00, 31.85it/s]
PermutationExplainer explainer:  99%|█████████▊| 492/499 [00:17<00:00, 32.33it/s]
PermutationExplainer explainer:  99%|█████████▉| 496/499 [00:17<00:00, 32.19it/s]
PermutationExplainer explainer: 500it [00:17, 32.06it/s]                         
PermutationExplainer explainer: 500it [00:17, 13.96it/s]
SHAP values computed for 499 records

In [14]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # A 3-D explanation (rows, features, classes) comes from predict_proba;
    # keep the positive-class slice only.
    shap_vals = shap_values[:, :, 1] if len(shap_values.shape) == 3 else shap_values

    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_vals, X, feature_names=feature_names, show=False, max_display=20)
    plt.title("Feature Importance (SHAP Summary)")
    plt.tight_layout()
    plt.show()
No description has been provided for this image
In [15]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # Global importance = mean absolute SHAP contribution per feature.
    importance_df = (
        pd.DataFrame({
            "feature": feature_names,
            "importance": np.abs(shap_vals.values).mean(axis=0),
        })
        .sort_values("importance", ascending=False)
    )

    print("Top 15 Most Important Features:")
    display(importance_df.head(15))
Top 15 Most Important Features:
feature importance
252 event_count_365d_x_event_count_all_time 0.004375
235 unsubscribe_date_is_weekend_count_all_time 0.003182
236 campaign_type_nunique_all_time 0.000694
242 subject_line_category_nunique_all_time 0.000685
251 event_count_180d_x_event_count_all_time 0.000442
244 device_type_nunique_all_time 0.000181
216 send_hour_count_all_time 0.000055
218 time_to_open_hours_max_all_time 0.000049
248 event_count_all_time 0.000044
111 send_hour_sum_90d 0.000037
238 opened_nunique_all_time 0.000028
112 send_hour_mean_90d 0.000028
113 send_hour_max_90d 0.000026
228 unsubscribe_date_dow_sum_all_time 0.000022
213 send_hour_sum_all_time 0.000021

11.7 Customer Browser¶

In [16]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # Join raw feature values onto the predictions for per-customer inspection.
    browser_df = predictions_df.merge(
        scoring_features[[ENTITY_KEY] + feature_names],
        on=ENTITY_KEY,
        how="left",
    )

    pred_counts = browser_df["prediction"].value_counts()
    print(f"Customer browser ready with {len(browser_df):,} records")
    print("\nPrediction Distribution:")
    print(f"  Predicted Positive: {pred_counts.get(1, 0):,}")
    print(f"  Predicted Negative: {pred_counts.get(0, 0):,}")
    print(f"\nCorrect Predictions: {browser_df['correct'].sum():,}/{len(browser_df):,} ({browser_df['correct'].mean():.1%})")
Customer browser ready with 499 records

Prediction Distribution:
  Predicted Positive: 0
  Predicted Negative: 499

Correct Predictions: 496/499 (99.4%)
In [17]:
Show/Hide Code
if _SCORING_AVAILABLE:
    def show_customer(idx: int):
        """Print prediction details and a SHAP explanation for one browser row.

        NOTE(review): assumes browser_df, X, and shap_values share the same
        positional row order (that of scoring_features) — confirm; the merge
        in the previous cell preserves predictions_df order, while X was built
        from scoring_features directly.
        """
        row = browser_df.iloc[idx]
        entity_id = row[ENTITY_KEY]

        print(f"=== Customer {entity_id} ===")
        print(f"Prediction: {int(row['prediction'])} (probability: {row['probability']:.3f})")
        print(f"Actual: {int(row['actual'])}")
        print(f"Correct: {'Yes' if row['correct'] else 'No'}")
        print()

        feature_vals = X.iloc[idx]
        # 3-D SHAP output (rows, features, classes): use the positive-class slice.
        if len(shap_values.shape) == 3:
            customer_shap = shap_values[idx, :, 1].values
        else:
            customer_shap = shap_values[idx].values

        # Rank this customer's features by absolute SHAP contribution.
        feature_impact = pd.DataFrame({
            "feature": feature_names,
            "value": feature_vals.values,
            "shap_impact": customer_shap,
        }).sort_values("shap_impact", key=abs, ascending=False)

        print("Top Contributing Features:")
        display(feature_impact.head(10))

        plt.figure(figsize=(10, 6))
        if len(shap_values.shape) == 3:
            shap.plots.waterfall(shap_values[idx, :, 1], max_display=10, show=False)
        else:
            shap.plots.waterfall(shap_values[idx], max_display=10, show=False)
        plt.title(f"SHAP Explanation for Customer {entity_id}")
        plt.tight_layout()
        plt.show()
In [18]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # Walk through a handful of examples to sanity-check the explanations.
    print("Showing first 3 customers:\n")
    for position in range(min(3, len(browser_df))):
        show_customer(position)
        print("\n" + "=" * 60 + "\n")
Showing first 3 customers:

=== Customer 0091D5 ===
Prediction: 0 (probability: 0.000)
Actual: 0
Correct: Yes

Top Contributing Features:
feature value shap_impact
235 unsubscribe_date_is_weekend_count_all_time 17.000000 -0.001448
251 event_count_180d_x_event_count_all_time 34.000000 0.001112
252 event_count_365d_x_event_count_all_time 68.000000 0.000463
238 opened_nunique_all_time 2.000000 -0.000200
250 event_count_180d_x_event_count_365d 8.000000 0.000070
213 send_hour_sum_all_time 1.872622 -0.000044
208 device_type_nunique_365d 2.000000 0.000011
232 unsubscribe_date_is_weekend_sum_all_time 0.000000 0.000011
202 opened_nunique_365d 2.000000 0.000009
173 subject_line_category_nunique_180d 2.000000 0.000007
No description has been provided for this image
============================================================

=== Customer 00CAEF ===
Prediction: 0 (probability: 0.000)
Actual: 0
Correct: Yes

Top Contributing Features:
feature value shap_impact
236 campaign_type_nunique_all_time 4.0 0.001076
251 event_count_180d_x_event_count_all_time 0.0 -0.001026
113 send_hour_max_90d 0.0 -0.000162
252 event_count_365d_x_event_count_all_time 0.0 -0.000122
238 opened_nunique_all_time 1.0 0.000114
242 subject_line_category_nunique_all_time 5.0 0.000064
250 event_count_180d_x_event_count_365d 0.0 -0.000056
235 unsubscribe_date_is_weekend_count_all_time 8.0 0.000035
218 time_to_open_hours_max_all_time 0.0 0.000026
230 unsubscribe_date_dow_max_all_time 5.0 0.000018
No description has been provided for this image
============================================================

=== Customer 014D12 ===
Prediction: 0 (probability: 0.000)
Actual: 0
Correct: Yes

Top Contributing Features:
feature value shap_impact
252 event_count_365d_x_event_count_all_time 34.000000 8.616077e-06
218 time_to_open_hours_max_all_time 1.226219 -2.289508e-06
244 device_type_nunique_all_time 3.000000 -1.834122e-06
248 event_count_all_time 1.358505 -1.474241e-06
235 unsubscribe_date_is_weekend_count_all_time 17.000000 -1.209352e-06
216 send_hour_count_all_time 1.358505 -1.159090e-06
236 campaign_type_nunique_all_time 6.000000 -2.181824e-07
246 bounced_nunique_all_time 2.000000 -1.979600e-07
227 unsubscribe_date_hour_count_all_time 0.000000 -1.930423e-07
231 unsubscribe_date_dow_count_all_time 0.000000 -1.716589e-07
No description has been provided for this image
============================================================

In [19]:
Show/Hide Code
if _SCORING_AVAILABLE:
    def lookup_customer(entity_id):
        """Look up a customer by entity ID and display its SHAP explanation."""
        mask = browser_df[ENTITY_KEY] == entity_id
        if not mask.any():
            print(f"Customer {entity_id} not found in scoring set")
            return
        # Translate the label of the first matching row into the positional
        # index that show_customer expects.
        first_label = browser_df[mask].index[0]
        show_customer(browser_df.index.get_loc(first_label))

    print("Available entity IDs (first 10):")
    print(browser_df[ENTITY_KEY].head(10).tolist())
Available entity IDs (first 10):
['0091D5', '00CAEF', '014D12', '01758D', '01C2FD', '01E1D7', '0242EC', '035822', '03A263', '045255']

11.8 Error Analysis¶

In [20]:
Show/Hide Code
if _SCORING_AVAILABLE:
    # Isolate the misclassified rows, then split them by error type.
    incorrect = browser_df.loc[browser_df["correct"] == 0]
    fp = incorrect.loc[incorrect["prediction"] == 1]  # predicted 1, actual 0
    fn = incorrect.loc[incorrect["prediction"] == 0]  # predicted 0, actual 1

    print(f"Misclassified customers: {len(incorrect):,}")
    print(f"  False Positives: {len(fp):,}")
    print(f"  False Negatives: {len(fn):,}")
Misclassified customers: 3
  False Positives: 0
  False Negatives: 3
In [21]:
Show/Hide Code
if _SCORING_AVAILABLE and len(fp) > 0:
    # Drill into the first false positive to see what misled the model.
    print("\n=== Example False Positive ===")
    show_customer(browser_df.index.get_loc(fp.index[0]))
In [22]:
Show/Hide Code
if _SCORING_AVAILABLE and len(fn) > 0:
    # Drill into the first false negative (a missed churner).
    print("\n=== Example False Negative ===")
    show_customer(browser_df.index.get_loc(fn.index[0]))
=== Example False Negative ===
=== Customer 0C3629 ===
Prediction: 0 (probability: 0.008)
Actual: 1
Correct: No

Top Contributing Features:
feature value shap_impact
235 unsubscribe_date_is_weekend_count_all_time 1.000000 0.081694
252 event_count_365d_x_event_count_all_time 0.000000 -0.079160
251 event_count_180d_x_event_count_all_time 0.000000 -0.004101
242 subject_line_category_nunique_all_time 1.000000 0.003197
244 device_type_nunique_all_time 1.000000 0.003130
214 send_hour_mean_all_time -15.543916 0.002661
236 campaign_type_nunique_all_time 1.000000 0.001949
216 send_hour_count_all_time 0.526589 0.001144
111 send_hour_sum_90d 0.000000 -0.000700
239 opened_mode_all_time 1.000000 -0.000664
No description has been provided for this image

11.9 Export Results¶

In [23]:
Show/Hide Code
if _SCORING_AVAILABLE:
    if is_databricks():
        from customer_retention.core.compat.detection import get_spark_session
        spark = get_spark_session()

    output_dir = config.scoring_output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    # Persist global feature importances as CSV.
    importance_path = output_dir / "feature_importance.csv"
    importance_df.to_csv(importance_path, index=False)
    print(f"Feature importance saved to {importance_path}")

    # Attach per-customer SHAP values for the 10 most important features.
    # A 3-D SHAP array is (rows, features, classes); take class 1.
    top_features = importance_df.head(10)["feature"].tolist()
    is_multiclass = len(shap_values.shape) == 3
    shap_by_entity = pd.DataFrame({ENTITY_KEY: scoring_features[ENTITY_KEY].values})
    for feat in top_features:
        col = feature_names.index(feat)
        feat_shap = shap_values[:, col, 1] if is_multiclass else shap_values[:, col]
        shap_by_entity[f"shap_{feat}"] = feat_shap.values

    detailed_df = predictions_df.merge(shap_by_entity, on=ENTITY_KEY, how="left")
    parquet_path = output_dir / "predictions_with_shap.parquet"
    detailed_df.to_parquet(parquet_path, index=False)
    print(f"Detailed predictions with SHAP saved to {parquet_path}")

    # On Databricks, mirror the results into a managed Delta table.
    if is_databricks():
        table_name = f"{config.catalog}.{config.schema}.scoring_results"
        spark.createDataFrame(detailed_df).write.format("delta") \
            .mode("overwrite").saveAsTable(table_name)
        print(f"Results saved to Delta table: {table_name}")
Feature importance saved to /Users/Vital/python/CustomerRetention/experiments/data/scoring/feature_importance.csv
Detailed predictions with SHAP saved to /Users/Vital/python/CustomerRetention/experiments/data/scoring/predictions_with_shap.parquet

Save Reminder: Save this notebook (Ctrl+S / Cmd+S) before running the next one. The next notebook will automatically export this notebook's HTML documentation from the saved file.