Chapter 6: Feature Opportunities¶

Purpose: Identify and implement feature engineering opportunities to improve model performance.

What you'll learn:

  • How to derive time-based features (tenure, recency, active period)
  • How to create composite engagement scores
  • How to segment customers based on behavior patterns
  • How to encode categorical variables effectively

Outputs:

  • Derived feature recommendations with code examples
  • Composite score formulas (engagement, service adoption)
  • Customer segmentation rules
  • Categorical encoding strategies

Why Feature Engineering Matters¶

Feature Type     | Business Meaning                     | Predictive Power
Tenure           | How long customer has been with us   | Loyalty indicator
Recency          | Days since last order                | Engagement/churn signal
Engagement Score | Combined email metrics               | Overall engagement level
Segments         | High/Low value × Frequent/Infrequent | Risk stratification
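The time-based features in this table can be derived directly in pandas. A minimal sketch on toy data — the column names `created`, `firstorder`, and `lastorder` mirror the dataset used later in this chapter, and the fixed `as_of` date is illustrative:

```python
import pandas as pd

# Toy customer table with the date columns used in this chapter
customers = pd.DataFrame({
    "created": pd.to_datetime(["2023-01-10", "2023-06-01"]),
    "firstorder": pd.to_datetime(["2023-01-15", "2023-06-20"]),
    "lastorder": pd.to_datetime(["2024-02-01", "2023-07-01"]),
})
as_of = pd.Timestamp("2024-03-01")

# Tenure: how long the customer has existed
customers["tenure_days"] = (as_of - customers["created"]).dt.days
# Recency: days since the most recent order
customers["recency_days"] = (as_of - customers["lastorder"]).dt.days
# Active period: span between first and last order
customers["active_period_days"] = (customers["lastorder"] - customers["firstorder"]).dt.days

print(customers[["tenure_days", "recency_days", "active_period_days"]])
```

In a production pipeline `as_of` would come from the dataset's snapshot date rather than a hard-coded timestamp, to avoid leaking future information.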

6.1 Setup¶

In [1]:
from customer_retention.analysis.notebook_progress import track_and_export_previous

track_and_export_previous("06_feature_opportunities.ipynb")

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import yaml

from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine, RecommendationRegistry
from customer_retention.analysis.visualization import ChartBuilder, display_figure
from customer_retention.core.config.column_config import ColumnType
from customer_retention.core.config.experiments import (
    EXPERIMENTS_DIR,
    FINDINGS_DIR,
)
from customer_retention.stages.features import CustomerSegmenter
from customer_retention.stages.profiling import FeatureCapacityAnalyzer
In [2]:
from pathlib import Path

from customer_retention.analysis.auto_explorer import load_notebook_findings, resolve_target_column

FINDINGS_PATH, _namespace, dataset_name = load_notebook_findings(
    "06_feature_opportunities.ipynb", prefer_merged=True
)
print(f"Using: {FINDINGS_PATH}")

RECOMMENDATIONS_PATH = FINDINGS_PATH.replace("_findings.yaml", "_recommendations.yaml")

findings = ExplorationFindings.load(FINDINGS_PATH)
target = resolve_target_column(_namespace, findings)

# Load data
from customer_retention.analysis.auto_explorer.active_dataset_store import load_active_dataset
from customer_retention.core.config.column_config import DatasetGranularity
from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS

if dataset_name is None and _namespace:
    from customer_retention.integrations.adapters.factory import get_delta
    df = get_delta(force_local=True).read(str(_namespace.silver_merged_path))
    data_source = "silver_merged"
    RECOMMENDATIONS_PATH = str(_namespace.merged_recommendations_path)
elif "_aggregated" in FINDINGS_PATH and _namespace:
    from customer_retention.analysis.auto_explorer.active_dataset_store import load_silver_merged
    df = load_silver_merged(_namespace, dataset_name, DatasetGranularity.EVENT_LEVEL)
    data_source = f"aggregated:{dataset_name}"
else:
    df = load_active_dataset(_namespace, dataset_name)
    data_source = dataset_name

charts = ChartBuilder()

if Path(RECOMMENDATIONS_PATH).exists():
    with open(RECOMMENDATIONS_PATH, "r") as f:
        registry = RecommendationRegistry.from_dict(yaml.safe_load(f))
    print(f"Loaded existing recommendations: {len(registry.all_recommendations)} total")
else:
    registry = RecommendationRegistry()
    print("Initialized new recommendation registry")

# Ensure all layers are initialized (even if loaded from file)
if not registry.bronze:
    registry.init_bronze(findings.source_path)
if not registry.silver:
    registry.init_silver(findings.entity_column or "entity_id")
if not registry.gold:
    registry.init_gold(target or "target")
    print("  Initialized gold layer for feature engineering recommendations")

print(f"\nLoaded {len(df):,} rows from: {data_source}")
Using: /Users/Vital/python/CustomerRetention/experiments/runs/retail-e7471284/merged/silver_merged_findings.yaml
Loaded existing recommendations: 1138 total

Loaded 13,354,180 rows from: silver_merged

6.2 Automated Feature Recommendations¶

In [3]:
recommender = RecommendationEngine()
feature_recs = recommender.recommend_features(findings)

print(f"Found {len(feature_recs)} feature engineering opportunities:\n")

for rec in feature_recs:
    print(f"{rec.feature_name}")
    print(f"  Source: {rec.source_column}")
    print(f"  Type: {rec.feature_type}")
    print(f"  Priority: {rec.priority}")
    print(f"  Description: {rec.description}")
    print()
Found 47 feature engineering opportunities:

as_of_date_year
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract year from as_of_date

as_of_date_month
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract month from as_of_date

as_of_date_dayofweek
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract day of week from as_of_date

days_since_as_of_date
  Source: as_of_date
  Type: datetime
  Priority: high
  Description: Days since as_of_date until today

created_year
  Source: created
  Type: temporal
  Priority: medium
  Description: Extract year from created

created_month
  Source: created
  Type: temporal
  Priority: medium
  Description: Extract month from created

created_dayofweek
  Source: created
  Type: temporal
  Priority: medium
  Description: Extract day of week from created

days_since_created
  Source: created
  Type: datetime
  Priority: high
  Description: Days since created until today

firstorder_year
  Source: firstorder
  Type: temporal
  Priority: medium
  Description: Extract year from firstorder

firstorder_month
  Source: firstorder
  Type: temporal
  Priority: medium
  Description: Extract month from firstorder

firstorder_dayofweek
  Source: firstorder
  Type: temporal
  Priority: medium
  Description: Extract day of week from firstorder

days_since_firstorder
  Source: firstorder
  Type: datetime
  Priority: high
  Description: Days since firstorder until today

lastorder_year
  Source: lastorder
  Type: temporal
  Priority: medium
  Description: Extract year from lastorder

lastorder_month
  Source: lastorder
  Type: temporal
  Priority: medium
  Description: Extract month from lastorder

lastorder_dayofweek
  Source: lastorder
  Type: temporal
  Priority: medium
  Description: Extract day of week from lastorder

days_since_lastorder
  Source: lastorder
  Type: datetime
  Priority: high
  Description: Days since lastorder until today

esent_binned
  Source: esent
  Type: numeric
  Priority: low
  Description: Binned version of esent

eopenrate_binned
  Source: eopenrate
  Type: numeric
  Priority: low
  Description: Binned version of eopenrate

eopenrate_log
  Source: eopenrate
  Type: numeric
  Priority: high
  Description: Log transform of eopenrate (high skewness)

eclickrate_binned
  Source: eclickrate
  Type: numeric
  Priority: low
  Description: Binned version of eclickrate

eclickrate_log
  Source: eclickrate
  Type: numeric
  Priority: high
  Description: Log transform of eclickrate (high skewness)

avgorder_binned
  Source: avgorder
  Type: numeric
  Priority: low
  Description: Binned version of avgorder

avgorder_log
  Source: avgorder
  Type: numeric
  Priority: high
  Description: Log transform of avgorder (high skewness)

ordfreq_binned
  Source: ordfreq
  Type: numeric
  Priority: low
  Description: Binned version of ordfreq

ordfreq_log
  Source: ordfreq
  Type: numeric
  Priority: high
  Description: Log transform of ordfreq (high skewness)

favday_sin_cos
  Source: favday
  Type: cyclical
  Priority: high
  Description: Cyclical encoding (sin/cos) for favday

city_encoded
  Source: city
  Type: categorical
  Priority: high
  Description: One-hot encoded city

created_delta_hours_binned
  Source: created_delta_hours
  Type: numeric
  Priority: low
  Description: Binned version of created_delta_hours

created_delta_hours_log
  Source: created_delta_hours
  Type: numeric
  Priority: high
  Description: Log transform of created_delta_hours (high skewness)

created_hour_binned
  Source: created_hour
  Type: numeric
  Priority: low
  Description: Binned version of created_hour

created_dow_binned
  Source: created_dow
  Type: numeric
  Priority: low
  Description: Binned version of created_dow

firstorder_delta_hours_binned
  Source: firstorder_delta_hours
  Type: numeric
  Priority: low
  Description: Binned version of firstorder_delta_hours

firstorder_delta_hours_log
  Source: firstorder_delta_hours
  Type: numeric
  Priority: high
  Description: Log transform of firstorder_delta_hours (high skewness)

firstorder_hour_binned
  Source: firstorder_hour
  Type: numeric
  Priority: low
  Description: Binned version of firstorder_hour

firstorder_dow_binned
  Source: firstorder_dow
  Type: numeric
  Priority: low
  Description: Binned version of firstorder_dow

days_since_created_binned
  Source: days_since_created
  Type: numeric
  Priority: low
  Description: Binned version of days_since_created

days_since_created_log
  Source: days_since_created
  Type: numeric
  Priority: high
  Description: Log transform of days_since_created (high skewness)

days_until_created_binned
  Source: days_until_created
  Type: numeric
  Priority: low
  Description: Binned version of days_until_created

days_until_created_log
  Source: days_until_created
  Type: numeric
  Priority: high
  Description: Log transform of days_until_created (high skewness)

log1p_days_since_created_binned
  Source: log1p_days_since_created
  Type: numeric
  Priority: low
  Description: Binned version of log1p_days_since_created

is_future_created_binned
  Source: is_future_created
  Type: numeric
  Priority: low
  Description: Binned version of is_future_created

days_since_firstorder_binned
  Source: days_since_firstorder
  Type: numeric
  Priority: low
  Description: Binned version of days_since_firstorder

days_since_firstorder_log
  Source: days_since_firstorder
  Type: numeric
  Priority: high
  Description: Log transform of days_since_firstorder (high skewness)

days_until_firstorder_binned
  Source: days_until_firstorder
  Type: numeric
  Priority: low
  Description: Binned version of days_until_firstorder

days_until_firstorder_log
  Source: days_until_firstorder
  Type: numeric
  Priority: high
  Description: Log transform of days_until_firstorder (high skewness)

log1p_days_since_firstorder_binned
  Source: log1p_days_since_firstorder
  Type: numeric
  Priority: low
  Description: Binned version of log1p_days_since_firstorder

is_future_firstorder_binned
  Source: is_future_firstorder
  Type: numeric
  Priority: low
  Description: Binned version of is_future_firstorder
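Most of the transforms recommended above — log transforms for skewed rates, cyclical sin/cos encoding, and one-hot encoding — are one-liners in pandas/NumPy. A minimal sketch on toy data, assuming the column names from the list:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "eopenrate": [0.0, 5.0, 80.0],
    "favday": [0, 3, 6],          # favorite day of week, 0-6
    "city": ["BOM", "DEL", "BOM"],
})

# Log transform for skewed rates (log1p handles zeros safely)
df["eopenrate_log"] = np.log1p(df["eopenrate"])

# Cyclical encoding so day 6 is adjacent to day 0
df["favday_sin"] = np.sin(2 * np.pi * df["favday"] / 7)
df["favday_cos"] = np.cos(2 * np.pi * df["favday"] / 7)

# One-hot encode the categorical
df = pd.get_dummies(df, columns=["city"], prefix="city")
print(df.columns.tolist())
```

The cyclical pair matters because a plain integer encoding would place Saturday (6) maximally far from Sunday (0), even though they are adjacent days.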

6.3 Feature Capacity Analysis¶

📖 Understanding Feature-to-Data Ratios

Before creating new features, it's critical to understand how many features your data can reliably support. This analysis uses the Events Per Variable (EPV) principle:

EPV Level   | Risk Level | Recommendations
EPV ≥ 20    | Low risk   | Stable coefficients, reliable inference
EPV = 10-20 | Moderate   | Standard practice, consider regularization
EPV = 5-10  | Elevated   | Strong regularization required (L1/Lasso)
EPV < 5     | High risk  | Reduce features or collect more data

Key Assumptions:

  1. Minority class drives capacity: For classification, the smaller class limits feature count
  2. Correlated features are redundant: Highly correlated features (r > 0.8) count as ~1 effective feature
  3. Model type matters: Tree models are more flexible than linear models
  4. Regularization helps: L1/L2 penalties allow more features with less data

📊 What This Analysis Provides:

  • Recommended feature counts (conservative/moderate/aggressive)
  • Effective feature count after removing redundancy
  • Model complexity guidance (linear vs tree-based)
  • Segment-specific capacity for multi-model strategies
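The EPV arithmetic behind these recommendations is simple: divide minority-class events by the feature count, and invert the thresholds to get a feature budget. A minimal sketch — the `epv_budget` helper is illustrative, not part of `FeatureCapacityAnalyzer`; the inputs match the run shown below:

```python
def epv_budget(minority_events: int, n_features: int) -> dict:
    """Events-per-variable and recommended feature counts at standard thresholds."""
    return {
        "epv": minority_events / n_features,
        "conservative": minority_events // 20,  # EPV >= 20
        "moderate": minority_events // 10,      # EPV >= 10
        "aggressive": minority_events // 5,     # EPV >= 5
    }

# 2,743,748 minority-class events and 19 numeric features
budget = epv_budget(2_743_748, 19)
print(budget)
```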
In [4]:
# Feature Capacity Analysis
capacity_analyzer = FeatureCapacityAnalyzer()

# Get all potential feature columns (excluding target and identifiers)
feature_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [
        ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE,
        ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL,
        ColumnType.BINARY
    ] and name != target
    and name not in TEMPORAL_METADATA_COLS
]

print("=" * 80)
print("FEATURE CAPACITY ANALYSIS")
print("=" * 80)

if target:
    # Analyze capacity with current features
    numeric_features = [
        name for name, col in findings.columns.items()
        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]
        and name != target
    ]

    capacity_result = capacity_analyzer.analyze(
        df,
        feature_cols=numeric_features,
        target_col=target,
    )

    print("\n📊 DATA SUMMARY:")
    print(f"   Total samples: {capacity_result.total_samples:,}")
    print(f"   Minority class samples: {capacity_result.minority_class_samples:,}")
    print(f"   Minority class rate: {capacity_result.minority_class_samples/capacity_result.total_samples:.1%}")
    print(f"   Current numeric features: {capacity_result.total_features}")

    print("\n📈 FEATURE CAPACITY METRICS:")
    print(f"   Events Per Variable (EPV): {capacity_result.events_per_variable:.1f}")
    print(f"   Samples Per Feature: {capacity_result.samples_per_feature:.1f}")
    print(f"   Capacity Status: {capacity_result.capacity_status.upper()}")

    # Capacity status visualization
    status_colors = {"adequate": "#2ecc71", "limited": "#f39c12", "inadequate": "#e74c3c"}
    status_color = status_colors.get(capacity_result.capacity_status, "#95a5a6")

    print("\n🎯 RECOMMENDED FEATURE COUNTS:")
    print(f"   Conservative (EPV=20): {capacity_result.recommended_features_conservative} features")
    print(f"   Moderate (EPV=10):     {capacity_result.recommended_features_moderate} features")
    print(f"   Aggressive (EPV=5):    {capacity_result.recommended_features_aggressive} features")

    # Effective features analysis
    if capacity_result.effective_features_result:
        eff = capacity_result.effective_features_result
        print("\n🔍 EFFECTIVE FEATURES (accounting for correlation):")
        print(f"   Total features analyzed: {eff.total_count}")
        print(f"   Effective independent features: {eff.effective_count:.1f}")
        print(f"   Redundant features identified: {len(eff.redundant_features)}")

        if eff.redundant_features:
            print("\n   ⚠️ Redundant features (highly correlated):")
            for feat in eff.redundant_features[:5]:
                print(f"      • {feat}")

        if eff.feature_clusters:
            print(f"\n   📦 Correlated feature clusters ({len(eff.feature_clusters)}):")
            for i, cluster in enumerate(eff.feature_clusters[:3]):
                print(f"      Cluster {i+1}: {', '.join(cluster[:4])}")
                if len(cluster) > 4:
                    print(f"                  ... and {len(cluster)-4} more")

    # Persist feature capacity to registry
    registry.add_bronze_feature_capacity(
        epv=capacity_result.events_per_variable,
        capacity_status=capacity_result.capacity_status,
        recommended_features=capacity_result.recommended_features_moderate,
        current_features=capacity_result.total_features,
        rationale=f"EPV={capacity_result.events_per_variable:.1f}, status={capacity_result.capacity_status}",
        source_notebook="06_feature_opportunities"
    )
    print("\n✅ Persisted feature capacity recommendation to registry")

    # Store capacity info in findings
    findings.metadata["feature_capacity"] = capacity_result.to_dict()
else:
    print("\n⚠️ No target column detected. Capacity analysis requires a target variable.")
================================================================================
FEATURE CAPACITY ANALYSIS
================================================================================
📊 DATA SUMMARY:
   Total samples: 13,354,180
   Minority class samples: 2,743,748
   Minority class rate: 20.5%
   Current numeric features: 19

📈 FEATURE CAPACITY METRICS:
   Events Per Variable (EPV): 144407.8
   Samples Per Feature: 702851.6
   Capacity Status: ADEQUATE

🎯 RECOMMENDED FEATURE COUNTS:
   Conservative (EPV=20): 137187 features
   Moderate (EPV=10):     274374 features
   Aggressive (EPV=5):    548749 features

🔍 EFFECTIVE FEATURES (accounting for correlation):
   Total features analyzed: 19
   Effective independent features: 14.0
   Redundant features identified: 5

   ⚠️ Redundant features (highly correlated):
      • days_until_created
      • days_since_firstorder
      • days_since_created
      • days_until_firstorder
      • firstorder_delta_hours

   📦 Correlated feature clusters (1):
      Cluster 1: created_delta_hours, firstorder_delta_hours, days_since_created, days_until_created
                  ... and 2 more

✅ Persisted feature capacity recommendation to registry
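The effective-feature count above comes from collapsing highly correlated columns. The idea can be sketched with a greedy correlation filter — the `prune_redundant` helper is illustrative, not the analyzer's implementation; the `0.8` threshold matches the r > 0.8 rule stated earlier:

```python
import numpy as np
import pandas as pd

def prune_redundant(df: pd.DataFrame, threshold: float = 0.8) -> list[str]:
    """Greedily keep features; drop any whose |correlation| with a kept feature exceeds threshold."""
    corr = df.corr().abs()
    kept: list[str] = []
    for col in df.columns:
        if all(corr.loc[col, k] <= threshold for k in kept):
            kept.append(col)
    return kept

rng = np.random.default_rng(0)
base = rng.normal(size=500)
toy = pd.DataFrame({
    "a": base,
    "b": base + rng.normal(scale=0.05, size=500),  # near-duplicate of a
    "c": rng.normal(size=500),                      # independent
})
print(prune_redundant(toy))  # 'b' is pruned as redundant with 'a'
```

Greedy pruning is order-dependent; a clustering approach (as the analyzer's "feature clusters" output suggests) is more robust but the capacity intuition is the same.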

6.3.1 Model Complexity Guidance¶

Based on your data capacity, here's guidance on model complexity and feature limits.

In [5]:
# Model Complexity Guidance
if target and 'capacity_result' in globals():
    guidance = capacity_result.complexity_guidance

    print("=" * 70)
    print("MODEL COMPLEXITY GUIDANCE")
    print("=" * 70)

    # Create visualization of feature limits by model type
    model_types = ["Linear\n(no regularization)", "Regularized\n(L1/L2)", "Tree-based\n(RF/XGBoost)"]
    max_features = [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree]
    current_features = capacity_result.total_features

    colors = ['#e74c3c' if m < current_features else '#2ecc71' for m in max_features]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=model_types,
        y=max_features,
        marker_color=colors,
        text=[f"{m}" for m in max_features],
        textposition='outside',
        name='Max Features'
    ))

    # Add horizontal line for current feature count
    fig.add_hline(
        y=current_features,
        line_dash="dash",
        line_color="#3498db",
        annotation_text=f"Current: {current_features}",
        annotation_position="right"
    )

    # Calculate y-axis range to fit labels
    max_val = max(max_features)
    fig.update_layout(
        title="Maximum Recommended Features by Model Type",
        xaxis_title="Model Type",
        yaxis_title="Max Features",
        yaxis_range=[0, max_val * 1.15],  # Add 15% headroom for labels
        template='plotly_white',
        height=400,
        showlegend=False,
    )

    display_figure(fig)

    print(f"\n🎯 RECOMMENDED MODEL TYPE: {guidance.recommended_model_type.replace('_', ' ').title()}")

    print("\n📋 MODEL-SPECIFIC RECOMMENDATIONS:")
    for rec in guidance.model_recommendations:
        print(f"   • {rec}")

    print("\n💡 GENERAL GUIDANCE:")
    for rec in guidance.recommendations:
        print(f"   {rec}")

    # Summary table
    print("\n" + "-" * 70)
    print("FEATURE BUDGET SUMMARY:")
    print("-" * 70)
    summary_data = {
        "Model Type": ["Linear (no regularization)", "Regularized (L1/L2)", "Tree-based"],
        "Max Features": [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree],
        "Current": [current_features] * 3,
        "Status": [
            "✅ OK" if guidance.max_features_linear >= current_features else "⚠️ Reduce",
            "✅ OK" if guidance.max_features_regularized >= current_features else "⚠️ Reduce",
            "✅ OK" if guidance.max_features_tree >= current_features else "⚠️ Reduce"
        ]
    }
    display(pd.DataFrame(summary_data))

    # Persist model type recommendation to registry
    registry.add_bronze_model_type(
        model_type=guidance.recommended_model_type,
        max_features_linear=guidance.max_features_linear,
        max_features_regularized=guidance.max_features_regularized,
        max_features_tree=guidance.max_features_tree,
        rationale=f"Recommended: {guidance.recommended_model_type}",
        source_notebook="06_feature_opportunities"
    )
    print(f"\n✅ Persisted model type recommendation to registry: {guidance.recommended_model_type}")
======================================================================
MODEL COMPLEXITY GUIDANCE
======================================================================
[Figure: Maximum Recommended Features by Model Type]
🎯 RECOMMENDED MODEL TYPE: Linear

📋 MODEL-SPECIFIC RECOMMENDATIONS:
   • Adequate data for standard logistic regression
   • Can use all features without regularization
   • Consider tree models for comparison

💡 GENERAL GUIDANCE:
   Adequate: EPV=144407.8. Sufficient data for robust modeling.

----------------------------------------------------------------------
FEATURE BUDGET SUMMARY:
----------------------------------------------------------------------
   Model Type                  Max Features  Current  Status
0  Linear (no regularization)        274374       19  ✅ OK
1  Regularized (L1/L2)               548749       19  ✅ OK
2  Tree-based                        445139       19  ✅ OK
✅ Persisted model type recommendation to registry: linear

6.3.2 Segment-Specific Capacity (for Multi-Model Strategy)¶

When considering separate models per customer segment, each segment must have sufficient data to support the feature set. This analysis shows whether segmented modeling is viable.

📖 Single Model vs Segment Models:

Approach       | When to Use                   | Pros                         | Cons
Single Model   | Small data, uniform segments  | More data per model, simpler | May miss segment-specific patterns
Segment Models | Large data, distinct segments | Tailored patterns            | Need sufficient data per segment
Hybrid         | Mixed segment sizes           | Best of both                 | More complex to maintain
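The per-segment viability check boils down to counting minority-class events in each segment and comparing against an EPV threshold. An illustrative sketch — the `segment_epv` helper and toy column names are assumptions, not the analyzer's API:

```python
import pandas as pd

def segment_epv(df: pd.DataFrame, segment_col: str, target_col: str, n_features: int) -> pd.DataFrame:
    """Per-segment minority-class events and EPV for a given feature count."""
    rows = []
    for seg, grp in df.groupby(segment_col):
        minority = grp[target_col].value_counts().min()
        rows.append({
            "segment": seg,
            "samples": len(grp),
            "minority_events": minority,
            "epv": minority / n_features,
            "viable_epv10": minority / n_features >= 10,
        })
    return pd.DataFrame(rows)

toy = pd.DataFrame({
    "city": ["BOM"] * 300 + ["BLR"] * 40,
    "churned": [1] * 60 + [0] * 240 + [1] * 4 + [0] * 36,
})
print(segment_epv(toy, "city", "churned", n_features=5))
```

In this toy example BOM clears EPV ≥ 10 while BLR does not, so BLR would fall back to the single (pooled) model under a hybrid strategy.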
In [6]:
# Segment Capacity Analysis
categorical_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]
    and name not in TEMPORAL_METADATA_COLS
]

print("=" * 70)
print("SEGMENT CAPACITY ANALYSIS")
print("=" * 70)

if target and categorical_cols and 'numeric_features' in globals():
    # Analyze the first categorical column as potential segment
    segment_col = categorical_cols[0]

    print(f"\n📊 Analyzing segments by: {segment_col}")
    print(f"   Features to evaluate: {len(numeric_features)}")

    segment_result = capacity_analyzer.analyze_segment_capacity(
        df,
        feature_cols=numeric_features,
        target_col=target,
        segment_col=segment_col,
    )

    print(f"\n🎯 RECOMMENDED STRATEGY: {segment_result.recommended_strategy.replace('_', ' ').title()}")
    print(f"   Reason: {segment_result.strategy_reason}")

    # Segment details table
    segment_data = []
    for seg_name, cap in segment_result.segment_capacities.items():
        segment_data.append({
            "Segment": seg_name,
            "Samples": cap.total_samples,
            "Minority Events": cap.minority_class_samples,
            "EPV": f"{cap.events_per_variable:.1f}",
            "Max Features (EPV=10)": cap.recommended_features_moderate,
            "Status": cap.capacity_status.title()
        })

    segment_df = pd.DataFrame(segment_data)
    segment_df = segment_df.sort_values("Samples", ascending=False)
    display(segment_df)

    # Visualization
    fig = go.Figure()

    max_events = 0
    for seg_name, cap in segment_result.segment_capacities.items():
        color = "#2ecc71" if cap.capacity_status == "adequate" else "#f39c12" if cap.capacity_status == "limited" else "#e74c3c"
        fig.add_trace(go.Bar(
            name=seg_name,
            x=[seg_name],
            y=[cap.minority_class_samples],
            marker_color=color,
            text=[f"EPV={cap.events_per_variable:.1f}"],
            textposition='outside'
        ))
        max_events = max(max_events, cap.minority_class_samples)

    # Add threshold line
    threshold_events = len(numeric_features) * 10  # EPV=10 threshold
    fig.add_hline(
        y=threshold_events,
        line_dash="dash",
        line_color="#3498db",
        annotation_text=f"Min events for {len(numeric_features)} features (EPV=10)",
        annotation_position="right"
    )

    # Calculate y-axis range to fit labels
    y_max = max(max_events, threshold_events)
    fig.update_layout(
        title=f"Minority Class Events by Segment ({segment_col})",
        xaxis_title="Segment",
        yaxis_title="Minority Class Events",
        yaxis_range=[0, y_max * 1.15],  # Add 15% headroom for labels
        template='plotly_white',
        height=400,
        showlegend=False,
    )
    display_figure(fig)

    print("\n📋 SEGMENT RECOMMENDATIONS:")
    for rec in segment_result.recommendations:
        print(f"   {rec}")

    if segment_result.viable_segments:
        print(f"\n   ✅ Viable for separate models: {', '.join(segment_result.viable_segments)}")
    if segment_result.insufficient_segments:
        print(f"   ⚠️ Insufficient data: {', '.join(segment_result.insufficient_segments)}")

    # Store in findings
    findings.metadata["segment_capacity"] = segment_result.to_dict()
else:
    print("\n⚠️ No categorical columns available for segment analysis.")
    print("   Segment capacity analysis requires at least one categorical column.")
======================================================================
SEGMENT CAPACITY ANALYSIS
======================================================================

📊 Analyzing segments by: city
   Features to evaluate: 19
🎯 RECOMMENDED STRATEGY: Segment Models
   Reason: All segments have adequate data for separate models.
   Segment  Samples  Minority Events      EPV  Max Features (EPV=10)    Status
1      BOM  5014436          1006012  52948.0                 100601  Adequate
0      DEL  3810520          1006012  52948.0                 100601  Adequate
2      MAA  3793160           638848  33623.6                  63884  Adequate
3      BLR   736064            92876   4888.2                   9287  Adequate
[Figure: Minority Class Events by Segment (city)]
📋 SEGMENT RECOMMENDATIONS:
   ✅ All 4 segments have sufficient data for independent models.
   Consider: Separate models may capture segment-specific patterns better.

   ✅ Viable for separate models: DEL, BOM, MAA, BLR

6.3.3 Feature Capacity Action Items¶

Based on the analysis above, here are the key considerations for feature engineering:

In [7]:
# Feature Capacity Action Items Summary
if target and 'capacity_result' in globals():
    print("=" * 70)
    print("FEATURE CAPACITY ACTION ITEMS")
    print("=" * 70)

    print("\n📋 BASED ON YOUR DATA CAPACITY:")

    # Action items based on capacity status
    if capacity_result.capacity_status == "adequate":
        print("\n✅ ADEQUATE CAPACITY - You have room to add features")
        print(f"   • Current features: {capacity_result.total_features}")
        print(f"   • Can add up to: {capacity_result.recommended_features_moderate - capacity_result.total_features} more features (EPV=10)")
        print("   • Consider: Creating derived features from datetime and categorical columns")
    elif capacity_result.capacity_status == "limited":
        print("\n⚠️ LIMITED CAPACITY - Be selective with new features")
        print(f"   • Current features: {capacity_result.total_features}")
        print(f"   • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)")
        print(f"   • Action: Remove {max(0, capacity_result.total_features - capacity_result.recommended_features_moderate)} redundant features before adding new ones")
        print("   • Consider: Using regularization (L1/Lasso) if keeping all features")
    else:
        print("\n🔴 INADEQUATE CAPACITY - Reduce features or get more data")
        print(f"   • Current features: {capacity_result.total_features}")
        print(f"   • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)")
        print(f"   • CRITICAL: Reduce to {capacity_result.recommended_features_conservative} features for stable estimates")
        print("   • Options: (1) Feature selection, (2) PCA, (3) Collect more data")

    # Redundancy recommendations
    if capacity_result.effective_features_result and capacity_result.effective_features_result.redundant_features:
        redundant = capacity_result.effective_features_result.redundant_features
        print("\n🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:")
        print("   These features are highly correlated with others and add little new information:")
        for feat in redundant[:5]:
            print(f"   • {feat}")
        if len(redundant) > 5:
            print(f"   ... and {len(redundant) - 5} more")

    # New feature budget
    print("\n💰 FEATURE BUDGET FOR NEW FEATURES:")
    remaining_budget = capacity_result.recommended_features_moderate - capacity_result.total_features
    if remaining_budget > 0:
        print(f"   You can safely add {remaining_budget} new features")
        print("   Prioritize:")
        print("   • Recency features (days_since_last_activity)")
        print("   • Tenure features (days_since_created)")
        print("   • Engagement composites (email_engagement_score)")
    else:
        print(f"   ⚠️ At or over capacity. Remove {-remaining_budget} features before adding new ones.")

    # Model selection summary
    print("\n🎯 RECOMMENDED MODELING APPROACH:")
    if capacity_result.complexity_guidance:
        print(f"   Model type: {capacity_result.complexity_guidance.recommended_model_type.replace('_', ' ').title()}")
        if "regularized" in capacity_result.complexity_guidance.recommended_model_type:
            print("   → Use Lasso (L1) for automatic feature selection")
            print("   → Use Ridge (L2) if you want to keep all features")
        elif "tree" in capacity_result.complexity_guidance.recommended_model_type:
            print("   → Random Forest or XGBoost recommended")
            print("   → Trees handle correlated features naturally")

    print("\n" + "=" * 70)
======================================================================
FEATURE CAPACITY ACTION ITEMS
======================================================================

📋 BASED ON YOUR DATA CAPACITY:

✅ ADEQUATE CAPACITY - You have room to add features
   • Current features: 19
   • Can add up to: 274355 more features (EPV=10)
   • Consider: Creating derived features from datetime and categorical columns

🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:
   These features are highly correlated with others and add little new information:
   • days_until_created
   • days_since_firstorder
   • days_since_created
   • days_until_firstorder
   • firstorder_delta_hours

💰 FEATURE BUDGET FOR NEW FEATURES:
   You can safely add 274355 new features
   Prioritize:
   • Recency features (days_since_last_activity)
   • Tenure features (days_since_created)
   • Engagement composites (email_engagement_score)

🎯 RECOMMENDED MODELING APPROACH:
   Model type: Linear

======================================================================
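One of the prioritized additions above is an engagement composite built from the email metrics. A simple equal-weight version scales each metric to [0, 1] and averages; a sketch — the `engagement_score` helper and the weighting scheme are illustrative (note min-max scaling divides by zero on a constant column):

```python
import pandas as pd

def engagement_score(df: pd.DataFrame, cols=("esent", "eopenrate", "eclickrate")) -> pd.Series:
    """Equal-weight composite: min-max scale each metric to [0, 1], then average."""
    scaled = pd.DataFrame({
        c: (df[c] - df[c].min()) / (df[c].max() - df[c].min())
        for c in cols
    })
    return scaled.mean(axis=1)

toy = pd.DataFrame({
    "esent": [0, 50, 100],
    "eopenrate": [0.0, 25.0, 50.0],
    "eclickrate": [0.0, 5.0, 10.0],
})
print(engagement_score(toy).tolist())  # [0.0, 0.5, 1.0]
```

Unequal weights (e.g. weighting clicks above opens) are a natural refinement once the score's correlation with the target is measured.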

6.3.4 Feature Availability Issues¶

Features with tracking changes (new systems, retired systems) require special handling before modeling.

In [8]:
# Feature Availability Analysis
from customer_retention.stages.features.feature_selector import FeatureSelector

print("=" * 70)
print("FEATURE AVAILABILITY ANALYSIS")
print("=" * 70)

unavailable_features = []
if findings.has_availability_issues:
    selector = FeatureSelector(target_column=target)
    availability_recs = selector.get_availability_recommendations(findings.feature_availability)
    unavailable_features = [rec.column for rec in availability_recs]

    print(f"\n⚠️  {len(availability_recs)} feature(s) have tracking changes:\n")

    for rec in availability_recs:
        print(f"📌 {rec.column}")
        print(f"   Issue: {rec.issue_type} | Coverage: {rec.coverage_pct:.0f}%")
        print(f"   Available: {rec.first_valid_date} → {rec.last_valid_date}")
        print("\n   Remediation options:")
        for opt in rec.options:
            marker = "→" if opt.get("recommended") else " "
            print(f"   {marker} [{opt['type']}] {opt['description']}")
        print()

    print("-" * 70)
    print("RECOMMENDED ACTION: Remove unavailable features before modeling")
    print("-" * 70)
    print(f"\nFeatures to exclude: {', '.join(unavailable_features)}")
    print("\nAlternative approaches (require additional implementation):")
    print("  • segment_by_cohort: Train separate models for different time periods")
    print("  • add_indicator: Create availability flags, impute missing values")
    print("  • filter_window: Restrict training data to feature's available period")

    findings.metadata["unavailable_features"] = unavailable_features
    findings.metadata["availability_action"] = "exclude"
else:
    print("\n✅ All features have full temporal coverage - no availability issues.")
======================================================================
FEATURE AVAILABILITY ANALYSIS
======================================================================

✅ All features have full temporal coverage - no availability issues.
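No issues were found here, but if a feature had shown partial coverage, the `add_indicator` option listed above could be sketched as follows. This is a minimal pandas sketch with a hypothetical column (`promo_score`) and cutover date, not the framework's implementation:

```python
import numpy as np
import pandas as pd

# Hypothetical: a feature tracked only from 2017-06-01 onward.
df = pd.DataFrame({
    "as_of_date": pd.to_datetime(["2017-01-15", "2017-08-01", "2017-12-20"]),
    "promo_score": [np.nan, 0.4, 0.7],
})
first_valid = pd.Timestamp("2017-06-01")

# add_indicator: flag rows from before the feature existed, then impute the
# untracked period with a neutral value (here, the tracked-period median).
df["promo_score_available"] = (df["as_of_date"] >= first_valid).astype(int)
tracked_median = df.loc[df["promo_score_available"] == 1, "promo_score"].median()
df["promo_score"] = df["promo_score"].fillna(tracked_median)
```

The availability flag lets the model learn a separate effect for the untracked period instead of treating imputed values as real observations.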

6.4 Datetime Feature Opportunities¶

In [9]:
Show/Hide Code
datetime_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type == ColumnType.DATETIME
]

if datetime_cols:
    print("Datetime Feature Opportunities:")
    print("="*50)
    for col in datetime_cols:
        print(f"\n{col}:")
        print(f"  - {col}_year: Extract year")
        print(f"  - {col}_month: Extract month")
        print(f"  - {col}_day: Extract day of month")
        print(f"  - {col}_dayofweek: Extract day of week (0-6)")
        print(f"  - {col}_is_weekend: Is weekend flag")
        print(f"  - days_since_{col}: Days since date")
else:
    print("No datetime columns found.")
Datetime Feature Opportunities:
==================================================

as_of_date:
  - as_of_date_year: Extract year
  - as_of_date_month: Extract month
  - as_of_date_day: Extract day of month
  - as_of_date_dayofweek: Extract day of week (0-6)
  - as_of_date_is_weekend: Is weekend flag
  - days_since_as_of_date: Days since date

created:
  - created_year: Extract year
  - created_month: Extract month
  - created_day: Extract day of month
  - created_dayofweek: Extract day of week (0-6)
  - created_is_weekend: Is weekend flag
  - days_since_created: Days since date

firstorder:
  - firstorder_year: Extract year
  - firstorder_month: Extract month
  - firstorder_day: Extract day of month
  - firstorder_dayofweek: Extract day of week (0-6)
  - firstorder_is_weekend: Is weekend flag
  - days_since_firstorder: Days since date

lastorder:
  - lastorder_year: Extract year
  - lastorder_month: Extract month
  - lastorder_day: Extract day of month
  - lastorder_dayofweek: Extract day of week (0-6)
  - lastorder_is_weekend: Is weekend flag
  - days_since_lastorder: Days since date
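The extractions listed above map directly onto the pandas `.dt` accessor. A minimal sketch for one column (`created`, as in this dataset; the reference date is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"created": pd.to_datetime(["2017-03-04", "2017-07-09"])})
reference_date = pd.Timestamp("2018-01-21")

df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
df["created_dayofweek"] = df["created"].dt.dayofweek  # Monday=0 ... Sunday=6
df["created_is_weekend"] = (df["created"].dt.dayofweek >= 5).astype(int)
df["days_since_created"] = (reference_date - df["created"]).dt.days
```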

6.5 Business-Driven Derived Features¶

These features are based on domain knowledge from the reference analysis (my_take Phase 1).

📖 Key Derived Features:

  • Tenure Days: Days from account creation to analysis date
  • Days Since Last Order: Recency indicator (critical for churn)
  • Active Period Days: Duration of customer activity
  • Email Engagement Score: Composite of open rate and click rate
  • Click-to-Open Ratio: Quality of email engagement
  • Service Adoption Score: Sum of service flags (paperless, refill, doorstep)
In [10]:
Show/Hide Code
print("=" * 70)
print("CREATING DERIVED FEATURES")
print("=" * 70)

segmenter = CustomerSegmenter()
df_features = df.copy()

datetime_cols = [name for name, col in findings.columns.items()
                 if col.inferred_type == ColumnType.DATETIME
                 and name not in TEMPORAL_METADATA_COLS]
binary_cols = [name for name, col in findings.columns.items()
               if col.inferred_type == ColumnType.BINARY
               and name not in TEMPORAL_METADATA_COLS]
numeric_cols = [name for name, col in findings.columns.items()
                if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]]

from customer_retention.core.compat import as_tz_naive

for col in datetime_cols:
    df_features[col] = as_tz_naive(pd.to_datetime(df_features[col], errors='coerce', format='mixed'))

reference_date = pd.Timestamp.now()
if datetime_cols:
    last_dates = [df_features[col].max() for col in datetime_cols if df_features[col].notna().any()]
    if last_dates:
        reference_date = max(last_dates)
print(f"\nReference date: {reference_date}")

print("\n📅 TIME-BASED FEATURES:")
created_cols = [c for c in datetime_cols if 'creat' in c.lower() or 'signup' in c.lower() or 'register' in c.lower()]
if created_cols:
    created_col = created_cols[0]
    df_features = segmenter.create_tenure_features(df_features, created_column=created_col, reference_date=reference_date)
    print(f"  ✓ tenure_days from {created_col}")
    registry.add_silver_derived(
        column="tenure_days",
        expression=f"(reference_date - {created_col}).days",
        feature_type="tenure",
        rationale=f"Customer tenure in days from {created_col}",
        source_notebook="06_feature_opportunities"
    )

activity_cols = [c for c in datetime_cols if 'last' in c.lower() or 'recent' in c.lower()]
if activity_cols:
    activity_col = activity_cols[0]
    df_features = segmenter.create_recency_features(df_features, last_activity_column=activity_col,
                                                     reference_date=reference_date, output_column='days_since_last_activity')
    print(f"  ✓ days_since_last_activity from {activity_col}")
    registry.add_silver_derived(
        column="days_since_last_activity",
        expression=f"(reference_date - {activity_col}).days",
        feature_type="recency",
        rationale=f"Days since last activity from {activity_col}",
        source_notebook="06_feature_opportunities"
    )

print("\n📧 ENGAGEMENT FEATURES:")
rate_cols = [c for c in numeric_cols if 'rate' in c.lower() or 'pct' in c.lower() or 'percent' in c.lower()]
open_rate_cols = [c for c in rate_cols if 'open' in c.lower()]
click_rate_cols = [c for c in rate_cols if 'click' in c.lower()]

if open_rate_cols and click_rate_cols:
    open_col, click_col = open_rate_cols[0], click_rate_cols[0]
    df_features = segmenter.create_engagement_score(df_features, open_rate_column=open_col,
                                                     click_rate_column=click_col, output_column='email_engagement_score')
    print(f"  ✓ email_engagement_score from {open_col}, {click_col}")
    registry.add_silver_derived(
        column="email_engagement_score",
        expression=f"0.6 * {open_col} + 0.4 * {click_col}",
        feature_type="composite",
        rationale=f"Weighted engagement score from {open_col} and {click_col}",
        source_notebook="06_feature_opportunities"
    )

    df_features['click_to_open_rate'] = np.where(df_features[open_col] > 0, df_features[click_col] / df_features[open_col], 0)
    print("  ✓ click_to_open_rate")
    registry.add_silver_ratio(
        column="click_to_open_rate",
        numerator=click_col,
        denominator=open_col,
        rationale=f"Click-to-open ratio: {click_col} / {open_col}",
        source_notebook="06_feature_opportunities"
    )

print("\n🔧 SERVICE ADOPTION:")
if binary_cols:
    service_binary = [c for c in binary_cols if c != target]
    if service_binary:
        df_features['service_adoption_score'] = df_features[service_binary].sum(axis=1)
        print(f"  ✓ service_adoption_score from {service_binary}")
        registry.add_silver_derived(
            column="service_adoption_score",
            expression=f"sum([{', '.join(service_binary)}])",
            feature_type="composite",
            rationale=f"Service adoption count from {len(service_binary)} binary flags",
            source_notebook="06_feature_opportunities"
        )

print("\n💰 VALUE FEATURES:")
value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]
freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]
if value_cols and freq_cols:
    df_features['value_frequency_product'] = df_features[value_cols[0]] * df_features[freq_cols[0]]
    print(f"  ✓ value_frequency_product from {value_cols[0]}, {freq_cols[0]}")
    registry.add_silver_interaction(
        column="value_frequency_product",
        features=[value_cols[0], freq_cols[0]],
        rationale=f"Value-frequency interaction: {value_cols[0]} × {freq_cols[0]}",
        source_notebook="06_feature_opportunities"
    )

new_cols = len(df_features.columns) - len(df.columns)
print(f"\n✓ Created {new_cols} new features (total: {len(df_features.columns)})")
persisted = [c for c in ['tenure_days', 'days_since_last_activity', 'email_engagement_score',
                         'click_to_open_rate', 'service_adoption_score', 'value_frequency_product']
             if c in df_features.columns]
print(f"✅ Persisted {len(persisted)} derived feature recommendations to registry")
======================================================================
CREATING DERIVED FEATURES
======================================================================
Reference date: 2018-01-21 00:00:00

📅 TIME-BASED FEATURES:
  ✓ tenure_days from created
  ✓ days_since_last_activity from lastorder

📧 ENGAGEMENT FEATURES:
  ✓ email_engagement_score from eopenrate, eclickrate
  ✓ click_to_open_rate

🔧 SERVICE ADOPTION:
  ✓ service_adoption_score from ['paperless', 'refill', 'doorstep', 'created_is_weekend', 'firstorder_is_weekend', 'is_missing_created', 'is_missing_firstorder']

💰 VALUE FEATURES:
  ✓ value_frequency_product from avgorder, ordfreq

✓ Created 8 new features (total: 43)
✅ Persisted 6 derived feature recommendations to registry
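Stripped of the framework helpers, the formulas behind these features reduce to a few lines of pandas. This sketch uses the weights recorded in the registry expressions above (0.6/0.4); the `CustomerSegmenter` internals may differ:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "created": pd.to_datetime(["2016-05-01", "2017-06-15"]),
    "lastorder": pd.to_datetime(["2017-12-01", "2018-01-10"]),
    "eopenrate": [40.0, 0.0],
    "eclickrate": [10.0, 0.0],
})
reference_date = pd.Timestamp("2018-01-21")

df["tenure_days"] = (reference_date - df["created"]).dt.days
df["days_since_last_activity"] = (reference_date - df["lastorder"]).dt.days
df["email_engagement_score"] = 0.6 * df["eopenrate"] + 0.4 * df["eclickrate"]
# Guard against division by zero for customers who never opened an email.
df["click_to_open_rate"] = np.where(
    df["eopenrate"] > 0, df["eclickrate"] / df["eopenrate"], 0.0
)
```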

6.6 Customer Segmentation Features¶

Create business-meaningful segments for analysis and modeling.

📖 Segmentation Strategy:

  • Value Dimension: High vs Low (based on avgorder median)
  • Frequency Dimension: Frequent vs Infrequent (based on ordfreq median)
  • Recency Buckets: Active, Recent, Lapsing, Dormant
In [11]:
Show/Hide Code
print("=" * 70)
print("CUSTOMER SEGMENTATION")
print("=" * 70)

print("\n🎯 VALUE-FREQUENCY SEGMENTS:")
value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]
freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]

if value_cols and freq_cols:
    df_features, vf_result = segmenter.segment_by_value_frequency(
        df_features, value_column=value_cols[0], frequency_column=freq_cols[0])
    print(f"  Using {value_cols[0]} × {freq_cols[0]}")
    for seg in vf_result.segments:
        print(f"    {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
else:
    print("  No suitable value/frequency columns found")

print("\n📅 RECENCY SEGMENTS:")
if 'days_since_last_activity' in df_features.columns:
    df_features, recency_result = segmenter.segment_by_recency(df_features, days_since_column='days_since_last_activity')
    for seg in recency_result.segments:
        print(f"    {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
else:
    print("  No recency column available")

print("\n📧 ENGAGEMENT SEGMENTS:")
if 'email_engagement_score' in df_features.columns:
    max_score = df_features['email_engagement_score'].max()
    if max_score > 0:
        df_features['engagement_normalized'] = df_features['email_engagement_score'] / max_score
        df_features, eng_result = segmenter.segment_by_engagement(df_features, engagement_column='engagement_normalized')
        for seg in eng_result.segments:
            print(f"    {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
        df_features = df_features.drop(columns=['engagement_normalized'])
else:
    print("  No engagement score available")

if 'customer_segment' in df_features.columns and target and target in df_features.columns:
    segment_retention = df_features.groupby('customer_segment')[target].mean() * 100

    max_rate = segment_retention.max()
    fig = go.Figure(go.Bar(
        x=segment_retention.index, y=segment_retention.values,
        marker_color=['#2ca02c' if r > 70 else '#ffbb00' if r > 50 else '#d62728' for r in segment_retention.values],
        text=[f'{r:.1f}%' for r in segment_retention.values], textposition='outside'))
    fig.update_layout(
        title='Retention Rate by Customer Segment',
        xaxis_title='Segment',
        yaxis_title='Retention Rate (%)',
        yaxis_range=[0, max_rate * 1.15],  # Add 15% headroom for labels
        template='plotly_white',
        height=400,
    )
    display_figure(fig)

segment_cols = [c for c in df_features.columns if 'segment' in c.lower() or 'bucket' in c.lower()]
print(f"\n✓ Created {len(segment_cols)} segmentation features")
======================================================================
CUSTOMER SEGMENTATION
======================================================================

🎯 VALUE-FREQUENCY SEGMENTS:
  Using avgorder × ordfreq
    High_Value_Frequent: 6,677,958 (50.0%)
    High_Value_Infrequent: 0 (0.0%)
    Low_Value_Frequent: 6,676,222 (50.0%)
    Low_Value_Infrequent: 0 (0.0%)

📅 RECENCY SEGMENTS:
    Active_30d: 424,886 (3.2%)
    Recent_90d: 675,304 (5.1%)
    Lapsing_180d: 652,736 (4.9%)
    Dormant_180d+: 11,593,442 (86.8%)

📧 ENGAGEMENT SEGMENTS:
    High_Engagement: 226,114 (1.7%)
    Medium_Engagement: 2,939,048 (22.0%)
    Low_Engagement: 10,189,018 (76.3%)
[Figure: Retention Rate by Customer Segment (bar chart of retention % per segment)]
✓ Created 4 segmentation features
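A median split on both dimensions yields the four quadrants reported above. A minimal sketch (the `CustomerSegmenter`'s exact cut rules may differ):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"avgorder": [10.0, 80.0, 12.0, 95.0],
                   "ordfreq": [1.0, 9.0, 8.0, 2.0]})

value_median = df["avgorder"].median()
freq_median = df["ordfreq"].median()

value_seg = np.where(df["avgorder"] > value_median, "High_Value", "Low_Value")
freq_seg = np.where(df["ordfreq"] > freq_median, "Frequent", "Infrequent")
df["customer_segment"] = [f"{v}_{f}" for v, f in zip(value_seg, freq_seg)]
```

Note that when the two inputs are strongly correlated, customers above one median are almost always above the other, so the off-diagonal quadrants can collapse toward zero, as seen in the 50/0/50/0 split in the output above.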

6.7 Numeric Transformation Opportunities¶

In [12]:
Show/Hide Code
numeric_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]
    and name not in TEMPORAL_METADATA_COLS
]

transform_count = 0
if numeric_cols:
    print("Numeric Transformation Opportunities:")
    print("="*50)

    for col_name in numeric_cols:
        col_info = findings.columns[col_name]
        series = df[col_name].dropna()
        skewness = series.skew()

        print(f"\n{col_name}:")
        print(f"  Skewness: {skewness:.2f}")

        if abs(skewness) > 1:
            # Note: log only corrects positive skew; negatively skewed columns
            # (e.g. days_until_*) need reflection first, or a Yeo-Johnson
            # power transform.
            print("  Recommendation: Apply log transform (highly skewed)")
            registry.add_gold_transformation(
                column=col_name,
                transform="log",
                parameters={"skewness": float(skewness), "reason": "highly_skewed"},
                rationale=f"Log transform for highly skewed distribution (skewness={skewness:.2f})",
                source_notebook="06_feature_opportunities"
            )
            transform_count += 1
        elif abs(skewness) > 0.5:
            print("  Recommendation: Consider sqrt transform (moderately skewed)")
            registry.add_gold_transformation(
                column=col_name,
                transform="sqrt",
                parameters={"skewness": float(skewness), "reason": "moderately_skewed"},
                rationale=f"Sqrt transform for moderately skewed distribution (skewness={skewness:.2f})",
                source_notebook="06_feature_opportunities"
            )
            transform_count += 1
        else:
            print("  Recommendation: Standard scaling sufficient")
            registry.add_gold_scaling(
                column=col_name,
                method="standard",
                rationale=f"Standard scaling for normally distributed column (skewness={skewness:.2f})",
                source_notebook="06_feature_opportunities"
            )
            transform_count += 1

        if col_info.inferred_type == ColumnType.NUMERIC_CONTINUOUS:
            print(f"  Binning: Consider creating bins for {col_name}_binned")

    print(f"\n✅ Persisted {transform_count} transformation recommendations to registry")
Numeric Transformation Opportunities:
==================================================

esent:
  Skewness: -0.05
  Recommendation: Standard scaling sufficient
  Binning: Consider creating bins for esent_binned
eopenrate:
  Skewness: 1.17
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for eopenrate_binned

eclickrate:
  Skewness: 3.90
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for eclickrate_binned
avgorder:
  Skewness: 11.70
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for avgorder_binned

ordfreq:
  Skewness: 10.47
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for ordfreq_binned
created_delta_hours:
  Skewness: -2.86
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for created_delta_hours_binned

created_hour:
  Skewness: 0.00
  Recommendation: Standard scaling sufficient
created_dow:
  Skewness: 0.16
  Recommendation: Standard scaling sufficient

firstorder_delta_hours:
  Skewness: -3.28
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for firstorder_delta_hours_binned
firstorder_hour:
  Skewness: 0.00
  Recommendation: Standard scaling sufficient

firstorder_dow:
  Skewness: 0.26
  Recommendation: Standard scaling sufficient
days_since_created:
  Skewness: 2.86
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for days_since_created_binned

days_until_created:
  Skewness: -2.86
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for days_until_created_binned
log1p_days_since_created:
  Skewness: 0.21
  Recommendation: Standard scaling sufficient
  Binning: Consider creating bins for log1p_days_since_created_binned

is_future_created:
  Skewness: 0.00
  Recommendation: Standard scaling sufficient
days_since_firstorder:
  Skewness: 3.28
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for days_since_firstorder_binned

days_until_firstorder:
  Skewness: -3.28
  Recommendation: Apply log transform (highly skewed)
  Binning: Consider creating bins for days_until_firstorder_binned
log1p_days_since_firstorder:
  Skewness: 0.89
  Recommendation: Consider sqrt transform (moderately skewed)
  Binning: Consider creating bins for log1p_days_since_firstorder_binned

is_future_firstorder:
  Skewness: 0.00
  Recommendation: Standard scaling sufficient

✅ Persisted 19 transformation recommendations to registry
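Applying the persisted recommendations later is straightforward. A sketch for the log and sqrt cases, using `log1p` so zero-valued rates do not produce `-inf` (column names from this dataset, sample values illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"avgorder": [10.0, 50.0, 400.0],
                   "eclickrate": [0.0, 2.0, 15.0]})

# Log transform for highly skewed, non-negative columns (log1p handles zeros).
df["avgorder_log"] = np.log1p(df["avgorder"])
# Sqrt transform for moderately skewed, non-negative columns.
df["eclickrate_sqrt"] = np.sqrt(df["eclickrate"])
```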

6.8 Categorical Encoding Opportunities¶

In [13]:
Show/Hide Code
categorical_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]
    and name not in TEMPORAL_METADATA_COLS
]

encoding_count = 0
if categorical_cols:
    print("Categorical Encoding Recommendations:")
    print("="*50)

    for col_name in categorical_cols:
        col_info = findings.columns[col_name]
        distinct = col_info.universal_metrics.get("distinct_count", 0)

        print(f"\n{col_name}: ({distinct} unique values)")

        if distinct <= 5:
            print("  Recommendation: One-hot encoding")
            registry.add_gold_encoding(
                column=col_name,
                method="onehot",
                rationale=f"One-hot encoding for low cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities"
            )
            encoding_count += 1
        elif distinct <= 20:
            print("  Recommendation: Target encoding or one-hot with frequency threshold")
            registry.add_gold_encoding(
                column=col_name,
                method="target",
                rationale=f"Target encoding for medium cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities"
            )
            encoding_count += 1
        else:
            print("  Recommendation: Target encoding or embedding (high cardinality)")
            registry.add_gold_encoding(
                column=col_name,
                method="target",
                rationale=f"Target encoding for high cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities"
            )
            encoding_count += 1

        if col_info.inferred_type == ColumnType.CATEGORICAL_ORDINAL:
            print("  Note: Consider ordinal encoding to preserve order")

    print(f"\n✅ Persisted {encoding_count} encoding recommendations to registry")
Categorical Encoding Recommendations:
==================================================

city: (4 unique values)
  Recommendation: One-hot encoding

✅ Persisted 1 encoding recommendations to registry
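Both recommended strategies can be sketched in plain pandas. The smoothing constant in the target encoder is an illustrative choice, not the framework's default:

```python
import pandas as pd

df = pd.DataFrame({"city": ["A", "B", "A", "C", "B", "A"],
                   "retained": [1, 0, 1, 0, 1, 0]})

# One-hot encoding: suitable for low-cardinality columns like `city`.
onehot = pd.get_dummies(df["city"], prefix="city")

# Smoothed target encoding: blend each category's target mean with the global
# mean so rare categories do not overfit. smoothing=5 is illustrative.
smoothing = 5
global_mean = df["retained"].mean()
stats = df.groupby("city")["retained"].agg(["mean", "count"])
encoded = (stats["count"] * stats["mean"] + smoothing * global_mean) / (
    stats["count"] + smoothing)
df["city_encoded"] = df["city"].map(encoded)
```

For target encoding on real data, compute the category statistics on the training split only (or out-of-fold) to avoid leaking the target into the features.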

Summary: What We Learned¶

In this notebook, we identified feature engineering opportunities and analyzed data capacity:

Feature Capacity Analysis¶

  1. Events Per Variable (EPV) - Calculated the data's capacity to support features
  2. Effective Features - Identified redundant features due to high correlation
  3. Model Complexity Guidance - Determined appropriate model types based on data size
  4. Segment Capacity - Evaluated whether segmented modeling is viable

Feature Engineering¶

  1. Automated Recommendations - Framework suggested feature opportunities
  2. Time-Based Features - Created tenure, recency, active period metrics
  3. Engagement Scores - Built composite email engagement metrics
  4. Customer Segments - Created value-frequency and recency-based segments
  5. Encoding Strategies - Identified the best-fit encoding for each categorical column

Feature Capacity Key Concepts¶

Metric What It Means Rule of Thumb
EPV ≥ 20 Stable, reliable estimates Conservative, regulatory-grade
EPV = 10-20 Standard practice Use for most applications
EPV = 5-10 Limited capacity Requires strong regularization
EPV < 5 High risk Reduce features or get more data
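EPV itself is a one-line calculation: minority-class events divided by the number of candidate predictors, with the feature budget following from the target EPV. A sketch with hypothetical counts (not the values from this run):

```python
# Events Per Variable: minority-class events per candidate predictor.
n_events = 50_000      # hypothetical minority-class event count
n_features = 19        # candidate predictors
target_epv = 10        # standard-practice threshold from the table above

epv = n_events / n_features
feature_budget = n_events // target_epv  # max features supportable at EPV=10
```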

Key Derived Features Created¶

Feature Formula Business Meaning
tenure_days reference_date - created Customer longevity
days_since_last_order reference_date - lastorder Recency/engagement
email_engagement_score 0.6×openrate + 0.4×clickrate Overall engagement
service_adoption_score paperless + refill + doorstep Service utilization
customer_segment Value × Frequency quadrant Customer type

Next Steps¶

Continue to 07_modeling_readiness.ipynb to:

  • Validate data is ready for modeling
  • Check for data leakage
  • Assess class imbalance
  • Review feature completeness
In [14]:
Show/Hide Code
print("Potential Interaction Features:")
print("="*50)

if len(numeric_cols) >= 2:
    print("\nNumeric Interactions:")
    for i, col1 in enumerate(numeric_cols[:3]):
        for col2 in numeric_cols[i+1:4]:
            print(f"  - {col1}_x_{col2}: Multiplication")
            print(f"  - {col1}_div_{col2}: Division (if {col2} > 0)")

if categorical_cols and numeric_cols:
    print("\nCategorical-Numeric Interactions:")
    for cat_col in categorical_cols[:2]:
        for num_col in numeric_cols[:2]:
            print(f"  - {num_col}_by_{cat_col}_mean: Group mean")
            print(f"  - {num_col}_by_{cat_col}_std: Group std")
Potential Interaction Features:
==================================================

Numeric Interactions:
  - esent_x_eopenrate: Multiplication
  - esent_div_eopenrate: Division (if eopenrate > 0)
  - esent_x_eclickrate: Multiplication
  - esent_div_eclickrate: Division (if eclickrate > 0)
  - esent_x_avgorder: Multiplication
  - esent_div_avgorder: Division (if avgorder > 0)
  - eopenrate_x_eclickrate: Multiplication
  - eopenrate_div_eclickrate: Division (if eclickrate > 0)
  - eopenrate_x_avgorder: Multiplication
  - eopenrate_div_avgorder: Division (if avgorder > 0)
  - eclickrate_x_avgorder: Multiplication
  - eclickrate_div_avgorder: Division (if avgorder > 0)

Categorical-Numeric Interactions:
  - esent_by_city_mean: Group mean
  - esent_by_city_std: Group std
  - eopenrate_by_city_mean: Group mean
  - eopenrate_by_city_std: Group std

6.9 Feature Summary Table¶

In [15]:
Show/Hide Code
feature_summary = []
for rec in feature_recs:
    feature_summary.append({
        "Feature Name": rec.feature_name,
        "Source": rec.source_column,
        "Type": rec.feature_type,
        "Priority": rec.priority
    })

if feature_summary:
    summary_df = pd.DataFrame(feature_summary)
    display(summary_df)
Feature Name Source Type Priority
0 as_of_date_year as_of_date temporal medium
1 as_of_date_month as_of_date temporal medium
2 as_of_date_dayofweek as_of_date temporal medium
3 days_since_as_of_date as_of_date datetime high
4 created_year created temporal medium
5 created_month created temporal medium
6 created_dayofweek created temporal medium
7 days_since_created created datetime high
8 firstorder_year firstorder temporal medium
9 firstorder_month firstorder temporal medium
10 firstorder_dayofweek firstorder temporal medium
11 days_since_firstorder firstorder datetime high
12 lastorder_year lastorder temporal medium
13 lastorder_month lastorder temporal medium
14 lastorder_dayofweek lastorder temporal medium
15 days_since_lastorder lastorder datetime high
16 esent_binned esent numeric low
17 eopenrate_binned eopenrate numeric low
18 eopenrate_log eopenrate numeric high
19 eclickrate_binned eclickrate numeric low
20 eclickrate_log eclickrate numeric high
21 avgorder_binned avgorder numeric low
22 avgorder_log avgorder numeric high
23 ordfreq_binned ordfreq numeric low
24 ordfreq_log ordfreq numeric high
25 favday_sin_cos favday cyclical high
26 city_encoded city categorical high
27 created_delta_hours_binned created_delta_hours numeric low
28 created_delta_hours_log created_delta_hours numeric high
29 created_hour_binned created_hour numeric low
30 created_dow_binned created_dow numeric low
31 firstorder_delta_hours_binned firstorder_delta_hours numeric low
32 firstorder_delta_hours_log firstorder_delta_hours numeric high
33 firstorder_hour_binned firstorder_hour numeric low
34 firstorder_dow_binned firstorder_dow numeric low
35 days_since_created_binned days_since_created numeric low
36 days_since_created_log days_since_created numeric high
37 days_until_created_binned days_until_created numeric low
38 days_until_created_log days_until_created numeric high
39 log1p_days_since_created_binned log1p_days_since_created numeric low
40 is_future_created_binned is_future_created numeric low
41 days_since_firstorder_binned days_since_firstorder numeric low
42 days_since_firstorder_log days_since_firstorder numeric high
43 days_until_firstorder_binned days_until_firstorder numeric low
44 days_until_firstorder_log days_until_firstorder numeric high
45 log1p_days_since_firstorder_binned log1p_days_since_firstorder numeric low
46 is_future_firstorder_binned is_future_firstorder numeric low

Next Steps¶

Continue to 07_modeling_readiness.ipynb to validate data is ready for modeling.

In [16]:
Show/Hide Code
# Save recommendations
registry.save(RECOMMENDATIONS_PATH)

print(f"✅ Saved {len(registry.all_recommendations)} recommendations to {RECOMMENDATIONS_PATH}")
print("\nRecommendations by layer:")
for layer in ["bronze", "silver", "gold"]:
    recs = registry.get_by_layer(layer)
    print(f"  {layer.upper()}: {len(recs)}")

from customer_retention.analysis.notebook_html_exporter import export_notebook_html

export_notebook_html(Path("06_feature_opportunities.ipynb"), EXPERIMENTS_DIR / "docs")
✅ Saved 1166 recommendations to /Users/Vital/python/CustomerRetention/experiments/runs/retail-e7471284/merged/recommendations.yaml

Recommendations by layer:
  BRONZE: 5
  SILVER: 13
  GOLD: 1148
Out[16]:
PosixPath('/Users/Vital/python/CustomerRetention/experiments/docs/06_feature_opportunities.html')