Chapter 6: Feature Opportunities¶
Purpose: Identify and implement feature engineering opportunities to improve model performance.
What you'll learn:
- How to derive time-based features (tenure, recency, active period)
- How to create composite engagement scores
- How to segment customers based on behavior patterns
- How to encode categorical variables effectively
Outputs:
- Derived feature recommendations with code examples
- Composite score formulas (engagement, service adoption)
- Customer segmentation rules
- Categorical encoding strategies
Why Feature Engineering Matters¶
| Feature Type | Business Meaning | Predictive Power |
|---|---|---|
| Tenure | How long the customer has been with us | Loyalty indicator |
| Recency | Days since last order | Engagement/churn signal |
| Engagement Score | Combined email metrics | Overall engagement level |
| Segments | High/Low value × Frequent/Infrequent | Risk stratification |
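The feature types above can be sketched in a few lines of pandas. This is a minimal illustration, not the notebook's pipeline: the column names (`signup_date`, `last_order_date`, `open_rate`, `click_rate`) and the engagement-score weights are hypothetical stand-ins chosen for the example.

```python
import pandas as pd

# Illustrative customer snapshot (column names are hypothetical,
# not taken from the notebook's silver_merged dataset)
df = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "signup_date": pd.to_datetime(["2022-01-15", "2023-06-01", "2021-03-10"]),
    "last_order_date": pd.to_datetime(["2024-01-01", "2023-12-15", "2022-08-20"]),
    "open_rate": [0.60, 0.25, 0.05],
    "click_rate": [0.20, 0.10, 0.01],
})
as_of = pd.Timestamp("2024-02-01")

# Tenure: how long the customer has been with us (loyalty indicator)
df["tenure_days"] = (as_of - df["signup_date"]).dt.days

# Recency: days since last order (engagement/churn signal)
df["recency_days"] = (as_of - df["last_order_date"]).dt.days

# Active period: span between first and last observed activity
df["active_days"] = (df["last_order_date"] - df["signup_date"]).dt.days

# Composite engagement score: weighted blend of email metrics.
# The 0.6/0.4 weights are a modeling choice to tune against the target.
df["engagement_score"] = 0.6 * df["open_rate"] + 0.4 * df["click_rate"]
```

Deriving all features relative to a single `as_of` timestamp keeps them reproducible and avoids leaking information from after the scoring date.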
6.1 Setup¶
from customer_retention.analysis.notebook_progress import track_and_export_previous
track_and_export_previous("06_feature_opportunities.ipynb")
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import yaml
from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine, RecommendationRegistry
from customer_retention.analysis.visualization import ChartBuilder, display_figure
from customer_retention.core.config.column_config import ColumnType
from customer_retention.core.config.experiments import (
EXPERIMENTS_DIR,
FINDINGS_DIR,
)
from customer_retention.stages.features import CustomerSegmenter
from customer_retention.stages.profiling import FeatureCapacityAnalyzer
from pathlib import Path
from customer_retention.analysis.auto_explorer import load_notebook_findings, resolve_target_column
FINDINGS_PATH, _namespace, dataset_name = load_notebook_findings(
"06_feature_opportunities.ipynb", prefer_merged=True
)
print(f"Using: {FINDINGS_PATH}")
RECOMMENDATIONS_PATH = FINDINGS_PATH.replace("_findings.yaml", "_recommendations.yaml")
findings = ExplorationFindings.load(FINDINGS_PATH)
target = resolve_target_column(_namespace, findings)
# Load data
from customer_retention.analysis.auto_explorer.active_dataset_store import load_active_dataset
from customer_retention.core.config.column_config import DatasetGranularity
from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS
if dataset_name is None and _namespace:
from customer_retention.integrations.adapters.factory import get_delta
df = get_delta(force_local=True).read(str(_namespace.silver_merged_path))
data_source = "silver_merged"
RECOMMENDATIONS_PATH = str(_namespace.merged_recommendations_path)
elif "_aggregated" in FINDINGS_PATH and _namespace:
from customer_retention.analysis.auto_explorer.active_dataset_store import load_silver_merged
df = load_silver_merged(_namespace, dataset_name, DatasetGranularity.EVENT_LEVEL)
data_source = f"aggregated:{dataset_name}"
else:
df = load_active_dataset(_namespace, dataset_name)
data_source = dataset_name
charts = ChartBuilder()
if Path(RECOMMENDATIONS_PATH).exists():
with open(RECOMMENDATIONS_PATH, "r") as f:
registry = RecommendationRegistry.from_dict(yaml.safe_load(f))
print(f"Loaded existing recommendations: {len(registry.all_recommendations)} total")
else:
registry = RecommendationRegistry()
print("Initialized new recommendation registry")
# Ensure all layers are initialized (even if loaded from file)
if not registry.bronze:
registry.init_bronze(findings.source_path)
if not registry.silver:
registry.init_silver(findings.entity_column or "entity_id")
if not registry.gold:
registry.init_gold(target or "target")
print(" Initialized gold layer for feature engineering recommendations")
print(f"\nLoaded {len(df):,} rows from: {data_source}")
Using: /Users/Vital/python/CustomerRetention/experiments/runs/email-6301db6c/merged/silver_merged_findings.yaml
Loaded existing recommendations: 674 total

Loaded 2,019,192 rows from: silver_merged
6.2 Automated Feature Recommendations¶
recommender = RecommendationEngine()
feature_recs = recommender.recommend_features(findings)
print(f"Found {len(feature_recs)} feature engineering opportunities:\n")
for rec in feature_recs:
print(f"{rec.feature_name}")
print(f" Source: {rec.source_column}")
print(f" Type: {rec.feature_type}")
print(f" Priority: {rec.priority}")
print(f" Description: {rec.description}")
print()
Found 339 feature engineering opportunities:

as_of_date_year
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract year from as_of_date

as_of_date_month
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract month from as_of_date

as_of_date_dayofweek
  Source: as_of_date
  Type: temporal
  Priority: medium
  Description: Extract day of week from as_of_date

days_since_as_of_date
  Source: as_of_date
  Type: datetime
  Priority: high
  Description: Days since as_of_date until today

event_count_180d_binned
  Source: event_count_180d
  Type: numeric
  Priority: low
  Description: Binned version of event_count_180d

event_count_180d_log
  Source: event_count_180d
  Type: numeric
  Priority: high
  Description: Log transform of event_count_180d (high skewness)

... (output truncated: the remaining recommendations repeat the same pattern across the 180d/365d/all-time windows and lag0–lag3 aggregates — a high-priority log transform and a low-priority binned version for each highly skewed numeric column, plus high-priority one-hot encodings for categorical columns such as lifecycle_quadrant and recency_bucket)
Priority: high Description: Log transform of lag3_send_hour_sum (high skewness) lag3_send_hour_mean_binned Source: lag3_send_hour_mean Type: numeric Priority: low Description: Binned version of lag3_send_hour_mean lag3_send_hour_count_binned Source: lag3_send_hour_count Type: numeric Priority: low Description: Binned version of lag3_send_hour_count lag3_send_hour_count_log Source: lag3_send_hour_count Type: numeric Priority: high Description: Log transform of lag3_send_hour_count (high skewness) lag3_send_hour_max_binned Source: lag3_send_hour_max Type: numeric Priority: low Description: Binned version of lag3_send_hour_max lag3_bounced_mean_binned Source: lag3_bounced_mean Type: numeric Priority: low Description: Binned version of lag3_bounced_mean lag3_bounced_mean_log Source: lag3_bounced_mean Type: numeric Priority: high Description: Log transform of lag3_bounced_mean (high skewness) lag3_bounced_count_binned Source: lag3_bounced_count Type: numeric Priority: low Description: Binned version of lag3_bounced_count lag3_bounced_count_log Source: lag3_bounced_count Type: numeric Priority: high Description: Log transform of lag3_bounced_count (high skewness) lag3_time_to_open_hours_sum_binned Source: lag3_time_to_open_hours_sum Type: numeric Priority: low Description: Binned version of lag3_time_to_open_hours_sum lag3_time_to_open_hours_sum_log Source: lag3_time_to_open_hours_sum Type: numeric Priority: high Description: Log transform of lag3_time_to_open_hours_sum (high skewness) lag3_time_to_open_hours_mean_binned Source: lag3_time_to_open_hours_mean Type: numeric Priority: low Description: Binned version of lag3_time_to_open_hours_mean lag3_time_to_open_hours_mean_log Source: lag3_time_to_open_hours_mean Type: numeric Priority: high Description: Log transform of lag3_time_to_open_hours_mean (high skewness) lag3_time_to_open_hours_count_binned Source: lag3_time_to_open_hours_count Type: numeric Priority: low Description: Binned version of 
lag3_time_to_open_hours_count lag3_time_to_open_hours_count_log Source: lag3_time_to_open_hours_count Type: numeric Priority: high Description: Log transform of lag3_time_to_open_hours_count (high skewness) lag3_time_to_open_hours_max_binned Source: lag3_time_to_open_hours_max Type: numeric Priority: low Description: Binned version of lag3_time_to_open_hours_max lag3_time_to_open_hours_max_log Source: lag3_time_to_open_hours_max Type: numeric Priority: high Description: Log transform of lag3_time_to_open_hours_max (high skewness) opened_velocity_binned Source: opened_velocity Type: numeric Priority: low Description: Binned version of opened_velocity opened_velocity_pct_binned Source: opened_velocity_pct Type: numeric Priority: low Description: Binned version of opened_velocity_pct opened_velocity_pct_log Source: opened_velocity_pct Type: numeric Priority: high Description: Log transform of opened_velocity_pct (high skewness) clicked_velocity_binned Source: clicked_velocity Type: numeric Priority: low Description: Binned version of clicked_velocity send_hour_velocity_binned Source: send_hour_velocity Type: numeric Priority: low Description: Binned version of send_hour_velocity send_hour_velocity_log Source: send_hour_velocity Type: numeric Priority: high Description: Log transform of send_hour_velocity (high skewness) send_hour_velocity_pct_binned Source: send_hour_velocity_pct Type: numeric Priority: low Description: Binned version of send_hour_velocity_pct send_hour_velocity_pct_log Source: send_hour_velocity_pct Type: numeric Priority: high Description: Log transform of send_hour_velocity_pct (high skewness) bounced_velocity_binned Source: bounced_velocity Type: numeric Priority: low Description: Binned version of bounced_velocity bounced_velocity_log Source: bounced_velocity Type: numeric Priority: high Description: Log transform of bounced_velocity (high skewness) time_to_open_hours_velocity_binned Source: time_to_open_hours_velocity Type: numeric Priority: low 
Description: Binned version of time_to_open_hours_velocity time_to_open_hours_velocity_log Source: time_to_open_hours_velocity Type: numeric Priority: high Description: Log transform of time_to_open_hours_velocity (high skewness) time_to_open_hours_velocity_pct_binned Source: time_to_open_hours_velocity_pct Type: numeric Priority: low Description: Binned version of time_to_open_hours_velocity_pct time_to_open_hours_velocity_pct_log Source: time_to_open_hours_velocity_pct Type: numeric Priority: high Description: Log transform of time_to_open_hours_velocity_pct (high skewness) opened_acceleration_binned Source: opened_acceleration Type: numeric Priority: low Description: Binned version of opened_acceleration opened_acceleration_log Source: opened_acceleration Type: numeric Priority: high Description: Log transform of opened_acceleration (high skewness) opened_momentum_binned Source: opened_momentum Type: numeric Priority: low Description: Binned version of opened_momentum opened_momentum_log Source: opened_momentum Type: numeric Priority: high Description: Log transform of opened_momentum (high skewness) clicked_acceleration_binned Source: clicked_acceleration Type: numeric Priority: low Description: Binned version of clicked_acceleration clicked_acceleration_log Source: clicked_acceleration Type: numeric Priority: high Description: Log transform of clicked_acceleration (high skewness) send_hour_acceleration_binned Source: send_hour_acceleration Type: numeric Priority: low Description: Binned version of send_hour_acceleration send_hour_momentum_binned Source: send_hour_momentum Type: numeric Priority: low Description: Binned version of send_hour_momentum send_hour_momentum_log Source: send_hour_momentum Type: numeric Priority: high Description: Log transform of send_hour_momentum (high skewness) bounced_acceleration_binned Source: bounced_acceleration Type: numeric Priority: low Description: Binned version of bounced_acceleration bounced_acceleration_log Source: 
bounced_acceleration Type: numeric Priority: high Description: Log transform of bounced_acceleration (high skewness) time_to_open_hours_acceleration_binned Source: time_to_open_hours_acceleration Type: numeric Priority: low Description: Binned version of time_to_open_hours_acceleration time_to_open_hours_acceleration_log Source: time_to_open_hours_acceleration Type: numeric Priority: high Description: Log transform of time_to_open_hours_acceleration (high skewness) time_to_open_hours_momentum_binned Source: time_to_open_hours_momentum Type: numeric Priority: low Description: Binned version of time_to_open_hours_momentum time_to_open_hours_momentum_log Source: time_to_open_hours_momentum Type: numeric Priority: high Description: Log transform of time_to_open_hours_momentum (high skewness) opened_beginning_binned Source: opened_beginning Type: numeric Priority: low Description: Binned version of opened_beginning opened_beginning_log Source: opened_beginning Type: numeric Priority: high Description: Log transform of opened_beginning (high skewness) opened_end_binned Source: opened_end Type: numeric Priority: low Description: Binned version of opened_end opened_end_log Source: opened_end Type: numeric Priority: high Description: Log transform of opened_end (high skewness) opened_trend_ratio_binned Source: opened_trend_ratio Type: numeric Priority: low Description: Binned version of opened_trend_ratio opened_trend_ratio_log Source: opened_trend_ratio Type: numeric Priority: high Description: Log transform of opened_trend_ratio (high skewness) clicked_beginning_binned Source: clicked_beginning Type: numeric Priority: low Description: Binned version of clicked_beginning clicked_beginning_log Source: clicked_beginning Type: numeric Priority: high Description: Log transform of clicked_beginning (high skewness) clicked_end_binned Source: clicked_end Type: numeric Priority: low Description: Binned version of clicked_end clicked_end_log Source: clicked_end Type: numeric 
Priority: high Description: Log transform of clicked_end (high skewness) clicked_trend_ratio_binned Source: clicked_trend_ratio Type: numeric Priority: low Description: Binned version of clicked_trend_ratio clicked_trend_ratio_log Source: clicked_trend_ratio Type: numeric Priority: high Description: Log transform of clicked_trend_ratio (high skewness) send_hour_beginning_binned Source: send_hour_beginning Type: numeric Priority: low Description: Binned version of send_hour_beginning send_hour_beginning_log Source: send_hour_beginning Type: numeric Priority: high Description: Log transform of send_hour_beginning (high skewness) send_hour_end_binned Source: send_hour_end Type: numeric Priority: low Description: Binned version of send_hour_end send_hour_end_log Source: send_hour_end Type: numeric Priority: high Description: Log transform of send_hour_end (high skewness) send_hour_trend_ratio_binned Source: send_hour_trend_ratio Type: numeric Priority: low Description: Binned version of send_hour_trend_ratio send_hour_trend_ratio_log Source: send_hour_trend_ratio Type: numeric Priority: high Description: Log transform of send_hour_trend_ratio (high skewness) bounced_beginning_binned Source: bounced_beginning Type: numeric Priority: low Description: Binned version of bounced_beginning bounced_beginning_log Source: bounced_beginning Type: numeric Priority: high Description: Log transform of bounced_beginning (high skewness) bounced_end_binned Source: bounced_end Type: numeric Priority: low Description: Binned version of bounced_end bounced_end_log Source: bounced_end Type: numeric Priority: high Description: Log transform of bounced_end (high skewness) bounced_trend_ratio_binned Source: bounced_trend_ratio Type: numeric Priority: low Description: Binned version of bounced_trend_ratio bounced_trend_ratio_log Source: bounced_trend_ratio Type: numeric Priority: high Description: Log transform of bounced_trend_ratio (high skewness) time_to_open_hours_beginning_binned Source: 
time_to_open_hours_beginning Type: numeric Priority: low Description: Binned version of time_to_open_hours_beginning time_to_open_hours_beginning_log Source: time_to_open_hours_beginning Type: numeric Priority: high Description: Log transform of time_to_open_hours_beginning (high skewness) time_to_open_hours_end_binned Source: time_to_open_hours_end Type: numeric Priority: low Description: Binned version of time_to_open_hours_end time_to_open_hours_end_log Source: time_to_open_hours_end Type: numeric Priority: high Description: Log transform of time_to_open_hours_end (high skewness) time_to_open_hours_trend_ratio_binned Source: time_to_open_hours_trend_ratio Type: numeric Priority: low Description: Binned version of time_to_open_hours_trend_ratio time_to_open_hours_trend_ratio_log Source: time_to_open_hours_trend_ratio Type: numeric Priority: high Description: Log transform of time_to_open_hours_trend_ratio (high skewness) days_since_last_event_y_binned Source: days_since_last_event_y Type: numeric Priority: low Description: Binned version of days_since_last_event_y days_since_first_event_y_binned Source: days_since_first_event_y Type: numeric Priority: low Description: Binned version of days_since_first_event_y active_span_days_binned Source: active_span_days Type: numeric Priority: low Description: Binned version of active_span_days recency_ratio_binned Source: recency_ratio Type: numeric Priority: low Description: Binned version of recency_ratio event_frequency_binned Source: event_frequency Type: numeric Priority: low Description: Binned version of event_frequency event_frequency_log Source: event_frequency Type: numeric Priority: high Description: Log transform of event_frequency (high skewness) inter_event_gap_mean_binned Source: inter_event_gap_mean Type: numeric Priority: low Description: Binned version of inter_event_gap_mean inter_event_gap_std_binned Source: inter_event_gap_std Type: numeric Priority: low Description: Binned version of 
inter_event_gap_std inter_event_gap_max_binned Source: inter_event_gap_max Type: numeric Priority: low Description: Binned version of inter_event_gap_max regularity_score_binned Source: regularity_score Type: numeric Priority: low Description: Binned version of regularity_score regularity_score_log Source: regularity_score Type: numeric Priority: high Description: Log transform of regularity_score (high skewness) opened_vs_cohort_mean_binned Source: opened_vs_cohort_mean Type: numeric Priority: low Description: Binned version of opened_vs_cohort_mean opened_vs_cohort_mean_log Source: opened_vs_cohort_mean Type: numeric Priority: high Description: Log transform of opened_vs_cohort_mean (high skewness) opened_vs_cohort_pct_binned Source: opened_vs_cohort_pct Type: numeric Priority: low Description: Binned version of opened_vs_cohort_pct opened_vs_cohort_pct_log Source: opened_vs_cohort_pct Type: numeric Priority: high Description: Log transform of opened_vs_cohort_pct (high skewness) opened_cohort_zscore_binned Source: opened_cohort_zscore Type: numeric Priority: low Description: Binned version of opened_cohort_zscore opened_cohort_zscore_log Source: opened_cohort_zscore Type: numeric Priority: high Description: Log transform of opened_cohort_zscore (high skewness) clicked_vs_cohort_mean_binned Source: clicked_vs_cohort_mean Type: numeric Priority: low Description: Binned version of clicked_vs_cohort_mean clicked_vs_cohort_mean_log Source: clicked_vs_cohort_mean Type: numeric Priority: high Description: Log transform of clicked_vs_cohort_mean (high skewness) clicked_vs_cohort_pct_binned Source: clicked_vs_cohort_pct Type: numeric Priority: low Description: Binned version of clicked_vs_cohort_pct clicked_vs_cohort_pct_log Source: clicked_vs_cohort_pct Type: numeric Priority: high Description: Log transform of clicked_vs_cohort_pct (high skewness) clicked_cohort_zscore_binned Source: clicked_cohort_zscore Type: numeric Priority: low Description: Binned version of 
clicked_cohort_zscore clicked_cohort_zscore_log Source: clicked_cohort_zscore Type: numeric Priority: high Description: Log transform of clicked_cohort_zscore (high skewness) send_hour_vs_cohort_mean_binned Source: send_hour_vs_cohort_mean Type: numeric Priority: low Description: Binned version of send_hour_vs_cohort_mean send_hour_vs_cohort_mean_log Source: send_hour_vs_cohort_mean Type: numeric Priority: high Description: Log transform of send_hour_vs_cohort_mean (high skewness) send_hour_vs_cohort_pct_binned Source: send_hour_vs_cohort_pct Type: numeric Priority: low Description: Binned version of send_hour_vs_cohort_pct send_hour_vs_cohort_pct_log Source: send_hour_vs_cohort_pct Type: numeric Priority: high Description: Log transform of send_hour_vs_cohort_pct (high skewness) send_hour_cohort_zscore_binned Source: send_hour_cohort_zscore Type: numeric Priority: low Description: Binned version of send_hour_cohort_zscore send_hour_cohort_zscore_log Source: send_hour_cohort_zscore Type: numeric Priority: high Description: Log transform of send_hour_cohort_zscore (high skewness) bounced_vs_cohort_mean_binned Source: bounced_vs_cohort_mean Type: numeric Priority: low Description: Binned version of bounced_vs_cohort_mean bounced_vs_cohort_mean_log Source: bounced_vs_cohort_mean Type: numeric Priority: high Description: Log transform of bounced_vs_cohort_mean (high skewness) bounced_vs_cohort_pct_binned Source: bounced_vs_cohort_pct Type: numeric Priority: low Description: Binned version of bounced_vs_cohort_pct bounced_vs_cohort_pct_log Source: bounced_vs_cohort_pct Type: numeric Priority: high Description: Log transform of bounced_vs_cohort_pct (high skewness) bounced_cohort_zscore_binned Source: bounced_cohort_zscore Type: numeric Priority: low Description: Binned version of bounced_cohort_zscore bounced_cohort_zscore_log Source: bounced_cohort_zscore Type: numeric Priority: high Description: Log transform of bounced_cohort_zscore (high skewness) 
time_to_open_hours_vs_cohort_mean_binned Source: time_to_open_hours_vs_cohort_mean Type: numeric Priority: low Description: Binned version of time_to_open_hours_vs_cohort_mean time_to_open_hours_vs_cohort_mean_log Source: time_to_open_hours_vs_cohort_mean Type: numeric Priority: high Description: Log transform of time_to_open_hours_vs_cohort_mean (high skewness) time_to_open_hours_vs_cohort_pct_binned Source: time_to_open_hours_vs_cohort_pct Type: numeric Priority: low Description: Binned version of time_to_open_hours_vs_cohort_pct time_to_open_hours_vs_cohort_pct_log Source: time_to_open_hours_vs_cohort_pct Type: numeric Priority: high Description: Log transform of time_to_open_hours_vs_cohort_pct (high skewness) time_to_open_hours_cohort_zscore_binned Source: time_to_open_hours_cohort_zscore Type: numeric Priority: low Description: Binned version of time_to_open_hours_cohort_zscore time_to_open_hours_cohort_zscore_log Source: time_to_open_hours_cohort_zscore Type: numeric Priority: high Description: Log transform of time_to_open_hours_cohort_zscore (high skewness)
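The recommendations follow a single template, so materializing them is mechanical. A minimal sketch, assuming a pandas DataFrame `df` containing one of the named source columns (the toy data below is illustrative, not from the real dataset):

```python
import numpy as np
import pandas as pd

# Toy frame standing in for the real dataset (values are synthetic).
df = pd.DataFrame({"lag1_bounced_count": np.random.default_rng(0).poisson(0.3, 1000)})

# High-priority recommendation: log transform for heavily skewed counts.
# log1p handles the zeros that dominate count features.
df["lag1_bounced_count_log"] = np.log1p(df["lag1_bounced_count"])

# Low-priority recommendation: quantile binning into coarse ordinal buckets.
# duplicates="drop" tolerates tied quantile edges when most values are zero.
df["lag1_bounced_count_binned"] = pd.qcut(
    df["lag1_bounced_count"], q=4, labels=False, duplicates="drop"
)
```

The same two-line recipe applies to every `_log`/`_binned` pair in the listing above.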
6.3 Feature Capacity Analysis¶
📖 Understanding Feature-to-Data Ratios
Before creating new features, it's critical to understand how many features your data can reliably support. This analysis uses the Events Per Variable (EPV) principle:
| EPV Level | Risk Level | Recommendations |
|---|---|---|
| EPV ≥ 20 | Low risk | Stable coefficients, reliable inference |
| EPV = 10-20 | Moderate | Standard practice, consider regularization |
| EPV = 5-10 | Elevated | Strong regularization required (L1/Lasso) |
| EPV < 5 | High risk | Reduce features or collect more data |
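The thresholds in the table reduce to a one-line ratio. A minimal sketch (the function names are illustrative, not part of the `customer_retention` package):

```python
def events_per_variable(n_minority_events: int, n_features: int) -> float:
    """EPV = minority-class events divided by candidate feature count."""
    return n_minority_events / n_features

def epv_risk(epv: float) -> str:
    """Map an EPV value onto the risk bands from the table above."""
    if epv >= 20:
        return "low"
    if epv >= 10:
        return "moderate"
    if epv >= 5:
        return "elevated"
    return "high"

# Figures matching the analysis later in this chapter: ~899,708 minority
# events over 179 numeric features gives EPV ≈ 5026 — deep in "low risk".
print(epv_risk(events_per_variable(899_708, 179)))  # low
```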
Key Assumptions:
- Minority class drives capacity: For classification, the smaller class limits feature count
- Correlated features are redundant: Highly correlated features (r > 0.8) count as ~1 effective feature
- Model type matters: Tree models are more flexible than linear models
- Regularization helps: L1/L2 penalties allow more features with less data
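The "correlated features are redundant" assumption can be sketched as a greedy pairwise check at r > 0.8 (column names and toy data are illustrative; the real analyzer encapsulates this):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
base = rng.normal(size=500)
df = pd.DataFrame({
    "opened_count": base,
    "opened_sum": base * 2 + rng.normal(scale=0.05, size=500),  # near-duplicate
    "recency_days": rng.normal(size=500),                        # independent
})

corr = df.corr().abs()
# Greedily drop any feature that correlates > 0.8 with one we already kept;
# what survives approximates the "effective" independent feature count.
kept: list[str] = []
for col in corr.columns:
    if all(corr.loc[col, k] <= 0.8 for k in kept):
        kept.append(col)

print(kept)  # ['opened_count', 'recency_days']
```

Greedy dropping is order-dependent; the point is only that a cluster of near-duplicates contributes roughly one effective feature.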
📊 What This Analysis Provides:
- Recommended feature counts (conservative/moderate/aggressive)
- Effective feature count after removing redundancy
- Model complexity guidance (linear vs tree-based)
- Segment-specific capacity for multi-model strategies
Show/Hide Code
# Feature Capacity Analysis
capacity_analyzer = FeatureCapacityAnalyzer()

# Get all potential feature columns (excluding target and identifiers)
feature_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [
        ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE,
        ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL,
        ColumnType.BINARY,
    ]
    and name != target
    and name not in TEMPORAL_METADATA_COLS
]

print("=" * 80)
print("FEATURE CAPACITY ANALYSIS")
print("=" * 80)

if target:
    # Analyze capacity with current numeric features
    numeric_features = [
        name for name, col in findings.columns.items()
        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]
        and name != target
    ]
    capacity_result = capacity_analyzer.analyze(
        df,
        feature_cols=numeric_features,
        target_col=target,
    )

    print("\n📊 DATA SUMMARY:")
    print(f"   Total samples: {capacity_result.total_samples:,}")
    print(f"   Minority class samples: {capacity_result.minority_class_samples:,}")
    print(f"   Minority class rate: {capacity_result.minority_class_samples / capacity_result.total_samples:.1%}")
    print(f"   Current numeric features: {capacity_result.total_features}")

    print("\n📈 FEATURE CAPACITY METRICS:")
    print(f"   Events Per Variable (EPV): {capacity_result.events_per_variable:.1f}")
    print(f"   Samples Per Feature: {capacity_result.samples_per_feature:.1f}")
    print(f"   Capacity Status: {capacity_result.capacity_status.upper()}")

    # Capacity status visualization
    status_colors = {"adequate": "#2ecc71", "limited": "#f39c12", "inadequate": "#e74c3c"}
    status_color = status_colors.get(capacity_result.capacity_status, "#95a5a6")

    print("\n🎯 RECOMMENDED FEATURE COUNTS:")
    print(f"   Conservative (EPV=20): {capacity_result.recommended_features_conservative} features")
    print(f"   Moderate (EPV=10): {capacity_result.recommended_features_moderate} features")
    print(f"   Aggressive (EPV=5): {capacity_result.recommended_features_aggressive} features")

    # Effective features analysis
    if capacity_result.effective_features_result:
        eff = capacity_result.effective_features_result
        print("\n🔍 EFFECTIVE FEATURES (accounting for correlation):")
        print(f"   Total features analyzed: {eff.total_count}")
        print(f"   Effective independent features: {eff.effective_count:.1f}")
        print(f"   Redundant features identified: {len(eff.redundant_features)}")
        if eff.redundant_features:
            print("\n   ⚠️ Redundant features (highly correlated):")
            for feat in eff.redundant_features[:5]:
                print(f"      • {feat}")
        if eff.feature_clusters:
            print(f"\n   📦 Correlated feature clusters ({len(eff.feature_clusters)}):")
            for i, cluster in enumerate(eff.feature_clusters[:3]):
                print(f"      Cluster {i + 1}: {', '.join(cluster[:4])}")
                if len(cluster) > 4:
                    print(f"         ... and {len(cluster) - 4} more")

    # Persist feature capacity to registry
    registry.add_bronze_feature_capacity(
        epv=capacity_result.events_per_variable,
        capacity_status=capacity_result.capacity_status,
        recommended_features=capacity_result.recommended_features_moderate,
        current_features=capacity_result.total_features,
        rationale=f"EPV={capacity_result.events_per_variable:.1f}, status={capacity_result.capacity_status}",
        source_notebook="06_feature_opportunities",
    )
    print("\n✅ Persisted feature capacity recommendation to registry")

    # Store capacity info in findings
    findings.metadata["feature_capacity"] = capacity_result.to_dict()
else:
    print("\n⚠️ No target column detected. Capacity analysis requires a target variable.")
================================================================================
FEATURE CAPACITY ANALYSIS
================================================================================
📊 DATA SUMMARY:
Total samples: 2,019,192
Minority class samples: 899,708
Minority class rate: 44.6%
Current numeric features: 179
📈 FEATURE CAPACITY METRICS:
Events Per Variable (EPV): 5026.3
Samples Per Feature: 11280.4
Capacity Status: ADEQUATE
🎯 RECOMMENDED FEATURE COUNTS:
Conservative (EPV=20): 44985 features
Moderate (EPV=10): 89970 features
Aggressive (EPV=5): 179941 features
🔍 EFFECTIVE FEATURES (accounting for correlation):
Total features analyzed: 179
Effective independent features: 78.0
Redundant features identified: 101
⚠️ Redundant features (highly correlated):
• time_to_open_hours_count_180d
• opened_vs_cohort_mean
• time_to_open_hours_vs_cohort_pct
• days_since_first_event_y
• clicked_cohort_zscore
📦 Correlated feature clusters (38):
Cluster 1: event_count_180d, event_count_365d, opened_count_180d, clicked_count_180d
... and 8 more
Cluster 2: event_count_all_time, opened_sum_all_time, opened_count_all_time, clicked_count_all_time
... and 6 more
Cluster 3: opened_sum_180d, opened_mean_180d, time_to_open_hours_count_180d
✅ Persisted feature capacity recommendation to registry
6.3.1 Model Complexity Guidance¶
Based on your data capacity, here's guidance on model complexity and feature limits.
Show/Hide Code
# Model Complexity Guidance
if target and 'capacity_result' in dir():
    guidance = capacity_result.complexity_guidance
    print("=" * 70)
    print("MODEL COMPLEXITY GUIDANCE")
    print("=" * 70)

    # Create visualization of feature limits by model type
    model_types = ["Linear\n(no regularization)", "Regularized\n(L1/L2)", "Tree-based\n(RF/XGBoost)"]
    max_features = [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree]
    current_features = capacity_result.total_features
    colors = ['#e74c3c' if m < current_features else '#2ecc71' for m in max_features]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=model_types,
        y=max_features,
        marker_color=colors,
        text=[f"{m}" for m in max_features],
        textposition='outside',
        name='Max Features',
    ))
    # Add horizontal line for current feature count
    fig.add_hline(
        y=current_features,
        line_dash="dash",
        line_color="#3498db",
        annotation_text=f"Current: {current_features}",
        annotation_position="right",
    )
    # Calculate y-axis range to fit labels
    max_val = max(max_features)
    fig.update_layout(
        title="Maximum Recommended Features by Model Type",
        xaxis_title="Model Type",
        yaxis_title="Max Features",
        yaxis_range=[0, max_val * 1.15],  # Add 15% headroom for labels
        template='plotly_white',
        height=400,
        showlegend=False,
    )
    display_figure(fig)

    print(f"\n🎯 RECOMMENDED MODEL TYPE: {guidance.recommended_model_type.replace('_', ' ').title()}")
    print("\n📋 MODEL-SPECIFIC RECOMMENDATIONS:")
    for rec in guidance.model_recommendations:
        print(f"   • {rec}")
    print("\n💡 GENERAL GUIDANCE:")
    for rec in guidance.recommendations:
        print(f"   {rec}")

    # Summary table
    print("\n" + "-" * 70)
    print("FEATURE BUDGET SUMMARY:")
    print("-" * 70)
    summary_data = {
        "Model Type": ["Linear (no regularization)", "Regularized (L1/L2)", "Tree-based"],
        "Max Features": [guidance.max_features_linear, guidance.max_features_regularized, guidance.max_features_tree],
        "Current": [current_features] * 3,
        "Status": [
            "✅ OK" if guidance.max_features_linear >= current_features else "⚠️ Reduce",
            "✅ OK" if guidance.max_features_regularized >= current_features else "⚠️ Reduce",
            "✅ OK" if guidance.max_features_tree >= current_features else "⚠️ Reduce",
        ],
    }
    display(pd.DataFrame(summary_data))

    # Persist model type recommendation to registry
    registry.add_bronze_model_type(
        model_type=guidance.recommended_model_type,
        max_features_linear=guidance.max_features_linear,
        max_features_regularized=guidance.max_features_regularized,
        max_features_tree=guidance.max_features_tree,
        rationale=f"Recommended: {guidance.recommended_model_type}",
        source_notebook="06_feature_opportunities",
    )
    print(f"\n✅ Persisted model type recommendation to registry: {guidance.recommended_model_type}")
======================================================================
MODEL COMPLEXITY GUIDANCE
======================================================================
🎯 RECOMMENDED MODEL TYPE: Linear

📋 MODEL-SPECIFIC RECOMMENDATIONS:
• Adequate data for standard logistic regression
• Can use all features without regularization
• Consider tree models for comparison

💡 GENERAL GUIDANCE:
Adequate: EPV=5026.3. Sufficient data for robust modeling.

----------------------------------------------------------------------
FEATURE BUDGET SUMMARY:
----------------------------------------------------------------------
| | Model Type | Max Features | Current | Status |
|---|---|---|---|---|
| 0 | Linear (no regularization) | 89970 | 179 | ✅ OK |
| 1 | Regularized (L1/L2) | 179941 | 179 | ✅ OK |
| 2 | Tree-based | 67306 | 179 | ✅ OK |
✅ Persisted model type recommendation to registry: linear
6.3.2 Segment-Specific Capacity (for Multi-Model Strategy)¶
When considering separate models per customer segment, each segment must have sufficient data to support the feature set. This analysis shows whether segmented modeling is viable.
📖 Single Model vs Segment Models:
| Approach | When to Use | Pros | Cons |
|---|---|---|---|
| Single Model | Small data, uniform segments | More data per model, simpler | May miss segment-specific patterns |
| Segment Models | Large data, distinct segments | Tailored patterns | Need sufficient data per segment |
| Hybrid | Mixed segment sizes | Best of both | More complex to maintain |
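The decision rule in the table amounts to a per-segment EPV check. A hedged sketch (the function and toy data are illustrative; `FeatureCapacityAnalyzer.analyze_segment_capacity` encapsulates the real logic):

```python
import pandas as pd

def segment_strategy(df: pd.DataFrame, target: str, segment: str,
                     n_features: int, min_epv: float = 10.0) -> str:
    """Recommend single vs per-segment models from minority events per segment."""
    viable = []
    for name, grp in df.groupby(segment):
        minority_events = grp[target].value_counts().min()
        if minority_events / n_features >= min_epv:
            viable.append(name)
    if len(viable) == df[segment].nunique():
        return "segment_models"   # every segment can support the feature set
    if viable:
        return "hybrid"           # model the viable segments, pool the rest
    return "single_model"         # no segment has enough minority events

# Segment A is balanced (50 minority events); segment B has only 1.
toy = pd.DataFrame({
    "churned": [0, 1] * 50 + [0] * 19 + [1],
    "region":  ["A"] * 100 + ["B"] * 20,
})
print(segment_strategy(toy, "churned", "region", n_features=3))  # hybrid
```

With 3 features at EPV=10, a segment needs at least 30 minority-class events; A qualifies and B does not, hence the hybrid recommendation.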
Show/Hide Code
# Segment Capacity Analysis
categorical_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]
    and name not in TEMPORAL_METADATA_COLS
]

print("=" * 70)
print("SEGMENT CAPACITY ANALYSIS")
print("=" * 70)

if target and categorical_cols and 'numeric_features' in dir():
    # Analyze the first categorical column as potential segment
    segment_col = categorical_cols[0]
    print(f"\n📊 Analyzing segments by: {segment_col}")
    print(f"   Features to evaluate: {len(numeric_features)}")

    segment_result = capacity_analyzer.analyze_segment_capacity(
        df,
        feature_cols=numeric_features,
        target_col=target,
        segment_col=segment_col,
    )
    print(f"\n🎯 RECOMMENDED STRATEGY: {segment_result.recommended_strategy.replace('_', ' ').title()}")
    print(f"   Reason: {segment_result.strategy_reason}")

    # Segment details table
    segment_data = []
    for seg_name, cap in segment_result.segment_capacities.items():
        segment_data.append({
            "Segment": seg_name,
            "Samples": cap.total_samples,
            "Minority Events": cap.minority_class_samples,
            "EPV": f"{cap.events_per_variable:.1f}",
            "Max Features (EPV=10)": cap.recommended_features_moderate,
            "Status": cap.capacity_status.title(),
        })
    segment_df = pd.DataFrame(segment_data)
    segment_df = segment_df.sort_values("Samples", ascending=False)
    display(segment_df)

    # Visualization
    fig = go.Figure()
    max_events = 0
    for seg_name, cap in segment_result.segment_capacities.items():
        color = "#2ecc71" if cap.capacity_status == "adequate" else "#f39c12" if cap.capacity_status == "limited" else "#e74c3c"
        fig.add_trace(go.Bar(
            name=seg_name,
            x=[seg_name],
            y=[cap.minority_class_samples],
            marker_color=color,
            text=[f"EPV={cap.events_per_variable:.1f}"],
            textposition='outside',
        ))
        max_events = max(max_events, cap.minority_class_samples)

    # Add threshold line
    threshold_events = len(numeric_features) * 10  # EPV=10 threshold
    fig.add_hline(
        y=threshold_events,
        line_dash="dash",
        line_color="#3498db",
        annotation_text=f"Min events for {len(numeric_features)} features (EPV=10)",
        annotation_position="right",
    )
    # Calculate y-axis range to fit labels
    y_max = max(max_events, threshold_events)
    fig.update_layout(
        title=f"Minority Class Events by Segment ({segment_col})",
        xaxis_title="Segment",
        yaxis_title="Minority Class Events",
        yaxis_range=[0, y_max * 1.15],  # Add 15% headroom for labels
        template='plotly_white',
        height=400,
        showlegend=False,
    )
    display_figure(fig)

    print("\n📋 SEGMENT RECOMMENDATIONS:")
    for rec in segment_result.recommendations:
        print(f"   {rec}")
    if segment_result.viable_segments:
        print(f"\n   ✅ Viable for separate models: {', '.join(segment_result.viable_segments)}")
    if segment_result.insufficient_segments:
        print(f"   ⚠️ Insufficient data: {', '.join(segment_result.insufficient_segments)}")

    # Store in findings
    findings.metadata["segment_capacity"] = segment_result.to_dict()
else:
print("\n⚠️ No categorical columns available for segment analysis.")
print(" Segment capacity analysis requires at least one categorical column.")
======================================================================
SEGMENT CAPACITY ANALYSIS
======================================================================

📊 Analyzing segments by: lifecycle_quadrant
   Features to evaluate: 179

🎯 RECOMMENDED STRATEGY: Segment Models
   Reason: All segments have adequate data for separate models.

| Segment | Samples | Minority Events | EPV | Max Features (EPV=10) | Status |
|---|---|---|---|---|---|
| Occasional & Loyal | 679932 | 51712 | 288.9 | 5171 | Adequate |
| Intense & Brief | 678316 | 117564 | 656.8 | 11756 | Adequate |
| Steady & Loyal | 331280 | 34340 | 191.8 | 3434 | Adequate |
| One-shot | 329664 | 76760 | 428.8 | 7676 | Adequate |

📋 SEGMENT RECOMMENDATIONS:
   ✅ All 4 segments have sufficient data for independent models. Consider: Separate models may capture segment-specific patterns better.

   ✅ Viable for separate models: Intense & Brief, Occasional & Loyal, Steady & Loyal, One-shot
6.3.3 Feature Capacity Action Items¶
Based on the analysis above, here are the key considerations for feature engineering:
Show/Hide Code
# Feature Capacity Action Items Summary
if target and 'capacity_result' in dir():
print("=" * 70)
print("FEATURE CAPACITY ACTION ITEMS")
print("=" * 70)
print("\n📋 BASED ON YOUR DATA CAPACITY:")
# Action items based on capacity status
if capacity_result.capacity_status == "adequate":
print("\n✅ ADEQUATE CAPACITY - You have room to add features")
print(f" • Current features: {capacity_result.total_features}")
print(f" • Can add up to: {capacity_result.recommended_features_moderate - capacity_result.total_features} more features (EPV=10)")
print(" • Consider: Creating derived features from datetime and categorical columns")
elif capacity_result.capacity_status == "limited":
print("\n⚠️ LIMITED CAPACITY - Be selective with new features")
print(f" • Current features: {capacity_result.total_features}")
print(f" • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)")
print(f" • Action: Remove {max(0, capacity_result.total_features - capacity_result.recommended_features_moderate)} redundant features before adding new ones")
print(" • Consider: Using regularization (L1/Lasso) if keeping all features")
else:
print("\n🔴 INADEQUATE CAPACITY - Reduce features or get more data")
print(f" • Current features: {capacity_result.total_features}")
print(f" • Recommended max: {capacity_result.recommended_features_moderate} features (EPV=10)")
print(f" • CRITICAL: Reduce to {capacity_result.recommended_features_conservative} features for stable estimates")
print(" • Options: (1) Feature selection, (2) PCA, (3) Collect more data")
# Redundancy recommendations
if capacity_result.effective_features_result and capacity_result.effective_features_result.redundant_features:
redundant = capacity_result.effective_features_result.redundant_features
print("\n🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:")
print(" These features are highly correlated with others and add little new information:")
for feat in redundant[:5]:
print(f" • {feat}")
if len(redundant) > 5:
print(f" ... and {len(redundant) - 5} more")
# New feature budget
print("\n💰 FEATURE BUDGET FOR NEW FEATURES:")
remaining_budget = capacity_result.recommended_features_moderate - capacity_result.total_features
if remaining_budget > 0:
print(f" You can safely add {remaining_budget} new features")
print(" Prioritize:")
print(" • Recency features (days_since_last_activity)")
print(" • Tenure features (days_since_created)")
print(" • Engagement composites (email_engagement_score)")
else:
print(f" ⚠️ At or over capacity. Remove {-remaining_budget} features before adding new ones.")
# Model selection summary
print("\n🎯 RECOMMENDED MODELING APPROACH:")
if capacity_result.complexity_guidance:
print(f" Model type: {capacity_result.complexity_guidance.recommended_model_type.replace('_', ' ').title()}")
if "regularized" in capacity_result.complexity_guidance.recommended_model_type:
print(" → Use Lasso (L1) for automatic feature selection")
print(" → Use Ridge (L2) if you want to keep all features")
elif "tree" in capacity_result.complexity_guidance.recommended_model_type:
print(" → Random Forest or XGBoost recommended")
print(" → Trees handle correlated features naturally")
print("\n" + "=" * 70)
======================================================================
FEATURE CAPACITY ACTION ITEMS
======================================================================

📋 BASED ON YOUR DATA CAPACITY:

✅ ADEQUATE CAPACITY - You have room to add features
   • Current features: 179
   • Can add up to: 89791 more features (EPV=10)
   • Consider: Creating derived features from datetime and categorical columns

🔄 REDUNDANT FEATURES TO CONSIDER REMOVING:
   These features are highly correlated with others and add little new information:
   • time_to_open_hours_count_180d
   • opened_vs_cohort_mean
   • time_to_open_hours_vs_cohort_pct
   • days_since_first_event_y
   • clicked_cohort_zscore
   ... and 96 more

💰 FEATURE BUDGET FOR NEW FEATURES:
   You can safely add 89791 new features
   Prioritize:
   • Recency features (days_since_last_activity)
   • Tenure features (days_since_created)
   • Engagement composites (email_engagement_score)

🎯 RECOMMENDED MODELING APPROACH:
   Model type: Linear

======================================================================
6.3.4 Feature Availability Issues¶
Features with tracking changes (new systems, retired systems) require special handling before modeling.
Show/Hide Code
# Feature Availability Analysis
from customer_retention.stages.features.feature_selector import FeatureSelector
print("=" * 70)
print("FEATURE AVAILABILITY ANALYSIS")
print("=" * 70)
unavailable_features = []
if findings.has_availability_issues:
selector = FeatureSelector(target_column=target)
availability_recs = selector.get_availability_recommendations(findings.feature_availability)
unavailable_features = [rec.column for rec in availability_recs]
print(f"\n⚠️ {len(availability_recs)} feature(s) have tracking changes:\n")
for rec in availability_recs:
print(f"📌 {rec.column}")
print(f" Issue: {rec.issue_type} | Coverage: {rec.coverage_pct:.0f}%")
print(f" Available: {rec.first_valid_date} → {rec.last_valid_date}")
print("\n Remediation options:")
for opt in rec.options:
marker = "→" if opt.get("recommended") else " "
print(f" {marker} [{opt['type']}] {opt['description']}")
print()
print("-" * 70)
print("RECOMMENDED ACTION: Remove unavailable features before modeling")
print("-" * 70)
print(f"\nFeatures to exclude: {', '.join(unavailable_features)}")
print("\nAlternative approaches (require additional implementation):")
print(" • segment_by_cohort: Train separate models for different time periods")
print(" • add_indicator: Create availability flags, impute missing values")
print(" • filter_window: Restrict training data to feature's available period")
findings.metadata["unavailable_features"] = unavailable_features
findings.metadata["availability_action"] = "exclude"
else:
print("\n✅ All features have full temporal coverage - no availability issues.")
======================================================================
FEATURE AVAILABILITY ANALYSIS
======================================================================

✅ All features have full temporal coverage - no availability issues.
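No issues were found in this run, but when a feature does have partial temporal coverage, the `add_indicator` remediation listed above can be sketched as follows (the column name is hypothetical):

```python
import numpy as np
import pandas as pd

# "add_indicator" remediation sketch: flag whether the feature was
# tracked at all, then impute the untracked span so models can still
# use the rows. `new_metric` is a hypothetical partially-tracked column.
df = pd.DataFrame({"new_metric": [np.nan, np.nan, 3.0, 4.0, 5.0]})

# 1 where the value was actually observed, 0 where tracking was absent.
df["new_metric_available"] = df["new_metric"].notna().astype(int)

# Impute the untracked span with the median of the observed values.
df["new_metric"] = df["new_metric"].fillna(df["new_metric"].median())
print(df)
```

The indicator lets the model distinguish "imputed because untracked" from genuinely observed values, which matters when tracking start dates correlate with customer cohorts.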
6.4 Datetime Feature Opportunities¶
Show/Hide Code
datetime_cols = [
name for name, col in findings.columns.items()
if col.inferred_type == ColumnType.DATETIME
]
if datetime_cols:
print("Datetime Feature Opportunities:")
print("="*50)
for col in datetime_cols:
print(f"\n{col}:")
print(f" - {col}_year: Extract year")
print(f" - {col}_month: Extract month")
print(f" - {col}_day: Extract day of month")
print(f" - {col}_dayofweek: Extract day of week (0-6)")
print(f" - {col}_is_weekend: Is weekend flag")
print(f" - days_since_{col}: Days since date")
else:
print("No datetime columns found.")
Datetime Feature Opportunities:
==================================================

as_of_date:
 - as_of_date_year: Extract year
 - as_of_date_month: Extract month
 - as_of_date_day: Extract day of month
 - as_of_date_dayofweek: Extract day of week (0-6)
 - as_of_date_is_weekend: Is weekend flag
 - days_since_as_of_date: Days since date
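The listed derivations map directly onto the pandas `.dt` accessor. A minimal sketch on a hypothetical `created_at` column:

```python
import pandas as pd

# Datetime feature extraction via the .dt accessor.
# `created_at` and the reference date are illustrative.
df = pd.DataFrame({"created_at": pd.to_datetime(["2023-01-02", "2023-06-17"])})
ref = pd.Timestamp("2023-06-19")

df["created_at_year"] = df["created_at"].dt.year
df["created_at_month"] = df["created_at"].dt.month
df["created_at_day"] = df["created_at"].dt.day
df["created_at_dayofweek"] = df["created_at"].dt.dayofweek      # 0 = Monday
df["created_at_is_weekend"] = df["created_at"].dt.dayofweek >= 5
df["days_since_created_at"] = (ref - df["created_at"]).dt.days
print(df)
```

Note that `dayofweek` is 0-indexed from Monday, so the weekend flag is `>= 5` (Saturday/Sunday).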
6.5 Business-Driven Derived Features¶
These features are based on domain knowledge from the reference analysis (my_take Phase 1).
📖 Key Derived Features:
- Tenure Days: Days from account creation to analysis date
- Days Since Last Order: Recency indicator (critical for churn)
- Active Period Days: Duration of customer activity
- Email Engagement Score: Composite of open rate and click rate
- Click-to-Open Ratio: Quality of email engagement
- Service Adoption Score: Sum of service flags (paperless, refill, doorstep)
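The two engagement composites can be sketched directly in pandas. The 0.6/0.4 weights match the engagement-score formula this chapter persists to the registry; the column names here are hypothetical:

```python
import numpy as np
import pandas as pd

# Engagement composites: weighted score plus a zero-safe ratio.
df = pd.DataFrame({
    "open_rate": [0.5, 0.0, 0.25],
    "click_rate": [0.2, 0.0, 0.25],
})

# Weighted composite: opens count a bit more than clicks (0.6 / 0.4).
df["email_engagement_score"] = 0.6 * df["open_rate"] + 0.4 * df["click_rate"]

# Click-to-open ratio; guard against division by zero for no-open customers.
df["click_to_open_rate"] = np.where(
    df["open_rate"] > 0, df["click_rate"] / df["open_rate"], 0.0
)
print(df)
```

The `np.where` guard is what the cell below also uses: a customer with zero opens gets a ratio of 0 rather than NaN or infinity.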
Show/Hide Code
print("=" * 70)
print("CREATING DERIVED FEATURES")
print("=" * 70)
segmenter = CustomerSegmenter()
df_features = df.copy()
datetime_cols = [name for name, col in findings.columns.items()
if col.inferred_type == ColumnType.DATETIME
and name not in TEMPORAL_METADATA_COLS]
binary_cols = [name for name, col in findings.columns.items()
if col.inferred_type == ColumnType.BINARY
and name not in TEMPORAL_METADATA_COLS]
numeric_cols = [name for name, col in findings.columns.items()
if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]]
from customer_retention.core.compat import as_tz_naive
for col in datetime_cols:
df_features[col] = as_tz_naive(pd.to_datetime(df_features[col], errors='coerce', format='mixed'))
reference_date = pd.Timestamp.now()
if datetime_cols:
last_dates = [df_features[col].max() for col in datetime_cols if df_features[col].notna().any()]
if last_dates:
reference_date = max(last_dates)
print(f"\nReference date: {reference_date}")
print("\n📅 TIME-BASED FEATURES:")
created_cols = [c for c in datetime_cols if 'creat' in c.lower() or 'signup' in c.lower() or 'register' in c.lower()]
if created_cols:
created_col = created_cols[0]
df_features = segmenter.create_tenure_features(df_features, created_column=created_col, reference_date=reference_date)
print(f" ✓ tenure_days from {created_col}")
registry.add_silver_derived(
column="tenure_days",
expression=f"(reference_date - {created_col}).days",
feature_type="tenure",
rationale=f"Customer tenure in days from {created_col}",
source_notebook="06_feature_opportunities"
)
activity_cols = [c for c in datetime_cols if 'last' in c.lower() or 'recent' in c.lower()]
if activity_cols:
activity_col = activity_cols[0]
df_features = segmenter.create_recency_features(df_features, last_activity_column=activity_col,
reference_date=reference_date, output_column='days_since_last_activity')
print(f" ✓ days_since_last_activity from {activity_col}")
registry.add_silver_derived(
column="days_since_last_activity",
expression=f"(reference_date - {activity_col}).days",
feature_type="recency",
rationale=f"Days since last activity from {activity_col}",
source_notebook="06_feature_opportunities"
)
print("\n📧 ENGAGEMENT FEATURES:")
rate_cols = [c for c in numeric_cols if 'rate' in c.lower() or 'pct' in c.lower() or 'percent' in c.lower()]
open_rate_cols = [c for c in rate_cols if 'open' in c.lower()]
click_rate_cols = [c for c in rate_cols if 'click' in c.lower()]
if open_rate_cols and click_rate_cols:
open_col, click_col = open_rate_cols[0], click_rate_cols[0]
df_features = segmenter.create_engagement_score(df_features, open_rate_column=open_col,
click_rate_column=click_col, output_column='email_engagement_score')
print(f" ✓ email_engagement_score from {open_col}, {click_col}")
registry.add_silver_derived(
column="email_engagement_score",
expression=f"0.6 * {open_col} + 0.4 * {click_col}",
feature_type="composite",
rationale=f"Weighted engagement score from {open_col} and {click_col}",
source_notebook="06_feature_opportunities"
)
df_features['click_to_open_rate'] = np.where(df_features[open_col] > 0, df_features[click_col] / df_features[open_col], 0)
print(" ✓ click_to_open_rate")
registry.add_silver_ratio(
column="click_to_open_rate",
numerator=click_col,
denominator=open_col,
rationale=f"Click-to-open ratio: {click_col} / {open_col}",
source_notebook="06_feature_opportunities"
)
print("\n🔧 SERVICE ADOPTION:")
if binary_cols:
service_binary = [c for c in binary_cols if c != target]
if service_binary:
df_features['service_adoption_score'] = df_features[service_binary].sum(axis=1)
print(f" ✓ service_adoption_score from {service_binary}")
registry.add_silver_derived(
column="service_adoption_score",
expression=f"sum([{', '.join(service_binary)}])",
feature_type="composite",
rationale=f"Service adoption count from {len(service_binary)} binary flags",
source_notebook="06_feature_opportunities"
)
print("\n💰 VALUE FEATURES:")
value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]
freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]
if value_cols and freq_cols:
df_features['value_frequency_product'] = df_features[value_cols[0]] * df_features[freq_cols[0]]
print(f" ✓ value_frequency_product from {value_cols[0]}, {freq_cols[0]}")
registry.add_silver_interaction(
column="value_frequency_product",
features=[value_cols[0], freq_cols[0]],
rationale=f"Value-frequency interaction: {value_cols[0]} × {freq_cols[0]}",
source_notebook="06_feature_opportunities"
)
new_cols = len(df_features.columns) - len(df.columns)
print(f"\n✓ Created {new_cols} new features (total: {len(df_features.columns)})")
print(f"✅ Persisted {len([c for c in ['tenure_days', 'days_since_last_activity', 'email_engagement_score', 'click_to_open_rate', 'service_adoption_score', 'value_frequency_product'] if c in df_features.columns])} derived feature recommendations to registry")
======================================================================
CREATING DERIVED FEATURES
======================================================================

Reference date: 2023-06-19 00:00:00

📅 TIME-BASED FEATURES:

📧 ENGAGEMENT FEATURES:
 ✓ email_engagement_score from opened_velocity_pct, clicked_vs_cohort_pct
 ✓ click_to_open_rate

🔧 SERVICE ADOPTION:
 ✓ service_adoption_score from ['opened_max_180d', 'clicked_max_180d', 'bounced_max_180d', 'opened_max_365d', 'clicked_max_365d', 'bounced_max_365d', 'opened_max_all_time', 'clicked_max_all_time', 'bounced_max_all_time', 'lag0_opened_max', 'lag0_clicked_max', 'lag0_bounced_max', 'lag1_opened_max', 'lag1_clicked_max', 'lag1_bounced_sum', 'lag1_bounced_max', 'lag2_opened_max', 'lag2_clicked_max', 'lag2_bounced_sum', 'lag2_bounced_max', 'lag3_opened_max', 'lag3_clicked_sum', 'lag3_clicked_max', 'lag3_bounced_sum', 'lag3_bounced_max', 'clicked_velocity_pct', 'bounced_velocity_pct', 'clicked_momentum', 'bounced_momentum']

💰 VALUE FEATURES:

✓ Created 3 new features (total: 221)
✅ Persisted 3 derived feature recommendations to registry
6.6 Customer Segmentation Features¶
Create business-meaningful segments for analysis and modeling.
📖 Segmentation Strategy:
- Value Dimension: High vs Low (based on avgorder median)
- Frequency Dimension: Frequent vs Infrequent (based on ordfreq median)
- Recency Buckets: Active, Recent, Lapsing, Dormant
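A minimal pandas sketch of these rules: median splits on the value and frequency dimensions, plus recency buckets. Column names and bucket edges are illustrative; `CustomerSegmenter` implements the production version:

```python
import numpy as np
import pandas as pd

# Toy customers with a value, frequency, and recency column.
df = pd.DataFrame({
    "avg_order": [20, 80, 35, 120],
    "order_freq": [1, 6, 5, 2],
    "days_since_last": [10, 45, 120, 400],
})

# Value × frequency quadrants via median splits.
hi_value = df["avg_order"] > df["avg_order"].median()
hi_freq = df["order_freq"] > df["order_freq"].median()
df["value_tier"] = np.where(hi_value, "High", "Low")
df["freq_tier"] = np.where(hi_freq, "Frequent", "Infrequent")
df["vf_segment"] = df["value_tier"] + "_" + df["freq_tier"]

# Recency buckets with illustrative edges (days).
df["recency_bucket"] = pd.cut(
    df["days_since_last"],
    bins=[0, 30, 90, 180, float("inf")],
    labels=["Active", "Recent", "Lapsing", "Dormant"],
)
print(df[["vf_segment", "recency_bucket"]])
```

Median splits guarantee roughly balanced quadrants regardless of scale, which is why they are preferred over fixed thresholds for the value/frequency dimensions.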
Show/Hide Code
print("=" * 70)
print("CUSTOMER SEGMENTATION")
print("=" * 70)
print("\n🎯 VALUE-FREQUENCY SEGMENTS:")
value_cols = [c for c in numeric_cols if 'order' in c.lower() or 'amount' in c.lower() or 'value' in c.lower() or 'avg' in c.lower()]
freq_cols = [c for c in numeric_cols if 'freq' in c.lower() or 'count' in c.lower()]
if value_cols and freq_cols:
df_features, vf_result = segmenter.segment_by_value_frequency(
df_features, value_column=value_cols[0], frequency_column=freq_cols[0])
print(f" Using {value_cols[0]} × {freq_cols[0]}")
for seg in vf_result.segments:
print(f" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
else:
print(" No suitable value/frequency columns found")
print("\n📅 RECENCY SEGMENTS:")
if 'days_since_last_activity' in df_features.columns:
df_features, recency_result = segmenter.segment_by_recency(df_features, days_since_column='days_since_last_activity')
for seg in recency_result.segments:
print(f" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
else:
print(" No recency column available")
print("\n📧 ENGAGEMENT SEGMENTS:")
if 'email_engagement_score' in df_features.columns:
max_score = df_features['email_engagement_score'].max()
if max_score > 0:
df_features['engagement_normalized'] = df_features['email_engagement_score'] / max_score
df_features, eng_result = segmenter.segment_by_engagement(df_features, engagement_column='engagement_normalized')
for seg in eng_result.segments:
print(f" {seg.name}: {seg.count:,} ({seg.percentage:.1f}%)")
df_features = df_features.drop(columns=['engagement_normalized'])
else:
print(" No engagement score available")
if 'customer_segment' in df_features.columns and target and target in df_features.columns:
segment_retention = df_features.groupby('customer_segment')[target].mean() * 100
max_rate = segment_retention.max()
fig = go.Figure(go.Bar(
x=segment_retention.index, y=segment_retention.values,
marker_color=['#2ca02c' if r > 70 else '#ffbb00' if r > 50 else '#d62728' for r in segment_retention.values],
text=[f'{r:.1f}%' for r in segment_retention.values], textposition='outside'))
fig.update_layout(
title='Retention Rate by Customer Segment',
xaxis_title='Segment',
yaxis_title='Retention Rate (%)',
yaxis_range=[0, max_rate * 1.15], # Add 15% headroom for labels
template='plotly_white',
height=400,
)
display_figure(fig)
segment_cols = [c for c in df_features.columns if 'segment' in c.lower() or 'bucket' in c.lower()]
print(f"\n✓ Created {len(segment_cols)} segmentation features")
======================================================================
CUSTOMER SEGMENTATION
======================================================================

🎯 VALUE-FREQUENCY SEGMENTS:
 No suitable value/frequency columns found

📅 RECENCY SEGMENTS:
 No recency column available

📧 ENGAGEMENT SEGMENTS:
High_Engagement: 5,656 (0.3%)
Medium_Engagement: 0 (0.0%)
Low_Engagement: 67,468 (3.3%)
✓ Created 2 segmentation features
6.7 Numeric Transformation Opportunities¶
Show/Hide Code
numeric_cols = [
name for name, col in findings.columns.items()
if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]
and name not in TEMPORAL_METADATA_COLS
]
transform_count = 0
if numeric_cols:
print("Numeric Transformation Opportunities:")
print("="*50)
for col_name in numeric_cols:
col_info = findings.columns[col_name]
series = df[col_name].dropna()
skewness = series.skew()
print(f"\n{col_name}:")
print(f" Skewness: {skewness:.2f}")
if abs(skewness) > 1:
print(" Recommendation: Apply log transform (highly skewed)")
registry.add_gold_transformation(
column=col_name,
transform="log",
parameters={"skewness": float(skewness), "reason": "highly_skewed"},
rationale=f"Log transform for highly skewed distribution (skewness={skewness:.2f})",
source_notebook="06_feature_opportunities"
)
transform_count += 1
elif abs(skewness) > 0.5:
print(" Recommendation: Consider sqrt transform (moderately skewed)")
registry.add_gold_transformation(
column=col_name,
transform="sqrt",
parameters={"skewness": float(skewness), "reason": "moderately_skewed"},
rationale=f"Sqrt transform for moderately skewed distribution (skewness={skewness:.2f})",
source_notebook="06_feature_opportunities"
)
transform_count += 1
else:
print(" Recommendation: Standard scaling sufficient")
registry.add_gold_scaling(
column=col_name,
method="standard",
rationale=f"Standard scaling for normally distributed column (skewness={skewness:.2f})",
source_notebook="06_feature_opportunities"
)
transform_count += 1
if col_info.inferred_type == ColumnType.NUMERIC_CONTINUOUS:
print(f" Binning: Consider creating bins for {col_name}_binned")
print(f"\n✅ Persisted {transform_count} transformation recommendations to registry")
Numeric Transformation Opportunities:
==================================================

event_count_180d:
 Skewness: 2.15
 Recommendation: Apply log transform (highly skewed)

event_count_365d:
 Skewness: 1.61
 Recommendation: Apply log transform (highly skewed)

event_count_all_time:
 Skewness: 2.61
 Recommendation: Apply log transform (highly skewed)
 Binning: Consider creating bins for event_count_all_time_binned

opened_sum_180d:
 Skewness: 3.14
 Recommendation: Apply log transform (highly skewed)

send_hour_mean_180d:
 Skewness: 0.04
 Recommendation: Standard scaling sufficient
 Binning: Consider creating bins for send_hour_mean_180d_binned

bounced_sum_180d:
 Skewness: 9.70
 Recommendation: Apply log transform (highly skewed)

… (output continues in the same pattern for the remaining numeric columns: most count/sum features are highly skewed, e.g. lag0_opened_count at 12.96, and receive a log-transform recommendation, while the near-symmetric send_hour_mean/max features receive standard scaling)
Binning: Consider creating bins for send_hour_velocity_binned send_hour_velocity_pct: Skewness: 3.94 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_velocity_pct_binned bounced_velocity: Skewness: 1.10 Recommendation: Apply log transform (highly skewed) time_to_open_hours_velocity: Skewness: 1.06 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_velocity_binned time_to_open_hours_velocity_pct: Skewness: 10.28 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_velocity_pct_binned opened_acceleration: Skewness: -1.07 Recommendation: Apply log transform (highly skewed)
opened_momentum: Skewness: 11.58 Recommendation: Apply log transform (highly skewed) clicked_acceleration: Skewness: -1.95 Recommendation: Apply log transform (highly skewed) send_hour_acceleration: Skewness: -0.09 Recommendation: Standard scaling sufficient Binning: Consider creating bins for send_hour_acceleration_binned send_hour_momentum: Skewness: 26.37 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_momentum_binned bounced_acceleration: Skewness: -1.24 Recommendation: Apply log transform (highly skewed) time_to_open_hours_acceleration: Skewness: -1.19 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_acceleration_binned time_to_open_hours_momentum: Skewness: 9.22 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_momentum_binned
opened_beginning: Skewness: 1.98 Recommendation: Apply log transform (highly skewed) opened_end: Skewness: 1.77 Recommendation: Apply log transform (highly skewed) opened_trend_ratio: Skewness: 1.66 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for opened_trend_ratio_binned clicked_beginning: Skewness: 2.03 Recommendation: Apply log transform (highly skewed) clicked_end: Skewness: 2.06 Recommendation: Apply log transform (highly skewed) clicked_trend_ratio: Skewness: 1.93 Recommendation: Apply log transform (highly skewed) send_hour_beginning: Skewness: 2.29 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_beginning_binned send_hour_end: Skewness: 2.00 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_end_binned send_hour_trend_ratio: Skewness: 5.00 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_trend_ratio_binned bounced_beginning: Skewness: 3.06 Recommendation: Apply log transform (highly skewed) bounced_end: Skewness: 2.96 Recommendation: Apply log transform (highly skewed) bounced_trend_ratio: Skewness: 2.82 Recommendation: Apply log transform (highly skewed) time_to_open_hours_beginning: Skewness: 2.67 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_beginning_binned
time_to_open_hours_end: Skewness: 2.30 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_end_binned time_to_open_hours_trend_ratio: Skewness: 11.12 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_trend_ratio_binned days_since_last_event_y: Skewness: 0.00 Recommendation: Standard scaling sufficient days_since_first_event_y: Skewness: -0.95 Recommendation: Consider sqrt transform (moderately skewed) Binning: Consider creating bins for days_since_first_event_y_binned active_span_days: Skewness: -0.95 Recommendation: Consider sqrt transform (moderately skewed) Binning: Consider creating bins for active_span_days_binned
recency_ratio: Skewness: 0.00 Recommendation: Standard scaling sufficient event_frequency: Skewness: 35.77 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for event_frequency_binned inter_event_gap_mean: Skewness: 0.40 Recommendation: Standard scaling sufficient Binning: Consider creating bins for inter_event_gap_mean_binned inter_event_gap_std: Skewness: 0.37 Recommendation: Standard scaling sufficient Binning: Consider creating bins for inter_event_gap_std_binned inter_event_gap_max: Skewness: 0.44 Recommendation: Standard scaling sufficient Binning: Consider creating bins for inter_event_gap_max_binned regularity_score: Skewness: 2.29 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for regularity_score_binned opened_vs_cohort_mean: Skewness: 2.65 Recommendation: Apply log transform (highly skewed) opened_vs_cohort_pct: Skewness: 2.65 Recommendation: Apply log transform (highly skewed) opened_cohort_zscore: Skewness: 2.65 Recommendation: Apply log transform (highly skewed) clicked_vs_cohort_mean: Skewness: 4.72 Recommendation: Apply log transform (highly skewed) clicked_vs_cohort_pct: Skewness: 4.72 Recommendation: Apply log transform (highly skewed) clicked_cohort_zscore: Skewness: 4.72 Recommendation: Apply log transform (highly skewed)
send_hour_vs_cohort_mean: Skewness: 10.64 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_vs_cohort_mean_binned send_hour_vs_cohort_pct: Skewness: 10.64 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_vs_cohort_pct_binned send_hour_cohort_zscore: Skewness: 10.64 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for send_hour_cohort_zscore_binned
bounced_vs_cohort_mean: Skewness: 6.44 Recommendation: Apply log transform (highly skewed) bounced_vs_cohort_pct: Skewness: 6.44 Recommendation: Apply log transform (highly skewed) bounced_cohort_zscore: Skewness: 6.44 Recommendation: Apply log transform (highly skewed) time_to_open_hours_vs_cohort_mean: Skewness: 5.34 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_vs_cohort_mean_binned time_to_open_hours_vs_cohort_pct: Skewness: 5.34 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_vs_cohort_pct_binned time_to_open_hours_cohort_zscore: Skewness: 5.34 Recommendation: Apply log transform (highly skewed) Binning: Consider creating bins for time_to_open_hours_cohort_zscore_binned ✅ Persisted 179 transformation recommendations to registry
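The recommendations above can be applied mechanically. Below is a minimal sketch, assuming skewness thresholds of |skew| > 1 for a log transform and 0.5-1 for a sqrt transform; these cutoffs and the helper name `apply_skew_transforms` are illustrative, not the framework's exact implementation.

```python
import numpy as np
import pandas as pd

def apply_skew_transforms(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Add log/sqrt-transformed variants of skewed columns (assumed thresholds)."""
    out = df.copy()
    for col in cols:
        skew = out[col].skew()
        # Shift so the minimum is >= 0 before log1p/sqrt (handles negatives).
        shift = max(0.0, -out[col].min())
        if abs(skew) > 1:
            out[f"{col}_log"] = np.log1p(out[col] + shift)
        elif abs(skew) > 0.5:
            out[f"{col}_sqrt"] = np.sqrt(out[col] + shift)
    return out

df = pd.DataFrame({"event_frequency": [0, 1, 1, 2, 3, 50, 200]})
df = apply_skew_transforms(df, ["event_frequency"])
print(df.filter(like="event_frequency").round(2))
```

`log1p` is used rather than `log` so zero counts remain defined; for left-skewed features (negative skewness, like `opened_acceleration`), reflecting the column before transforming is another common option.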
6.8 Categorical Encoding Opportunities¶
Show/Hide Code
categorical_cols = [
    name for name, col in findings.columns.items()
    if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]
    and name not in TEMPORAL_METADATA_COLS
]
encoding_count = 0
if categorical_cols:
    print("Categorical Encoding Recommendations:")
    print("=" * 50)
    for col_name in categorical_cols:
        col_info = findings.columns[col_name]
        distinct = col_info.universal_metrics.get("distinct_count", 0)
        print(f"\n{col_name}: ({distinct} unique values)")
        if distinct <= 5:
            print("  Recommendation: One-hot encoding")
            registry.add_gold_encoding(
                column=col_name,
                method="onehot",
                rationale=f"One-hot encoding for low cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities",
            )
            encoding_count += 1
        elif distinct <= 20:
            print("  Recommendation: Target encoding or one-hot with frequency threshold")
            registry.add_gold_encoding(
                column=col_name,
                method="target",
                rationale=f"Target encoding for medium cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities",
            )
            encoding_count += 1
        else:
            print("  Recommendation: Target encoding or embedding (high cardinality)")
            registry.add_gold_encoding(
                column=col_name,
                method="target",
                rationale=f"Target encoding for high cardinality ({distinct} unique values)",
                source_notebook="06_feature_opportunities",
            )
            encoding_count += 1
        if col_info.inferred_type == ColumnType.CATEGORICAL_ORDINAL:
            print("  Note: Consider ordinal encoding to preserve order")
print(f"\n✅ Persisted {encoding_count} encoding recommendations to registry")
Categorical Encoding Recommendations:
==================================================

lifecycle_quadrant: (4 unique values)
  Recommendation: One-hot encoding

recency_bucket: (5 unique values)
  Recommendation: One-hot encoding

✅ Persisted 2 encoding recommendations to registry
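The cardinality rules above can be exercised with plain pandas. This is a hedged sketch: the column names and data are synthetic, and the target encoding shown is a naive mean-of-target map; in practice you would use cross-validated or smoothed target encoding to avoid leakage.

```python
import pandas as pd

df = pd.DataFrame({
    "recency_bucket": ["0-30", "31-90", "0-30", "91-180", "31-90"],
    "retained": [1, 1, 0, 0, 1],
})

# Low cardinality (<= 5 unique values): one-hot encode.
onehot = pd.get_dummies(df["recency_bucket"], prefix="recency_bucket")

# Medium/high cardinality: naive target encoding (illustrative only --
# fit these means on training folds, not the full dataset).
target_means = df.groupby("recency_bucket")["retained"].mean()
df["recency_bucket_te"] = df["recency_bucket"].map(target_means)

print(onehot.join(df["recency_bucket_te"]))
```

For ordinal columns, `pd.Categorical(..., ordered=True).codes` preserves the order that one-hot encoding discards.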
Summary: What We Learned¶
In this notebook, we identified feature engineering opportunities and analyzed data capacity:
Feature Capacity Analysis¶
- Events Per Variable (EPV) - Calculated the data's capacity to support features
- Effective Features - Identified redundant features due to high correlation
- Model Complexity Guidance - Determined appropriate model types based on data size
- Segment Capacity - Evaluated whether segmented modeling is viable
Feature Engineering¶
- Automated Recommendations - Framework suggested feature opportunities
- Time-Based Features - Created tenure, recency, active period metrics
- Engagement Scores - Built composite email engagement metrics
- Customer Segments - Created value-frequency and recency-based segments
- Encoding Strategies - Identified an appropriate encoding for each categorical column
Feature Capacity Key Concepts¶
| Metric | What It Means | Rule of Thumb |
|---|---|---|
| EPV ≥ 20 | Stable, reliable estimates | Conservative, regulatory-grade |
| EPV = 10-20 | Standard practice | Use for most applications |
| EPV = 5-10 | Limited capacity | Requires strong regularization |
| EPV < 5 | High risk | Reduce features or get more data |
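As a worked example of the EPV rule of thumb: for a binary target, "events" is the minority-class count, so EPV is that count divided by the number of candidate features. The numbers below are illustrative, not this dataset's actual values, and the exact definition used by `FeatureCapacityAnalyzer` may differ.

```python
def events_per_variable(n_events: int, n_features: int) -> float:
    """EPV = minority-class event count / number of candidate features."""
    return n_events / n_features

# Hypothetical example: 10,000 customers, 12% churn, 60 candidate features.
n_customers, churn_rate, n_features = 10_000, 0.12, 60
n_events = int(n_customers * churn_rate)          # 1,200 minority-class events
epv = events_per_variable(n_events, n_features)   # 1200 / 60 = 20.0
print(f"EPV = {epv:.1f}")                         # meets the EPV >= 20 threshold
```

At this EPV the table above would call the setup "conservative, regulatory-grade"; halving the event count or doubling the feature set would drop it into the 5-10 band, where strong regularization is needed.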
Key Derived Features Created¶
| Feature | Formula | Business Meaning |
|---|---|---|
| `tenure_days` | reference_date - created | Customer longevity |
| `days_since_last_order` | reference_date - lastorder | Recency/engagement |
| `email_engagement_score` | 0.6×openrate + 0.4×clickrate | Overall engagement |
| `service_adoption_score` | paperless + refill + doorstep | Service utilization |
| `customer_segment` | Value × Frequency quadrant | Customer type |
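The formulas in the table translate directly into pandas. A minimal sketch follows; the reference date and the source columns (`created`, `lastorder`, `openrate`, `clickrate`, `paperless`, `refill`, `doorstep`) match the table, but the data here is synthetic.

```python
import pandas as pd

ref = pd.Timestamp("2024-01-01")  # assumed reference date
df = pd.DataFrame({
    "created":   pd.to_datetime(["2022-01-01", "2023-06-01"]),
    "lastorder": pd.to_datetime(["2023-12-01", "2023-11-15"]),
    "openrate":  [0.50, 0.20],
    "clickrate": [0.10, 0.05],
    "paperless": [1, 0],
    "refill":    [1, 1],
    "doorstep":  [0, 0],
})

df["tenure_days"] = (ref - df["created"]).dt.days
df["days_since_last_order"] = (ref - df["lastorder"]).dt.days
df["email_engagement_score"] = 0.6 * df["openrate"] + 0.4 * df["clickrate"]
df["service_adoption_score"] = df[["paperless", "refill", "doorstep"]].sum(axis=1)

print(df[["tenure_days", "days_since_last_order",
          "email_engagement_score", "service_adoption_score"]])
```

The 0.6/0.4 engagement weights come from the table above; they are a design choice, and a model-driven alternative is to let the downstream model learn the weighting from the raw rates.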
Next Steps¶
Continue to 07_modeling_readiness.ipynb to:
- Validate data is ready for modeling
- Check for data leakage
- Assess class imbalance
- Review feature completeness
Show/Hide Code
print("Potential Interaction Features:")
print("="*50)
if len(numeric_cols) >= 2:
print("\nNumeric Interactions:")
for i, col1 in enumerate(numeric_cols[:3]):
for col2 in numeric_cols[i+1:4]:
print(f" - {col1}_x_{col2}: Multiplication")
print(f" - {col1}_div_{col2}: Division (if {col2} > 0)")
if categorical_cols and numeric_cols:
print("\nCategorical-Numeric Interactions:")
for cat_col in categorical_cols[:2]:
for num_col in numeric_cols[:2]:
print(f" - {num_col}_by_{cat_col}_mean: Group mean")
print(f" - {num_col}_by_{cat_col}_std: Group std")
Potential Interaction Features:
==================================================

Numeric Interactions:
  - event_count_180d_x_event_count_365d: Multiplication
  - event_count_180d_div_event_count_365d: Division (if event_count_365d > 0)
  - event_count_180d_x_event_count_all_time: Multiplication
  - event_count_180d_div_event_count_all_time: Division (if event_count_all_time > 0)
  - event_count_180d_x_opened_sum_180d: Multiplication
  - event_count_180d_div_opened_sum_180d: Division (if opened_sum_180d > 0)
  - event_count_365d_x_event_count_all_time: Multiplication
  - event_count_365d_div_event_count_all_time: Division (if event_count_all_time > 0)
  - event_count_365d_x_opened_sum_180d: Multiplication
  - event_count_365d_div_opened_sum_180d: Division (if opened_sum_180d > 0)
  - event_count_all_time_x_opened_sum_180d: Multiplication
  - event_count_all_time_div_opened_sum_180d: Division (if opened_sum_180d > 0)

Categorical-Numeric Interactions:
  - event_count_180d_by_lifecycle_quadrant_mean: Group mean
  - event_count_180d_by_lifecycle_quadrant_std: Group std
  - event_count_365d_by_lifecycle_quadrant_mean: Group mean
  - event_count_365d_by_lifecycle_quadrant_std: Group std
  - event_count_180d_by_recency_bucket_mean: Group mean
  - event_count_180d_by_recency_bucket_std: Group std
  - event_count_365d_by_recency_bucket_mean: Group mean
  - event_count_365d_by_recency_bucket_std: Group std
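Materializing these interactions is straightforward. The sketch below uses synthetic data with column names mirroring the output above; the guarded division avoids divide-by-zero, matching the "(if X > 0)" caveat in the printed suggestions.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "event_count_180d": [3, 0, 8],
    "event_count_365d": [5, 2, 10],
    "lifecycle_quadrant": ["A", "B", "A"],
})

# Numeric x numeric: product and guarded ratio.
df["event_count_180d_x_event_count_365d"] = (
    df["event_count_180d"] * df["event_count_365d"]
)
df["event_count_180d_div_event_count_365d"] = np.where(
    df["event_count_365d"] > 0,
    df["event_count_180d"] / df["event_count_365d"],
    np.nan,
)

# Categorical x numeric: per-group mean, broadcast back to each row.
df["event_count_180d_by_lifecycle_quadrant_mean"] = (
    df.groupby("lifecycle_quadrant")["event_count_180d"].transform("mean")
)

print(df)
```

Note that group statistics computed this way leak the full dataset's distribution; for modeling, fit them on the training split only.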
6.9 Feature Summary Table¶
Show/Hide Code
feature_summary = []
for rec in feature_recs:
    feature_summary.append({
        "Feature Name": rec.feature_name,
        "Source": rec.source_column,
        "Type": rec.feature_type,
        "Priority": rec.priority,
    })
if feature_summary:
    summary_df = pd.DataFrame(feature_summary)
    display(summary_df)
| | Feature Name | Source | Type | Priority |
|---|---|---|---|---|
| 0 | as_of_date_year | as_of_date | temporal | medium |
| 1 | as_of_date_month | as_of_date | temporal | medium |
| 2 | as_of_date_dayofweek | as_of_date | temporal | medium |
| 3 | days_since_as_of_date | as_of_date | datetime | high |
| 4 | event_count_180d_binned | event_count_180d | numeric | low |
| ... | ... | ... | ... | ... |
| 334 | time_to_open_hours_vs_cohort_mean_log | time_to_open_hours_vs_cohort_mean | numeric | high |
| 335 | time_to_open_hours_vs_cohort_pct_binned | time_to_open_hours_vs_cohort_pct | numeric | low |
| 336 | time_to_open_hours_vs_cohort_pct_log | time_to_open_hours_vs_cohort_pct | numeric | high |
| 337 | time_to_open_hours_cohort_zscore_binned | time_to_open_hours_cohort_zscore | numeric | low |
| 338 | time_to_open_hours_cohort_zscore_log | time_to_open_hours_cohort_zscore | numeric | high |
339 rows × 4 columns
Show/Hide Code
# Save recommendations
registry.save(RECOMMENDATIONS_PATH)
print(f"✅ Saved {len(registry.all_recommendations)} recommendations to {RECOMMENDATIONS_PATH}")

print("\nRecommendations by layer:")
for layer in ["bronze", "silver", "gold"]:
    recs = registry.get_by_layer(layer)
    print(f"  {layer.upper()}: {len(recs)}")

from customer_retention.analysis.notebook_html_exporter import export_notebook_html

export_notebook_html(Path("06_feature_opportunities.ipynb"), EXPERIMENTS_DIR / "docs")
✅ Saved 860 recommendations to /Users/Vital/python/CustomerRetention/experiments/runs/email-6301db6c/merged/recommendations.yaml

Recommendations by layer:
  BRONZE: 5
  SILVER: 10
  GOLD: 845
PosixPath('/Users/Vital/python/CustomerRetention/experiments/docs/06_feature_opportunities.html')