Chapter 1a.a: Temporal Text Columns Deep Dive¶
Purpose: Transform TEXT columns in event-level data into numeric features, then aggregate across time windows.
When to use this notebook:
- Your dataset is EVENT_LEVEL (time series)
- You have TEXT columns (tickets, messages, emails, etc.)
- Run after 01a_temporal_deep_dive.ipynb
Processing Flow:
Event TEXT → Embeddings → PCA → pc1, pc2, ... → Time Window Aggregation
What you'll learn:
- How to embed text at the event level
- How to choose between fast vs high-quality embedding models
- How PCA features aggregate across time windows
- Creating features like ticket_text_pc1_mean_30d
Outputs:
- PC features per event
- Aggregation plan for PC features
- Updated findings with text processing metadata
Two Approaches to Text Feature Engineering¶
| Approach | Method | When to Use |
|---|---|---|
| 1. Embeddings + PCA + Aggregation (This notebook) | Per-event PCA → aggregate | Temporal patterns in text |
| 2. LLM Labeling (Future) | LLM labels → categorical aggregation | Specific categories needed |
Embedding Model Options¶
| Model | Size | Embedding Dim | Speed | Quality | Best For |
|---|---|---|---|---|---|
| MiniLM (default) | 90 MB | 384 | Fast | Good | CPU, quick iteration, small datasets |
| Qwen3-0.6B | 1.2 GB | 1024 | Medium | Better | GPU available, production quality |
| Qwen3-4B | 8 GB | 2560 | Slow | High | 16GB+ GPU, multilingual, high accuracy |
| Qwen3-8B | 16 GB | 4096 | Slowest | Highest | 32GB+ GPU, research, max quality |
Note: Models are downloaded on first use. For event-level data with many rows, faster models (MiniLM) are recommended unless you have a powerful GPU.
Processing Flow¶
Per Event: TEXT → Embedding → [pc1, pc2, pc3]
Aggregate: customer_id → ticket_text_pc1_mean_30d, ticket_text_pc2_std_7d, ...
This captures how text semantics change over time windows.
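The two steps above can be sketched end-to-end. This is a minimal illustration, using random vectors in place of real sentence embeddings so it runs without downloading a model; the column names mirror the examples in this notebook, but the actual pipeline routes through TextColumnProcessor and TimeWindowAggregator rather than calling sklearn directly.

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
n_events = 200
events = pd.DataFrame({
    "customer_id": rng.integers(0, 20, n_events),
    "sent_date": pd.Timestamp("2024-01-01")
    + pd.to_timedelta(rng.integers(0, 90, n_events), unit="D"),
})
embeddings = rng.normal(size=(n_events, 384))  # stand-in for model output

# Per event: reduce each 384-dim embedding to a few principal components
pcs = PCA(n_components=3).fit_transform(embeddings)
for i in range(3):
    events[f"ticket_text_pc{i + 1}"] = pcs[:, i]

# Aggregate: mean of PC1 per entity (the "all_time" window)
agg = events.groupby("customer_id")["ticket_text_pc1"].mean()
print(agg.head())
```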
1a.a.1 Load Previous Findings¶
Show/Hide Code
from customer_retention.analysis.notebook_progress import track_and_export_previous
track_and_export_previous("01a_a_temporal_text_deep_dive.ipynb")
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata
from customer_retention.analysis.visualization import ChartBuilder, display_figure
from customer_retention.core.config.column_config import ColumnType
from customer_retention.core.config.experiments import (
EXPERIMENTS_DIR,
FINDINGS_DIR,
)
from customer_retention.stages.profiling import (
TextColumnProcessor,
TextProcessingConfig,
TimeWindowAggregator,
get_model_info,
list_available_models,
)
Show/Hide Code
from customer_retention.analysis.auto_explorer import load_notebook_findings
DATASET_NAME = None # Set to override auto-resolved dataset, e.g. "3set_support_tickets"
FINDINGS_PATH, _namespace, dataset_name = load_notebook_findings("01a_a_temporal_text_deep_dive.ipynb")
if DATASET_NAME is not None:
dataset_name = DATASET_NAME
print(f"Using: {FINDINGS_PATH}")
findings = ExplorationFindings.load(FINDINGS_PATH)
print(f"\nLoaded findings for {findings.column_count} columns from {findings.source_path}")
Using: /Users/Vital/python/CustomerRetention/experiments/runs/email-6301db6c/datasets/customer_emails/findings/customer_emails_findings.yaml Loaded findings for 13 columns from ../tests/fixtures/customer_emails.csv
Show/Hide Code
from customer_retention.analysis.auto_explorer.project_context import ProjectContext
LIGHT_RUN = False
if _namespace and _namespace.project_context_path.exists():
_project_ctx = ProjectContext.load(_namespace.project_context_path)
LIGHT_RUN = _project_ctx.light_run
if LIGHT_RUN:
print("LIGHT_RUN mode: text embedding analysis will be skipped")
Show/Hide Code
# Verify this is a time series dataset
# This notebook is ONLY for event-level (time series) data with multiple rows per entity
if not findings.is_time_series:
print("=" * 70)
print("WRONG NOTEBOOK FOR THIS DATASET")
print("=" * 70)
print()
print("This dataset is ENTITY-LEVEL (one row per entity), not event-level.")
print()
print("For TEXT columns in entity-level data, use:")
print(" 04a_text_columns_deep_dive.ipynb")
print()
print("This notebook (01a_a) is for TEXT columns in EVENT-LEVEL data where:")
print(" - Multiple events per entity (e.g., support tickets, transactions)")
print(" - Text is embedded per-event, then aggregated across time windows")
print()
raise SystemExit("Please use 04a_text_columns_deep_dive.ipynb for entity-level data.")
ts_meta = findings.time_series_metadata
temporal_pattern = (ts_meta.temporal_pattern or "unknown").upper()
print(f"Dataset confirmed as {temporal_pattern} (event-level)")
ENTITY_COLUMN = ts_meta.entity_column
TIME_COLUMN = ts_meta.time_column
print(f" Entity column: {ENTITY_COLUMN}")
print(f" Time column: {TIME_COLUMN}")
Dataset confirmed as EVENT_LOG (event-level) Entity column: customer_id Time column: sent_date
Show/Hide Code
# Identify TEXT columns
text_columns = [
name for name, col in findings.columns.items()
if col.inferred_type == ColumnType.TEXT
]
if not text_columns:
print("\u26a0\ufe0f No TEXT columns detected in this dataset.")
print(" This notebook is only needed when TEXT columns are present.")
print(" Continue to notebook 01b_temporal_quality.ipynb")
else:
print(f"\u2705 Found {len(text_columns)} TEXT column(s):")
for col in text_columns:
col_info = findings.columns[col]
print(f" - {col} (Confidence: {col_info.confidence:.0%})")
⚠️ No TEXT columns detected in this dataset. This notebook is only needed when TEXT columns are present. Continue to notebook 01b_temporal_quality.ipynb
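The notebook reads TEXT detection from the saved findings, but the idea behind such detection can be sketched. The heuristic below (long, mostly unique string values) is purely illustrative and is not the pipeline's actual type-inference logic; the thresholds and the `demo` data are made up.

```python
import pandas as pd

def likely_text_columns(df: pd.DataFrame, min_avg_len: int = 30,
                        min_unique_ratio: float = 0.5) -> list[str]:
    # Flag object columns whose values are long and mostly distinct,
    # which separates free text from low-cardinality categoricals.
    cols = []
    for name in df.select_dtypes(include="object").columns:
        s = df[name].dropna().astype(str)
        if s.empty:
            continue
        if s.str.len().mean() >= min_avg_len and s.nunique() / len(s) >= min_unique_ratio:
            cols.append(name)
    return cols

demo = pd.DataFrame({
    "status": ["open", "closed", "open", "closed"],
    "body": [
        "Customer reports the invoice total is wrong and asks for a refund.",
        "Password reset link expired; user cannot log in to the portal.",
        "Shipment arrived damaged, requesting a replacement unit.",
        "Question about upgrading from the basic to the premium plan.",
    ],
})
print(likely_text_columns(demo))  # -> ['body']
```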
1a.a.2 Load Source Data¶
Show/Hide Code
from customer_retention.analysis.auto_explorer.active_dataset_store import load_active_dataset
df = load_active_dataset(_namespace, dataset_name)
charts = ChartBuilder()
print(f"Loaded {len(df):,} events x {len(df.columns)} columns")
print(f"Data source: {dataset_name}")
print(f"Unique entities: {df[ENTITY_COLUMN].nunique():,}")
Loaded 83,198 events x 13 columns Data source: customer_emails Unique entities: 4,998
1a.a.3 Configure Text Processing¶
Show/Hide Code
# Display available embedding models
print("Available Embedding Models")
print("=" * 80)
print(f"{'Preset':<15} {'Model':<35} {'Size':<10} {'Dim':<8} {'GPU?'}")
print("-" * 80)
for preset in list_available_models():
info = get_model_info(preset)
size = f"{info['size_mb']} MB" if info['size_mb'] < 1000 else f"{info['size_mb']/1000:.1f} GB"
gpu = "Yes" if info['gpu_recommended'] else "No"
print(f"{preset:<15} {info['model_name']:<35} {size:<10} {info['embedding_dim']:<8} {gpu}")
print("\nFor event-level data with many rows, MiniLM is recommended for faster processing.")
print("Qwen3 models produce higher quality embeddings but require GPU for reasonable speed.")
Available Embedding Models ================================================================================ Preset Model Size Dim GPU? -------------------------------------------------------------------------------- minilm all-MiniLM-L6-v2 90 MB 384 No qwen3-0.6b Qwen/Qwen3-Embedding-0.6B 1.2 GB 1024 Yes qwen3-4b Qwen/Qwen3-Embedding-4B 8.0 GB 2560 Yes qwen3-8b Qwen/Qwen3-Embedding-8B 16.0 GB 4096 Yes For event-level data with many rows, MiniLM is recommended for faster processing. Qwen3 models produce higher quality embeddings but require GPU for reasonable speed.
Show/Hide Code
# === TEXT PROCESSING CONFIGURATION ===
# Choose your embedding model preset:
# "minilm" - Fast, CPU-friendly, recommended for event-level data (default)
# "qwen3-0.6b" - Better quality, needs GPU
# "qwen3-4b" - High quality, needs 16GB+ GPU
# "qwen3-8b" - Highest quality, needs 32GB+ GPU
EMBEDDING_PRESET = "minilm" # Recommended for event-level data
# PCA configuration (capped at 10 for manageability in aggregation)
VARIANCE_THRESHOLD = 0.95 # Keep components explaining 95% of variance
MIN_COMPONENTS = 2 # At least 2 features per text column
MAX_COMPONENTS = 10 # Cap at 10 to keep aggregation manageable
# Aggregation configuration
AGGREGATION_WINDOWS = ["7d", "30d", "90d", "all_time"]
AGGREGATION_FUNCS = ["mean", "std", "first", "last"]
# Create configuration
model_info = get_model_info(EMBEDDING_PRESET)
text_config = TextProcessingConfig(
embedding_model=model_info["model_name"],
variance_threshold=VARIANCE_THRESHOLD,
max_components=MAX_COMPONENTS,
min_components=MIN_COMPONENTS,
batch_size=32
)
print("Text Processing Configuration")
print("=" * 50)
print(f" Preset: {EMBEDDING_PRESET}")
print(f" Model: {text_config.embedding_model}")
print(f" Model size: {model_info['size_mb']} MB")
print(f" Embedding dimension: {model_info['embedding_dim']}")
print(f" GPU recommended: {'Yes' if model_info['gpu_recommended'] else 'No'}")
print()
print(f" Variance threshold: {text_config.variance_threshold:.0%}")
print(f" Max components: {text_config.max_components}")
print()
print("Aggregation Configuration")
print("=" * 50)
print(f" Windows: {AGGREGATION_WINDOWS}")
print(f" Functions: {AGGREGATION_FUNCS}")
if model_info['gpu_recommended']:
print()
print("Warning: This model works best with GPU. Consider 'minilm' for faster processing.")
Text Processing Configuration ================================================== Preset: minilm Model: all-MiniLM-L6-v2 Model size: 90 MB Embedding dimension: 384 GPU recommended: No Variance threshold: 95% Max components: 10 Aggregation Configuration ================================================== Windows: ['7d', '30d', '90d', 'all_time'] Functions: ['mean', 'std', 'first', 'last']
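The three PCA settings interact: the variance threshold picks a component count, which is then clamped to the [MIN_COMPONENTS, MAX_COMPONENTS] range. A sketch of that selection logic (assumed to mirror how TextProcessingConfig is applied; the actual implementation may differ):

```python
import numpy as np

def choose_n_components(explained_ratios, threshold=0.95, min_c=2, max_c=10):
    # Smallest number of components whose cumulative explained variance
    # reaches the threshold, clamped to [min_c, max_c].
    cumulative = np.cumsum(explained_ratios)
    n = int(np.searchsorted(cumulative, threshold)) + 1
    n = min(n, len(explained_ratios))
    return max(min_c, min(n, max_c))

# Cumulative variance reaches 0.95 at the 4th component
print(choose_n_components([0.6, 0.2, 0.1, 0.06, 0.04]))  # -> 4
# One component would suffice, but MIN_COMPONENTS=2 applies
print(choose_n_components([0.99, 0.01]))                  # -> 2
```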
1a.a.4 Text Column Analysis¶
Show/Hide Code
if text_columns:
for col_name in text_columns:
print(f"\n{'='*70}")
print(f"Column: {col_name}")
print(f"{'='*70}")
text_series = df[col_name].fillna("")
non_empty = (text_series.str.len() > 0).sum()
avg_length = text_series.str.len().mean()
print("\n\U0001f4ca Statistics:")
print(f" Total events: {len(text_series):,}")
print(f" Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)")
print(f" Avg length: {avg_length:.0f} characters")
# Texts per entity
texts_per_entity = df.groupby(ENTITY_COLUMN)[col_name].apply(
lambda x: (x.fillna("").str.len() > 0).sum()
)
print("\n\U0001f465 Text events per entity:")
print(f" Mean: {texts_per_entity.mean():.1f}")
print(f" Median: {texts_per_entity.median():.0f}")
print(f" Max: {texts_per_entity.max():,}")
# Sample texts
print("\n\U0001f4dd Sample texts:")
samples = text_series[text_series.str.len() > 10].head(3)
for i, sample in enumerate(samples, 1):
truncated = sample[:80] + "..." if len(sample) > 80 else sample
print(f" {i}. {truncated}")
1a.a.5 Process Text Columns (Per-Event Embeddings)¶
Show/Hide Code
results = []
if text_columns and findings.is_time_series and not LIGHT_RUN:
processor = TextColumnProcessor(text_config)
print("Processing TEXT columns...")
print("(This may take a moment for large datasets)\n")
df_with_pcs = df.copy()
for col_name in text_columns:
print(f"\n{'='*70}")
print(f"Processing: {col_name}")
print(f"{'='*70}")
df_with_pcs, result = processor.process_column(df_with_pcs, col_name)
results.append(result)
print("\n\u2705 Per-event processing complete:")
print(f" Components: {result.n_components}")
print(f" Explained variance: {result.explained_variance:.1%}")
print(f" Features: {', '.join(result.component_columns)}")
print(f"\n\nDataFrame now has {len(df_with_pcs.columns)} columns (added {len(df_with_pcs.columns) - len(df.columns)} PC columns)")
elif LIGHT_RUN:
print("Text embedding analysis skipped (LIGHT_RUN)")
1a.a.6 Plan Time Window Aggregation¶
PC features will be aggregated across time windows to create entity-level features.
Example output features:
- ticket_text_pc1_mean_7d: average of PC1 over the last 7 days
- ticket_text_pc2_std_30d: standard deviation of PC2 over the last 30 days
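The planned feature names are the cross product of PC columns, aggregation functions, and windows. A sketch of that naming scheme, following the pattern in the examples above (TimeWindowAggregator's real naming may differ in detail):

```python
from itertools import product

pc_columns = ["ticket_text_pc1", "ticket_text_pc2"]  # illustrative
funcs = ["mean", "std", "first", "last"]
windows = ["7d", "30d", "90d", "all_time"]

# One feature per (PC column, function, window) combination
feature_names = [f"{pc}_{func}_{win}"
                 for pc, func, win in product(pc_columns, funcs, windows)]
print(len(feature_names))   # 2 PCs x 4 funcs x 4 windows = 32
print(feature_names[0])     # ticket_text_pc1_mean_7d
```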
Show/Hide Code
if text_columns and findings.is_time_series and results:
# Collect all PC columns
all_pc_columns = []
for result in results:
all_pc_columns.extend(result.component_columns)
print(f"\n{'='*70}")
print("AGGREGATION PLAN")
print(f"{'='*70}")
aggregator = TimeWindowAggregator(ENTITY_COLUMN, TIME_COLUMN)
plan = aggregator.generate_plan(
df_with_pcs,
windows=AGGREGATION_WINDOWS,
value_columns=all_pc_columns,
agg_funcs=AGGREGATION_FUNCS,
include_event_count=False,
include_recency=False
)
print("\n\U0001f4ca Plan Summary:")
print(f" Entity column: {plan.entity_column}")
print(f" Time column: {plan.time_column}")
print(f" Windows: {[w.name for w in plan.windows]}")
print(f" Value columns: {len(plan.value_columns)}")
print(f" Aggregation functions: {plan.agg_funcs}")
print(f" Total features to create: {len(plan.feature_columns)}")
print("\n\U0001f4dd Sample feature names:")
for feat in plan.feature_columns[:10]:
print(f" - {feat}")
if len(plan.feature_columns) > 10:
print(f" ... and {len(plan.feature_columns) - 10} more")
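What a single windowed feature looks like when computed can be sketched with plain pandas. This is a hedged illustration: the data is made up, and the reference-date convention (anchoring the lookback at a fixed "as of" date) is an assumption; TimeWindowAggregator may anchor windows differently.

```python
import pandas as pd

events = pd.DataFrame({
    "customer_id": [1, 1, 1, 2, 2],
    "sent_date": pd.to_datetime([
        "2024-03-01", "2024-03-20", "2024-01-05", "2024-03-25", "2024-02-01",
    ]),
    "ticket_text_pc1": [0.5, 0.7, -0.2, 1.0, 0.0],
})

reference = pd.Timestamp("2024-03-31")
window = pd.Timedelta("30d")

# Restrict to events inside the lookback window, then aggregate per entity
in_window = events[events["sent_date"] >= reference - window]
agg = (
    in_window.groupby("customer_id")["ticket_text_pc1"]
    .agg(["mean", "std"])
    .add_prefix("ticket_text_pc1_")
    .add_suffix("_30d")
)
print(agg)  # customer 1: mean of 0.5 and 0.7; customer 2: single event 1.0
```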
1a.a.7 Visualize PC Distributions¶
Show/Hide Code
if text_columns and results:
for result in results:
print(f"\n{'='*70}")
print(f"PC Feature Distributions: {result.column_name}")
print(f"{'='*70}")
# Distribution of PC1 and PC2
if len(result.component_columns) >= 2:
fig = make_subplots(rows=1, cols=2,
subplot_titles=(result.component_columns[0], result.component_columns[1]))
fig.add_trace(go.Histogram(
x=df_with_pcs[result.component_columns[0]],
nbinsx=50, marker_color='steelblue', opacity=0.7
), row=1, col=1)
fig.add_trace(go.Histogram(
x=df_with_pcs[result.component_columns[1]],
nbinsx=50, marker_color='coral', opacity=0.7
), row=1, col=2)
fig.update_layout(
title=f"PC Feature Distributions: {result.column_name}",
height=350, template="plotly_white", showlegend=False
)
display_figure(fig)
# Scatter plot of PC1 vs PC2
if len(result.component_columns) >= 2:
fig = px.scatter(
df_with_pcs.sample(min(5000, len(df_with_pcs))),
x=result.component_columns[0],
y=result.component_columns[1],
title=f"PC1 vs PC2 (sample): {result.column_name}",
opacity=0.4
)
fig.update_layout(template="plotly_white", height=400)
display_figure(fig)
1a.a.8 Update Findings¶
Show/Hide Code
if text_columns and results:
for result in results:
metadata = TextProcessingMetadata(
column_name=result.column_name,
embedding_model=text_config.embedding_model,
embedding_dim=result.embeddings_shape[1],
n_components=result.n_components,
explained_variance=result.explained_variance,
component_columns=result.component_columns,
variance_threshold_used=text_config.variance_threshold,
processing_approach="pca"
)
findings.text_processing[result.column_name] = metadata
print(f"\u2705 Added text processing metadata for {result.column_name}")
findings.save(FINDINGS_PATH)
print(f"\nFindings saved to: {FINDINGS_PATH}")
from pathlib import Path
from customer_retention.analysis.notebook_html_exporter import export_notebook_html
export_notebook_html(Path("01a_a_temporal_text_deep_dive.ipynb"), EXPERIMENTS_DIR / "docs")
1a.a.9 Production Recommendations¶
Show/Hide Code
if text_columns and results:
print("\n" + "="*70)
print("PRODUCTION PIPELINE RECOMMENDATIONS")
print("="*70)
print("\n\U0001f527 Bronze Layer (per-event processing):")
for result in results:
print(f"\n {result.column_name}:")
print(" Action: embed_reduce")
print(f" Model: {text_config.embedding_model}")
print(f" Components: {result.n_components}")
print(f" Output: {', '.join(result.component_columns[:3])}...")
print("\n\U0001f527 Silver Layer (entity aggregation):")
print(f" Windows: {AGGREGATION_WINDOWS}")
print(f" Functions: {AGGREGATION_FUNCS}")
print(" Example features:")
for result in results[:1]:
pc1 = result.component_columns[0]
for window in AGGREGATION_WINDOWS[:2]:
for func in AGGREGATION_FUNCS[:2]:
print(f" - {pc1}_{func}_{window}")
print("\n\U0001f4a1 The pipeline generator will create these transformations automatically.")
Summary¶
In this notebook, we:
- Analyzed TEXT columns in event-level data
- Generated per-event embeddings using sentence-transformers
- Applied PCA to reduce dimensions
- Planned aggregation across time windows
- Updated findings with processing metadata
Processing Flow¶
Event TEXT → Embeddings (384-dim) → PCA (N components) → Aggregate by entity+window
Example Output Features¶
For a ticket_text column with 3 PC components and 4 time windows:
- ticket_text_pc1_mean_7d, ticket_text_pc1_std_7d, ...
- ticket_text_pc2_mean_7d, ticket_text_pc2_std_7d, ...
- Total: 3 PCs × 4 windows × 4 functions = 48 features
Next Steps¶
Continue with the Event Bronze Track:
- 01b_temporal_quality.ipynb - Check for duplicate events, temporal gaps
- 01c_temporal_patterns.ipynb - Detect trends, seasonality
- 01d_event_aggregation.ipynb - Aggregate all features (including text PCs) to entity-level
Save Reminder: Save this notebook (Ctrl+S / Cmd+S) before running the next one. The next notebook will automatically export this notebook's HTML documentation from the saved file.