Chapter 4a: Text Columns Deep Dive¶
Purpose: Transform TEXT columns (tickets, emails, messages) into numeric features using embeddings and dimensionality reduction.
When to use this notebook:
- Your dataset contains TEXT columns (unstructured text data)
- Detected automatically if ColumnType.TEXT found in findings
What you'll learn:
- How text embeddings capture semantic meaning
- Why PCA reduces dimensions while preserving variance
- How to choose between fast vs high-quality embedding models
Outputs:
- PC features (text_pc1, text_pc2, ...) for each TEXT column
- TextProcessingMetadata in findings
- Recommendations for production pipeline
Two Approaches to Text Feature Engineering¶
| Approach | Method | When to Use |
|---|---|---|
| 1. Embeddings + PCA (This notebook) | Sentence-transformers → PCA | General semantic features |
| 2. LLM Labeling (Future) | LLM on samples → Train classifier | Specific categories needed |
Approach 1: Embeddings + Dimensionality Reduction (Current)¶
TEXT Column → Embeddings → PCA → pc1, pc2, ..., pcN
- Embeddings: Dense vectors capturing semantic meaning (similar texts = similar vectors)
- PCA: Reduces dimensions to N components covering target variance (default 95%)
- Output: Numeric features usable with standard ML models
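The pipeline above can be sketched end-to-end. This is an illustrative sketch, not the notebook's own `TextColumnProcessor`: random vectors stand in for real sentence-transformer embeddings (in practice `SentenceTransformer("all-MiniLM-L6-v2").encode(texts)` would produce the `(n_rows, 384)` array), and scikit-learn's `PCA` is used with a float `n_components` to target the default 95% variance.

```python
import numpy as np
from sklearn.decomposition import PCA

# Stand-in for real embeddings: sentence-transformers returns one dense
# vector per row, e.g. shape (n_rows, 384) for MiniLM.
rng = np.random.default_rng(42)
embeddings = rng.normal(size=(200, 384))

# A float n_components asks PCA to keep the smallest number of components
# whose cumulative explained variance reaches that fraction.
pca = PCA(n_components=0.95)
features = pca.fit_transform(embeddings)

print(features.shape)  # (200, k) for some k <= 200
```

Each of the `k` columns of `features` would become one `pc` feature (e.g. `text_pc1, text_pc2, ...`) in the processed DataFrame.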
Embedding Model Options¶
| Model | Size | Embedding Dim | Speed | Quality | Best For |
|---|---|---|---|---|---|
| MiniLM (default) | 90 MB | 384 | Fast | Good | CPU, quick iteration, small datasets |
| Qwen3-0.6B | 1.2 GB | 1024 | Medium | Better | GPU available, production quality |
| Qwen3-4B | 8 GB | 2560 | Slow | High | 16GB+ GPU, multilingual, high accuracy |
| Qwen3-8B | 16 GB | 4096 | Slowest | Highest | 32GB+ GPU, research, max quality |
Note: Models are downloaded on first use (lazy loading). Qwen3 models require GPU for reasonable performance.
Approach 2: LLM Labeling (Future Enhancement)¶
TEXT Column → Sample → LLM Labels → Train Classifier → Apply to All
- Use when you need specific categorical labels (sentiment, topic, intent)
- More expensive but more interpretable
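A minimal sketch of how Approach 2 could look once implemented. Everything here is hypothetical: the hard-coded `sample_labels` stand in for labels an LLM would assign to a small sample, and a TF-IDF plus logistic-regression classifier stands in for whatever model would generalize those labels to the full column.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# 1) Sample a few rows and (in reality) ask an LLM to label them.
sample_texts = [
    "Please cancel my subscription immediately",
    "Love the new dashboard, great work",
    "I want a refund, this is broken",
    "Support resolved my issue quickly, thanks",
]
sample_labels = ["churn_risk", "satisfied", "churn_risk", "satisfied"]  # pretend LLM output

# 2) Train a cheap classifier on the labeled sample.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sample_texts)
clf = LogisticRegression().fit(X, sample_labels)

# 3) Apply it to every row of the TEXT column.
predictions = clf.predict(vectorizer.transform(["cancel and refund please"]))
```

The result is one interpretable categorical feature per column, at the cost of LLM labeling calls up front.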
4a.1 Load Previous Findings¶
In [1]:
Show/Hide Code
from customer_retention.analysis.notebook_progress import track_and_export_previous
track_and_export_previous("04a_text_columns_deep_dive.ipynb")
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata
from customer_retention.analysis.visualization import ChartBuilder, display_figure
from customer_retention.core.config.column_config import ColumnType
from customer_retention.core.config.experiments import (
EXPERIMENTS_DIR,
FINDINGS_DIR,
)
from customer_retention.stages.profiling import (
TextColumnProcessor,
TextProcessingConfig,
get_model_info,
list_available_models,
)
In [2]:
Show/Hide Code
from customer_retention.analysis.auto_explorer import load_notebook_findings
FINDINGS_PATH, _namespace, dataset_name = load_notebook_findings("04a_text_columns_deep_dive.ipynb")
print(f"Using: {FINDINGS_PATH}")
findings = ExplorationFindings.load(FINDINGS_PATH)
print(f"\nLoaded findings for {findings.column_count} columns from {findings.source_path}")
Using: /Users/Vital/python/CustomerRetention/experiments/runs/email-6301db6c/datasets/customer_emails/findings/customer_emails_findings.yaml
Loaded findings for 13 columns from ../tests/fixtures/customer_emails.csv
In [3]:
Show/Hide Code
# Identify TEXT columns
text_columns = [
name for name, col in findings.columns.items()
if col.inferred_type == ColumnType.TEXT
]
if not text_columns:
print("\u26a0\ufe0f No TEXT columns detected in this dataset.")
print(" This notebook is only needed when TEXT columns are present.")
print(" Continue to notebook 02_source_integrity.ipynb")
else:
print(f"\u2705 Found {len(text_columns)} TEXT column(s):")
for col in text_columns:
col_info = findings.columns[col]
print(f" - {col} (Confidence: {col_info.confidence:.0%})")
⚠️ No TEXT columns detected in this dataset.
   This notebook is only needed when TEXT columns are present.
   Continue to notebook 02_source_integrity.ipynb
4a.2 Load Source Data¶
In [4]:
Show/Hide Code
from customer_retention.analysis.auto_explorer.active_dataset_store import load_active_dataset
df = load_active_dataset(_namespace, dataset_name)
charts = ChartBuilder()
print(f"Loaded {len(df):,} rows x {len(df.columns)} columns")
print(f"Data source: {dataset_name}")
Loaded 83,198 rows x 13 columns
Data source: customer_emails
4a.3 Choose and Configure the Embedding Model¶
In [5]:
Show/Hide Code
# Display available embedding models
print("Available Embedding Models")
print("=" * 80)
print(f"{'Preset':<15} {'Model':<35} {'Size':<10} {'Dim':<8} {'GPU?'}")
print("-" * 80)
for preset in list_available_models():
info = get_model_info(preset)
size = f"{info['size_mb']} MB" if info['size_mb'] < 1000 else f"{info['size_mb']/1000:.1f} GB"
gpu = "Yes" if info['gpu_recommended'] else "No"
print(f"{preset:<15} {info['model_name']:<35} {size:<10} {info['embedding_dim']:<8} {gpu}")
print(f" {info['description']}")
print()
print("\nModels are downloaded on first use. Choose based on your hardware and quality needs.")
Available Embedding Models
================================================================================
Preset Model Size Dim GPU?
--------------------------------------------------------------------------------
minilm all-MiniLM-L6-v2 90 MB 384 No
Fast, lightweight model. Good for CPU and quick experimentation.
qwen3-0.6b Qwen/Qwen3-Embedding-0.6B 1.2 GB 1024 Yes
Higher quality embeddings, multilingual. Requires GPU for reasonable speed.
qwen3-4b Qwen/Qwen3-Embedding-4B 8.0 GB 2560 Yes
High quality, large model. Requires significant GPU memory (16GB+).
qwen3-8b Qwen/Qwen3-Embedding-8B 16.0 GB 4096 Yes
Highest quality, very large model. Requires 32GB+ GPU memory.
Models are downloaded on first use. Choose based on your hardware and quality needs.
In [6]:
Show/Hide Code
# === TEXT PROCESSING CONFIGURATION ===
# Choose your embedding model preset:
# "minilm" - Fast, CPU-friendly, good for exploration (default)
# "qwen3-0.6b" - Better quality, needs GPU
# "qwen3-4b" - High quality, needs 16GB+ GPU
# "qwen3-8b" - Highest quality, needs 32GB+ GPU
EMBEDDING_PRESET = "minilm" # Change this to try different models
# PCA configuration
VARIANCE_THRESHOLD = 0.95 # Keep components explaining 95% of variance
MIN_COMPONENTS = 2 # At least 2 features per text column
MAX_COMPONENTS = None # No upper limit (set to e.g., 20 to cap)
# Get model info and create config
model_info = get_model_info(EMBEDDING_PRESET)
config = TextProcessingConfig(
embedding_model=model_info["model_name"],
variance_threshold=VARIANCE_THRESHOLD,
max_components=MAX_COMPONENTS,
min_components=MIN_COMPONENTS,
batch_size=32
)
print("Text Processing Configuration")
print("=" * 50)
print(f" Preset: {EMBEDDING_PRESET}")
print(f" Model: {config.embedding_model}")
print(f" Model size: {model_info['size_mb']} MB")
print(f" Embedding dimension: {model_info['embedding_dim']}")
print(f" GPU recommended: {'Yes' if model_info['gpu_recommended'] else 'No'}")
print()
print(f" Variance threshold: {config.variance_threshold:.0%}")
print(f" Min components: {config.min_components}")
print(f" Max components: {config.max_components or 'unlimited'}")
if model_info['gpu_recommended']:
print()
print("Note: This model works best with GPU. Processing may be slow on CPU.")
Text Processing Configuration
==================================================
 Preset: minilm
 Model: all-MiniLM-L6-v2
 Model size: 90 MB
 Embedding dimension: 384
 GPU recommended: No

 Variance threshold: 95%
 Min components: 2
 Max components: unlimited
4a.4 Text Column Analysis¶
Before processing, let's understand each TEXT column.
In [7]:
Show/Hide Code
if text_columns:
for col_name in text_columns:
print(f"\n{'='*70}")
print(f"Column: {col_name}")
print(f"{'='*70}")
text_series = df[col_name].fillna("")
# Basic statistics
non_empty = (text_series.str.len() > 0).sum()
avg_length = text_series.str.len().mean()
max_length = text_series.str.len().max()
print("\n\U0001f4ca Statistics:")
print(f" Total rows: {len(text_series):,}")
print(f" Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)")
print(f" Avg length: {avg_length:.0f} characters")
print(f" Max length: {max_length:,} characters")
# Sample texts
print("\n\U0001f4dd Sample texts:")
samples = text_series[text_series.str.len() > 10].head(3)
for i, sample in enumerate(samples, 1):
truncated = sample[:100] + "..." if len(sample) > 100 else sample
print(f" {i}. {truncated}")
# Text length distribution
lengths = text_series.str.len()
fig = go.Figure()
fig.add_trace(go.Histogram(x=lengths[lengths > 0], nbinsx=50,
marker_color='steelblue', opacity=0.7))
fig.add_vline(x=lengths.median(), line_dash="solid", line_color="green",
annotation_text=f"Median: {lengths.median():.0f}")
fig.update_layout(
title=f"Text Length Distribution: {col_name}",
xaxis_title="Character Count",
yaxis_title="Frequency",
template="plotly_white",
height=350
)
display_figure(fig)
4a.5 Process Text Columns¶
This step:
- Generates embeddings using sentence-transformers
- Applies PCA to reduce dimensions
- Creates PC feature columns
In [8]:
Show/Hide Code
if text_columns:
processor = TextColumnProcessor(config)
print("Processing TEXT columns...")
print("(This may take a moment for large datasets)\n")
results = []
df_processed = df.copy()
for col_name in text_columns:
print(f"\n{'='*70}")
print(f"Processing: {col_name}")
print(f"{'='*70}")
df_processed, result = processor.process_column(df_processed, col_name)
results.append(result)
print("\n\u2705 Processing complete:")
print(f" Embedding shape: {result.embeddings_shape}")
print(f" Components kept: {result.n_components}")
print(f" Explained variance: {result.explained_variance:.1%}")
print(f" Features created: {', '.join(result.component_columns)}")
print(f"\n\n{'='*70}")
print("PROCESSING SUMMARY")
print(f"{'='*70}")
print(f"\nOriginal columns: {len(df.columns)}")
print(f"New columns added: {len(df_processed.columns) - len(df.columns)}")
print(f"Total columns: {len(df_processed.columns)}")
4a.6 Visualize Results¶
Understanding the PC features created from text embeddings.
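The cumulative-variance logic behind the plot in the next cell can be illustrated with a toy example (the ratios here are hypothetical, not this dataset's):

```python
import numpy as np

var_ratios = np.array([0.55, 0.25, 0.10, 0.06, 0.04])  # hypothetical per-component ratios
cumulative = np.cumsum(var_ratios)                      # approx [0.55, 0.80, 0.90, 0.96, 1.00]

# Smallest k whose cumulative variance reaches the 95% target:
k = int(np.searchsorted(cumulative, 0.95) + 1)
print(k)  # 4 components cover ~96% of the variance
```

The red dashed line in the cumulative-variance plot marks the same 95% target used here.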
In [9]:
Show/Hide Code
if text_columns and results:
for result in results:
print(f"\n{'='*70}")
print(f"Results: {result.column_name}")
print(f"{'='*70}")
# Explained variance per component
reducer = processor._reducers[result.column_name]
var_ratios = reducer._pca.explained_variance_ratio_
cumulative = np.cumsum(var_ratios)
fig = make_subplots(rows=1, cols=2,
subplot_titles=("Variance per Component", "Cumulative Variance"))
fig.add_trace(go.Bar(
x=[f"PC{i+1}" for i in range(len(var_ratios))],
y=var_ratios,
marker_color='steelblue'
), row=1, col=1)
fig.add_trace(go.Scatter(
x=[f"PC{i+1}" for i in range(len(cumulative))],
y=cumulative,
mode='lines+markers',
line_color='green'
), row=1, col=2)
fig.add_hline(y=config.variance_threshold, line_dash="dash", line_color="red",
annotation_text=f"Target: {config.variance_threshold:.0%}",
row=1, col=2)
fig.update_layout(
title=f"PCA Results: {result.column_name}",
height=400,
template="plotly_white",
showlegend=False
)
fig.update_yaxes(title_text="Variance Ratio", row=1, col=1)
fig.update_yaxes(title_text="Cumulative Variance", row=1, col=2)
display_figure(fig)
# PC feature distributions
if len(result.component_columns) >= 2:
fig = px.scatter(
df_processed,
x=result.component_columns[0],
y=result.component_columns[1],
title=f"PC1 vs PC2: {result.column_name}",
opacity=0.5
)
fig.update_layout(template="plotly_white", height=400)
display_figure(fig)
4a.7 Update Findings with Text Processing Metadata¶
In [10]:
Show/Hide Code
if text_columns and results:
for result in results:
metadata = TextProcessingMetadata(
column_name=result.column_name,
embedding_model=config.embedding_model,
embedding_dim=result.embeddings_shape[1],
n_components=result.n_components,
explained_variance=result.explained_variance,
component_columns=result.component_columns,
variance_threshold_used=config.variance_threshold,
processing_approach="pca"
)
findings.text_processing[result.column_name] = metadata
print(f"\u2705 Added metadata for {result.column_name}:")
print(f" Model: {metadata.embedding_model}")
print(f" Components: {metadata.n_components}")
print(f" Explained variance: {metadata.explained_variance:.1%}")
findings.save(FINDINGS_PATH)
print(f"\nFindings saved to: {FINDINGS_PATH}")
from pathlib import Path

from customer_retention.analysis.notebook_html_exporter import export_notebook_html
export_notebook_html(Path("04a_text_columns_deep_dive.ipynb"), EXPERIMENTS_DIR / "docs")
4a.8 Generate Recommendations¶
In [ ]:
Show/Hide Code
if text_columns and results:
print("\n" + "="*70)
print("PRODUCTION RECOMMENDATIONS")
print("="*70)
for result in results:
print(f"\n\U0001f527 {result.column_name}:")
print(" Action: embed_reduce (embeddings + PCA)")
print(f" Model: {config.embedding_model}")
print(f" Variance threshold: {config.variance_threshold:.0%}")
print(f" Expected features: {result.n_components}")
print(f" Feature names: {', '.join(result.component_columns[:3])}...")
print("\n\U0001f4a1 These recommendations will be used by the pipeline generator.")
print(" The same processing will be applied in production.")
Summary¶
In this notebook, we:
- Analyzed TEXT columns for length and content patterns
- Generated embeddings using sentence-transformers
- Applied PCA to reduce dimensions while preserving variance
- Created numeric features (pc1, pc2, ...) for downstream ML
- Updated findings with processing metadata
Key Results¶
| Column | Components | Explained Variance |
|---|---|---|
| (Filled by execution) | | |
Next Steps¶
Continue to 02_source_integrity.ipynb to:
- Analyze duplicate records and value conflicts
- Deep dive into missing value patterns
- Analyze outliers with IQR method
- Get cleaning recommendations
Save Reminder: Save this notebook (Ctrl+S / Cmd+S) before running the next one. The next notebook will automatically export this notebook's HTML documentation from the saved file.