import os

# Thread / tokenizer parallelism limits are exported before the numeric and
# NLP libraries below are imported.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import streamlit as st
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from umap import UMAP
from hdbscan import HDBSCAN
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

warnings.filterwarnings("ignore")

# ==========================================
# 1. PAGE CONFIGURATION & MAPPINGS
# ==========================================
st.set_page_config(
    page_title="Topic Modeling Pipeline",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Placed right below set_page_config; intended to stop the screen from jumping
# left/right.
# NOTE(review): the markup body is blank in this copy of the file, so the call
# currently injects nothing -- confirm whether a <style> block was lost.
st.markdown(""" """, unsafe_allow_html=True)

# UI label -> Hugging Face model identifier.
EMBEDDING_MAP = {
    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
    "Specter2 (Scientific/Academic)": "allenai/specter2_base",
}

# UI label -> pooling key consumed by load_embedder().
POOLING_MAP = {
    "Mean (Smooth context)": "mean",
    "Max (Sharp keywords)": "max",
    "CLS (Classification)": "cls",
    "Mean-Max (Combined)": "mean-max",
}


# --- CACHE THE NEURAL NETWORK ---
@st.cache_resource
def load_embedder(model_name, pool_strat):
    """Assemble a sentence encoder: transformer -> pooling -> L2 normalisation.

    Args:
        model_name: Model identifier handed to ``models.Transformer``.
        pool_strat: Pooling key. Each pooling mode is switched on when its
            name ("mean", "max", "cls") occurs as a substring, so "mean-max"
            enables both mean and max pooling.

    Returns:
        A ``SentenceTransformer`` whose last module is ``models.Normalize()``,
        so every output vector is L2-normalised.
    """
    transformer = models.Transformer(model_name)
    pooling_flags = {
        "pooling_mode_mean_tokens": "mean" in pool_strat,
        "pooling_mode_max_tokens": "max" in pool_strat,
        "pooling_mode_cls_token": "cls" in pool_strat,
    }
    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        **pooling_flags,
    )
    # The Normalize module is always appended so downstream distance math
    # operates on unit-length vectors.
    pipeline = [transformer, pooling, models.Normalize()]
    return SentenceTransformer(modules=pipeline)
# ==========================================
# 2. THE GUIDED UI
# ==========================================
st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

# Show the optional banner only when the asset is present.
# NOTE(review): newer Streamlit releases appear to raise a media-storage error
# rather than FileNotFoundError for a missing image, so the file is checked
# up front instead of relying on the exception type -- confirm.
if os.path.isfile("pipeline.png"):
    st.image("pipeline.png", use_container_width=True)

st.divider()
st.header("📥 Step 0: Input Data & Core Settings")
st.info("💡 **Preprocessing Note:** You do not need to manually lowercase or strip punctuation. The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing and token normalization autonomously at the mathematical level.")

data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

sample_abstract = """
Students who registered for the Mapping with Google massive open online course (MOOC) were asked several questions during the registration process to identify prior experience with eleven skills as well as their goals for registering for the course. At the end of the course, we compared students' self reports of goal achievement with behavioral click-stream analysis. In addition, we assessed how well prior skill in a subject predicts a student's course completion and found no correlation. Our research shows that students who completed course activities were more likely to earn certificates of completion than peers who did not.
"""

raw_data = st.text_area(
    "Text Data:",
    value=sample_abstract if data_source == "Use Sample ACM Abstract" else "",
    height=150,
)

col_a, col_b = st.columns(2)
with col_a:
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with col_b:
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)

# --- THE VERTICAL CONFIGURATION WIZARD ---
st.header("⚙️ Model Configuration")

with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP))
    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP))

with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" in ui_algo:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
    else:
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
    # Raw string: "\g" in "$\ge$" is not a valid escape in a normal literal.
    st.markdown(r"""
    *The model mathematically draws boundaries around similar sentences.*
    * **Primary clustering algorithm (HDBSCAN):** Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. *(Defaults: min_cluster_size=10)*
    * **Fallback clustering algorithm (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.
    """)

with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `Unigrams` (1 word) and does **not** filter redundant dataset noise.*")
    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
    auto_noise = st.checkbox("Auto-Remove Redundant Noise (max_df)", value=True, help="Mathematically deletes words appearing in >85% of documents.")

with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** ALWAYS extracts baseline words using **c-TF-IDF** (Word Frequency).*")
    ui_extraction = st.selectbox(
        "Apply Advanced Filter on top of c-TF-IDF:",
        ["None (Base c-TF-IDF only)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"],
    )
    if "MMR" in ui_extraction:
        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
    else:
        mmr_diversity = None

st.header("📊 Evaluation Metrics")
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
    default=["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
)
st.divider()

# Below this many sentences the pipeline uses PCA + K-Means instead of
# UMAP + HDBSCAN (matches the threshold advertised in the Clustering expander).
MIN_HDBSCAN_DOCS = 15


def _coherence_status(measure, topic_words, tokenized_docs, dictionary):
    """Return the display string for one gensim coherence measure.

    Each measure is evaluated on its own so that a failure in one of them
    cannot overwrite the result of the other.

    Args:
        measure: gensim coherence name (``'c_npmi'`` or ``'u_mass'``).
        topic_words: List of word lists, one per theme.
        tokenized_docs: Tokenised sentences, or ``None`` if tokenising failed.
        dictionary: gensim ``Dictionary`` built from ``tokenized_docs``, or
            ``None`` if it could not be built.

    Returns:
        The score formatted to two decimals, ``"N/A (Too few words)"`` when
        gensim yields NaN, or ``"Skipped (Data too small)"`` when the model
        cannot be evaluated.
    """
    if dictionary is None or not topic_words:
        return "Skipped (Data too small)"
    try:
        score = CoherenceModel(
            topics=topic_words,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence=measure,
        ).get_coherence()
    except Exception:
        # Deliberate best-effort: the metric is optional, so any gensim
        # failure degrades to a "Skipped" card instead of aborting the report.
        return "Skipped (Data too small)"
    if np.isnan(score):
        return "N/A (Too few words)"
    return f"{float(score):.2f}"


# ==========================================
# 3. ENGINE EXECUTION
# ==========================================
if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    # A "document" is one period-delimited fragment longer than 10 characters.
    sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
    dataset_size = len(sentences)
    if dataset_size < 2:
        # PCA(n_components=2) and clustering both need at least two documents.
        st.error("Please provide more text data!")
        st.stop()

    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):
        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        stop_w = list(ENGLISH_STOP_WORDS.union(academic_noise))
        vectorizer_model = CountVectorizer(
            stop_words=stop_w,
            ngram_range=ngram_range,
            max_df=0.85 if auto_noise and dataset_size > 10 else 1.0,
        )

        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)

        is_fallback = dataset_size < MIN_HDBSCAN_DOCS or "PCA" in ui_algo
        if is_fallback:
            # K-Means cannot produce more clusters than there are sentences.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            reduce_topics = None
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            safe_n_themes = n_themes
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Bound to the same name the BERTopic constructor below reads;
            # the previous `clustering_model` name left `cluster_model`
            # undefined on this branch.
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english",
        )
        # Hand over the embeddings computed above so the sentences are not
        # encoded a second time inside fit_transform.
        topics, _ = topic_model.fit_transform(sentences, embeddings=embeddings)

    # ==========================================
    # 4. UI DISPLAY & METRICS
    # ==========================================
    st.success("Analysis Complete!")

    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*Reason:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif dataset_size < MIN_HDBSCAN_DOCS:
            # Only claim an automatic switch when dataset size caused it, not
            # when PCA was explicitly selected on a large dataset.
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")

    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    all_words = []
    cols = st.columns(3)
    shown = 0
    for t_id in topic_info['Topic']:
        if t_id == -1:  # -1 is the outlier bucket, not a theme
            continue
        # NOTE(review): BERTopic may pad short topics with empty-string
        # placeholders; drop them so they do not reach the metrics -- confirm
        # against the installed version.
        theme_w = [word for word, _ in topic_model.get_topic(t_id) if word]
        if not theme_w:
            continue
        all_words.append(theme_w)
        with cols[shown % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))
        shown += 1

    # --- METRICS CALCULATIONS ---
    div_status = npmi_status = umass_status = sil_status = "Skipped"

    if eval_metrics:
        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):
            # 1. Diversity: unique words / total words across all themes.
            if "Topic Diversity" in eval_metrics and all_words:
                total_words = sum(len(t) for t in all_words)
                unique_words = {w for t in all_words for w in t}
                div_val = len(unique_words) / total_words if total_words > 0 else 0.0
                div_status = f"{div_val:.2f}"

            # 2. Coherence Models (NPMI & UMass)
            wants_npmi = "NPMI Coherence" in eval_metrics
            wants_umass = "UMass Coherence" in eval_metrics
            if wants_npmi or wants_umass:
                try:
                    analyzer = vectorizer_model.build_analyzer()
                    tokenized = [analyzer(s) for s in sentences]
                    dictionary = corpora.Dictionary(tokenized)
                except Exception:
                    # Best-effort: fall through to "Skipped" statuses.
                    tokenized, dictionary = None, None
                if wants_npmi:
                    npmi_status = _coherence_status('c_npmi', all_words, tokenized, dictionary)
                if wants_umass:
                    umass_status = _coherence_status('u_mass', all_words, tokenized, dictionary)

            # 3. Silhouette Score (outliers excluded)
            if "Silhouette Score" in eval_metrics:
                valid_idx = [i for i, t in enumerate(topics) if t != -1]
                valid_labels = [topics[i] for i in valid_idx]
                # Needs >= 2 clusters and fewer clusters than samples.
                if 1 < len(set(valid_labels)) < len(valid_idx):
                    sil_val = float(silhouette_score(
                        np.asarray(embeddings)[valid_idx],
                        valid_labels,
                        metric='cosine',
                    ))
                    sil_status = f"{sil_val:.2f}"
                else:
                    sil_status = "Skipped (Themes need ≥2 sentences)"

        # --- RENDER KPI DASHBOARD WITH TOOLTIPS ---
        # Rendered only when at least one KPI is selected: st.columns()
        # needs a positive column count.
        st.markdown("### 📊 Key Performance Indicators (KPI)")
        kpi_cards = {
            "Topic Diversity": (
                div_status,
                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
            ),
            "NPMI Coherence": (
                npmi_status,
                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
            ),
            "UMass Coherence": (
                umass_status,
                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
            ),
            "Silhouette Score": (
                sil_status,
                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
            ),
        }
        kpi_cols = st.columns(len(eval_metrics))
        for kpi_col, metric in zip(kpi_cols, eval_metrics):
            card_value, card_help = kpi_cards[metric]
            with kpi_col:
                st.metric(label=metric, value=card_value, help=card_help)

    # ==========================================
    # 5. XAI VISUALIZATION GRAPH (With Live Math & Matrices)
    # ==========================================
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")
    with st.spinner("Rendering Mathematical Dashboard..."):
        sns.set_theme(style="whitegrid")
        fig = plt.figure(figsize=(18, 16))

        # First word of each UI label is used in the titles.
        pool_title = ui_pooling.split()[0]
        rep_title = ui_extraction.split()[0]
        fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})",
                     fontsize=20, fontweight='bold', y=0.98)

        # Style for the Math/Data boxes
        box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

        # --------------------------------------------------
        # 1. Embeddings & Pooling
        # --------------------------------------------------
        ax1 = fig.add_subplot(3, 2, 1)
        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
        ax1.set_ylabel("Sentences (Docs)")
        ax1.set_xlabel("Vector Dimensions (First 50 shown)")

        emb_shape = embeddings.shape
        emb_sample = np.round(embeddings[0, :5], 3).tolist()  # First 5 numbers of Doc 1
        math_text_1 = (
            r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
            f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
            f"Doc 1 [Dims 1-5]: {emb_sample}..."
        )
        ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top',
                 transform=ax1.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 2. Geometry (Dimensionality Reduction)
        # --------------------------------------------------
        ax2 = fig.add_subplot(3, 2, 2)
        reduced_embeddings = topic_model.umap_model.transform(embeddings)
        # Only the first two reduced dimensions are plotted.
        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1],
                    c='grey', s=100, alpha=0.6, edgecolor='k')
        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

        red_shape = reduced_embeddings.shape
        red_sample = np.round(reduced_embeddings[0, :2], 3).tolist()  # X, Y coord of Doc 1
        if algo_used == "PCA":
            eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$"
        else:
            eq_2 = r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
        math_text_2 = (
            f"{eq_2}\n"
            # Dimension count comes from the actual matrix (2 for PCA, 5 for UMAP).
            f"Matrix Shape: {red_shape} (Docs x {red_shape[1]}D Coordinates)\n"
            f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
        )
        ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top',
                 transform=ax2.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 3. Clustering
        # --------------------------------------------------
        ax3 = fig.add_subplot(3, 2, 3)
        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1],
                    c=topics, cmap='tab10', s=150, edgecolor='k')
        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')

        topic_sample = topics[:5]  # Grabs up to the first 5
        if cluster_algo == "K-Means":
            eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum ||x_i - \mu_c||^2$"
        else:
            eq_3 = r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
        math_text_3 = (
            f"{eq_3}\n"
            f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
            f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
        )
        ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top',
                 transform=ax3.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 4. Representation
        # --------------------------------------------------
        ax4 = fig.add_subplot(3, 2, 4)
        # First non-outlier topic, with empty placeholder words removed.
        valid_topics = [t for t in topic_model.get_topics() if t != -1]
        if valid_topics:
            theme_1_data = [(w, s) for w, s in topic_model.get_topic(valid_topics[0]) if w]
        else:
            theme_1_data = []

        if theme_1_data:
            # Reversed so the highest-scoring word ends up at the top of the bars.
            words = [w for w, _ in theme_1_data][::-1]
            scores = [s for _, s in theme_1_data][::-1]
            ax4.barh(words, scores, color='coral', edgecolor='black')
            ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')

            top_word_score = round(scores[-1], 4)
            vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'
            math_text_4 = (
                r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
                f"Global Vocab Extracted: {vocab_len} terms\n"
                f"Top Word ('{words[-1]}') Score: {top_word_score}"
            )
            ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top',
                     transform=ax4.transAxes, bbox=box_style)
        else:
            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

        # --------------------------------------------------
        # 5. KPI Dashboard (Updated with UMass)
        # --------------------------------------------------
        ax5 = fig.add_subplot(3, 2, 5)
        ax5.axis('off')
        ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

        kpi_math = "\n\n".join([
            r"$\mathbf{Diversity:} \quad D = \frac{| \text{Unique} |}{| \text{Total} |}$" + f" [Live: {div_status}]",
            r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f" [Live: {sil_status}]",
            r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f" [Live: {npmi_status}]",
            r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f" [Live: {umass_status}]",
        ])
        ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center',
                 bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))
        ax5.text(0.5, -0.15,
                 "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 6. Summary Matrix Transformations
        # --------------------------------------------------
        ax6 = fig.add_subplot(3, 2, 6)
        ax6.axis('off')
        summary_text = (
            "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
            f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
            f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
            f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
            f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
            f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"
            "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
        )
        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left',
                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

        fig.subplots_adjust(hspace=0.7, wspace=0.3)
        st.pyplot(fig, use_container_width=True)
        # pyplot keeps a reference to every figure created via plt.figure();
        # close it so repeated runs do not accumulate open figures.
        plt.close(fig)