Spaces:

saherPervaiz
/

ModelTrain

Running

App Files Community

saherPervaiz committed on Jan 13

Commit

f6cff1e

verified ·

1 Parent(s): 3d3a6dd

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -176

app.py CHANGED Viewed

@@ -1,198 +1,34 @@
-import streamlit as st
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
-from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-from sklearn.linear_model import LogisticRegression, LinearRegression
-from sklearn.svm import SVC, SVR
-from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
-from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from sklearn.naive_bayes import GaussianNB
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
-from io import BytesIO
-# File uploader
-st.title("Model Training with Metrics and Correlation Heatmap")
-uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-if uploaded_file is not None:
-    df = pd.read_csv(uploaded_file)
-    # Show the dataset
-    st.write("Dataset:")
-    st.dataframe(df)
-    # Convert categorical (str) data to numerical
-    st.write("Converting Categorical Columns to Numerical Values:")
-    label_encoder = LabelEncoder()
-    for col in df.columns:
-        if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
-            st.write(f"Encoding Column: **{col}**")
-            df[col] = label_encoder.fit_transform(df[col])
-    # Display the dataset after conversion
-    st.write("Dataset After Conversion:")
-    st.dataframe(df)
-    # Handle Null Values (Missing Data)
-    st.write("Handling Missing (Null) Values:")
-    fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
-    if fill_method == "Drop rows":
-        df = df.dropna()
-    elif fill_method == "Fill with mean/median":
-        for col in df.columns:
-            if df[col].dtype in ['float64', 'int64']:
-                df[col].fillna(df[col].mean(), inplace=True)
-            else:
-                df[col].fillna(df[col].mode()[0], inplace=True)
-    # Handle Outliers using IQR method
-    st.write("Handling Outliers:")
-    def remove_outliers_iqr(dataframe):
-        Q1 = dataframe.quantile(0.25)
-        Q3 = dataframe.quantile(0.75)
-        IQR = Q3 - Q1
-        return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
-    df = remove_outliers_iqr(df)
-    # Cap Extreme Values
-    st.write("Handling Extreme Values (Capping):")
-    def cap_extreme_values(dataframe):
-        for col in dataframe.select_dtypes(include=[np.number]).columns:
-            lower_limit = dataframe[col].quantile(0.05)
-            upper_limit = dataframe[col].quantile(0.95)
-            dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
-        return dataframe
-    df = cap_extreme_values(df)
-    # Show cleaned dataset
-    st.write("Cleaned Dataset:")
-    st.dataframe(df)
-    # Add clean data download option
-    st.subheader("Download Cleaned Dataset")
-    st.download_button(
-        label="Download Cleaned Dataset (CSV)",
-        data=df.to_csv(index=False),
-        file_name="cleaned_dataset.csv",
-        mime="text/csv"
-    )
-    # Correlation Heatmap
-    st.subheader("Correlation Heatmap")
-    corr = df.corr()
-    plt.figure(figsize=(10, 8))
-    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
-    st.pyplot(plt)
-    # Save heatmap as PNG
-    buf = BytesIO()
-    plt.savefig(buf, format="png")
-    buf.seek(0)
-    st.download_button(
-        label="Download Correlation Heatmap as PNG",
-        data=buf,
-        file_name="correlation_heatmap.png",
-        mime="image/png"
-    )
-    # Highlight highly correlated pairs
-    st.subheader("Highly Correlated Features")
-    high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
-    high_corr = high_corr[high_corr >= 0.8]
-    high_corr_df = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
-    st.write(high_corr_df)
-    target = st.selectbox("Select Target Variable", df.columns)
-    features = [col for col in df.columns if col != target]
-    X = df[features]
-    y = df[target]
     if y.dtype == 'object' or len(y.unique()) <= 10:  # Categorical target (classification)
-        st.subheader("Classification Model Training")
-        classifiers = {
-            'Logistic Regression': LogisticRegression(max_iter=5000, solver='saga', penalty='l1'),
-            'Decision Tree': DecisionTreeClassifier(),
-            'Random Forest': RandomForestClassifier(),
-            'Support Vector Machine (SVM)': SVC(),
-            'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
-            'Naive Bayes': GaussianNB()
-        }
-        metrics = []
-        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
-        for name, classifier in classifiers.items():
-            classifier.fit(X_train, y_train)
-            y_pred = classifier.predict(X_test)
-            metrics.append({
-                'Model': name,
-                'Accuracy': round(accuracy_score(y_test, y_pred), 2),
-                'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
-                'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
-                'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
-            })
-        metrics_df = pd.DataFrame(metrics)
-        st.subheader("Classification Model Performance Metrics")
-        st.dataframe(metrics_df)
-        # Save metrics as PNG
-        fig, ax = plt.subplots()
         sns.barplot(data=metrics_df, x="Model", y="Accuracy", ax=ax)
         ax.set_title("Classification Model Performance")
         buf = BytesIO()
         fig.savefig(buf, format="png")
         buf.seek(0)
         st.download_button(
             label="Download Classification Report as PNG",
             data=buf,
             file_name="classification_report.png",
             mime="image/png"
         )
     else:  # Continuous target (regression)
-        st.subheader("Regression Model Training")
-        regressors = {
-            'Linear Regression': LinearRegression(),
-            'Decision Tree Regressor': DecisionTreeRegressor(),
-            'Random Forest Regressor': RandomForestRegressor(),
-            'Support Vector Regressor (SVR)': SVR(),
-            'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
-        }
-        regression_metrics = []
-        train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-train_size, random_state=42)
-        for name, regressor in regressors.items():
-            regressor.fit(X_train, y_train)
-            y_pred = regressor.predict(X_test)
-            regression_metrics.append({
-                'Model': name,
-                'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
-                'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
-                'R² Score': round(r2_score(y_test, y_pred), 2)
-            })
-        regression_metrics_df = pd.DataFrame(regression_metrics)
-        st.subheader("Regression Model Performance Metrics")
-        st.dataframe(regression_metrics_df)
-        # Save metrics as PNG
-        fig, ax = plt.subplots()
         sns.barplot(data=regression_metrics_df, x="Model", y="R² Score", ax=ax)
         ax.set_title("Regression Model Performance")
         buf = BytesIO()
         fig.savefig(buf, format="png")
         buf.seek(0)
         st.download_button(
             label="Download Regression Report as PNG",
             data=buf,

+# After generating the metrics (classification or regression)
+# Add a button to generate the performance report as an image
+generate_report_button = st.button("Generate Performance Report as Image")
+if generate_report_button:
     if y.dtype == 'object' or len(y.unique()) <= 10:  # Categorical target (classification)
+        fig, ax = plt.subplots(figsize=(10, 6))
         sns.barplot(data=metrics_df, x="Model", y="Accuracy", ax=ax)
         ax.set_title("Classification Model Performance")
+        # Save the classification report as PNG
         buf = BytesIO()
         fig.savefig(buf, format="png")
         buf.seek(0)
         st.download_button(
             label="Download Classification Report as PNG",
             data=buf,
             file_name="classification_report.png",
             mime="image/png"
         )
     else:  # Continuous target (regression)
+        fig, ax = plt.subplots(figsize=(10, 6))
         sns.barplot(data=regression_metrics_df, x="Model", y="R² Score", ax=ax)
         ax.set_title("Regression Model Performance")
+        # Save the regression report as PNG
         buf = BytesIO()
         fig.savefig(buf, format="png")
         buf.seek(0)
         st.download_button(
             label="Download Regression Report as PNG",
             data=buf,