Spaces:

saherPervaiz
/

ModelTrain

Sleeping

App Files Files Community

saherPervaiz committed on Jan 12

Commit

9d542da

verified ·

1 Parent(s): c002b05

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -66

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import streamlit as st
 import pandas as pd
-import matplotlib.pyplot as plt
-import io
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from sklearn.ensemble import RandomForestClassifier
@@ -11,40 +9,9 @@ from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-from tabulate import tabulate
-# Function to convert DataFrame to Excel format
-def to_excel(df):
-    output = io.BytesIO()
-    with pd.ExcelWriter(output, engine='openpyxl') as writer:
-        df.to_excel(writer, index=False, sheet_name='Cleaned Dataset')
-    output.seek(0)
-    return output
-# Function to save table as PNG with bold headings
-def save_table_as_png(df):
-    fig, ax = plt.subplots(figsize=(8, 6))
-    ax.axis('tight')
-    ax.axis('off')
-    # Create a table from the DataFrame
-    table = ax.table(cellText=df.values, colLabels=df.columns, loc='center', cellLoc='center')
-    # Set the font size and bold the header row
-    table.auto_set_font_size(False)
-    table.set_fontsize(10)
-    table.scale(1.2, 1.2)
-    # Bold the column headers
-    for (i, j) in zip(range(len(df.columns)), table[0]):
-        table[0, j].set_text_props(weight='bold')  # Make column headers bold
-    # Save the table as a PNG image
-    img_path = "/tmp/model_report.png"
-    plt.savefig(img_path, format="png", bbox_inches="tight")
-    plt.close(fig)
-    return img_path
 # File uploader
 st.title("Model Training with Metrics")
@@ -62,13 +29,65 @@ if uploaded_file is not None:
     if df.empty:
         st.warning("The dataset is empty. Please upload a valid CSV file.")
     else:
         target = st.selectbox("Select Target Variable", df.columns)
         features = [col for col in df.columns if col != target]
         X = df[features]
         y = df[target]
-        # Determine if the target is continuous or categorical
-        is_classification = y.dtype == 'object' or len(y.unique()) <= 10  # If target is categorical or has few unique values, treat as classification
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
@@ -116,45 +135,101 @@ if uploaded_file is not None:
             # Create a metrics DataFrame
             metrics_df = pd.DataFrame(metrics)
-            # Add bold formatting to the headers for tabulate
-            bold_headers = [f"\033[1m{header}\033[0m" for header in metrics_df.columns]
-            # Format table with tabulate
-            table = tabulate(
-                metrics_df,
-                headers=bold_headers,
-                tablefmt="fancy_grid",
-                showindex=False,
-                numalign="center",
-                stralign="center"
-            )
-            # Display results in Streamlit
             st.subheader("Model Performance Metrics")
-            st.markdown(f"**Model Performance Metrics**")
-            st.text(table)
-            # Option to download the model performance metrics (Results Table)
             st.download_button(
-                label="Download Model Report (Excel)",
-                data=to_excel(metrics_df),  # The metrics dataframe
                 file_name="model_report.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
-            # Option to download the cleaned dataset
             st.download_button(
-                label="Download Cleaned Dataset (Excel)",
-                data=to_excel(df),  # The cleaned dataset is 'df'
-                file_name="cleaned_dataset.xlsx",
-                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
-            # Option to download the report as PNG
-            img_path = save_table_as_png(metrics_df)
-            with open(img_path, "rb") as file:
                 st.download_button(
-                    label="Download Model Report (PNG)",
                     data=file,
                     file_name="model_report.png",
                     mime="image/png"

 import streamlit as st
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 # File uploader
 st.title("Model Training with Metrics")
     if df.empty:
         st.warning("The dataset is empty. Please upload a valid CSV file.")
     else:
+        # Handle Null Values (Missing Data)
+        st.write("Handling Missing (Null) Values:")
+        # Option to drop rows with null values or fill them
+        fill_method = st.selectbox("Choose how to handle missing values", ["Drop rows", "Fill with mean/median"])
+        if fill_method == "Drop rows":
+            df = df.dropna()
+        elif fill_method == "Fill with mean/median":
+            for col in df.columns:
+                if df[col].dtype in ['float64', 'int64']:
+                    df[col].fillna(df[col].mean(), inplace=True)  # For numeric columns, fill with mean
+                else:
+                    df[col].fillna(df[col].mode()[0], inplace=True)  # For categorical columns, fill with mode
+        # Handle Outliers using IQR method
+        st.write("Handling Outliers:")
+        # Define function to remove outliers using IQR
+        def remove_outliers_iqr(dataframe):
+            Q1 = dataframe.quantile(0.25)
+            Q3 = dataframe.quantile(0.75)
+            IQR = Q3 - Q1
+            # Filter out rows that are outside the IQR range
+            return dataframe[~((dataframe < (Q1 - 1.5 * IQR)) | (dataframe > (Q3 + 1.5 * IQR))).any(axis=1)]
+        # Remove outliers from the numerical columns
+        df = remove_outliers_iqr(df)
+        # Handle Extreme Values by Capping (Winsorization)
+        st.write("Handling Extreme Values (Capping):")
+        def cap_extreme_values(dataframe):
+            for col in dataframe.select_dtypes(include=[np.number]).columns:
+                # Define the thresholds for extreme values (95th percentile and 5th percentile)
+                lower_limit = dataframe[col].quantile(0.05)
+                upper_limit = dataframe[col].quantile(0.95)
+                # Cap the extreme values
+                dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
+            return dataframe
+        df = cap_extreme_values(df)
+        # Show cleaned dataset
+        st.write("Cleaned Dataset:")
+        st.dataframe(df)
         target = st.selectbox("Select Target Variable", df.columns)
         features = [col for col in df.columns if col != target]
         X = df[features]
         y = df[target]
+        # Label Encoding for categorical columns
+        label_encoder = LabelEncoder()
+        # Encode the target variable (if it's categorical)
+        if y.dtype == 'object' or len(y.unique()) <= 10:  # If the target variable is categorical
+            y = label_encoder.fit_transform(y)
+        # Encode categorical feature columns (if any)
+        for col in X.columns:
+            if X[col].dtype == 'object' or len(X[col].unique()) <= 10:  # If the column is categorical
+                X[col] = label_encoder.fit_transform(X[col])
         # Ensure there is enough data before proceeding with train-test split
         if len(X) == 0 or len(y) == 0:
             # Create a metrics DataFrame
             metrics_df = pd.DataFrame(metrics)
+            # Display results in a table using st.dataframe
             st.subheader("Model Performance Metrics")
+            st.dataframe(metrics_df)
+            # Download options
+            st.subheader("Download Model Performance Report in Different Formats")
+            # CSV
             st.download_button(
+                label="Download as CSV",
+                data=metrics_df.to_csv(index=False),
+                file_name="model_report.csv",
+                mime="text/csv"
+            )
+            # Excel
+            st.download_button(
+                label="Download as Excel",
+                data=metrics_df.to_excel(index=False, engine='openpyxl'),
                 file_name="model_report.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
+            # JSON
             st.download_button(
+                label="Download as JSON",
+                data=metrics_df.to_json(orient='records'),
+                file_name="model_report.json",
+                mime="application/json"
             )
+            # PDF (using `fpdf` library)
+            from fpdf import FPDF
+            def generate_pdf(df):
+                pdf = FPDF()
+                pdf.add_page()
+                pdf.set_font("Arial", size=12)
+                pdf.cell(200, 10, txt="Model Performance Report", ln=True, align="C")
+                pdf.ln(10)
+                # Add table header
+                pdf.set_font("Arial", style='B', size=10)
+                for header in df.columns:
+                    pdf.cell(40, 10, header, border=1)
+                pdf.ln()
+                # Add table rows
+                pdf.set_font("Arial", size=10)
+                for row in df.values:
+                    for value in row:
+                        pdf.cell(40, 10, str(value), border=1)
+                    pdf.ln()
+                return pdf.output(dest='S').encode('latin1')
+            # PDF download
+            st.download_button(
+                label="Download as PDF",
+                data=generate_pdf(metrics_df),
+                file_name="model_report.pdf",
+                mime="application/pdf"
+            )
+            # Option to download the dataset
+            st.download_button(
+                label="Download Dataset",
+                data=df.to_csv(index=False),
+                file_name="dataset.csv",
+                mime="text/csv"
+            )
+            # Generate and download PNG report
+            st.subheader("Download Report as PNG")
+            # Create table plot using matplotlib
+            fig, ax = plt.subplots(figsize=(12, 4))  # Adjust the figure size to match the table's layout
+            ax.axis('tight')
+            ax.axis('off')
+            table_data = metrics_df.values
+            table_columns = metrics_df.columns.tolist()
+            table = ax.table(cellText=table_data, colLabels=table_columns, loc='center', cellLoc='center', colLoc='center')
+            table.auto_set_font_size(False)
+            table.set_fontsize(10)
+            table.scale(1.2, 1.2)  # Adjust the scale for better appearance
+            # Save the table as a PNG file
+            png_file = "model_report.png"
+            fig.savefig(png_file, bbox_inches='tight', dpi=300)
+            # Provide a download button for the PNG file
+            with open(png_file, "rb") as file:
                 st.download_button(
+                    label="Download as PNG",
                     data=file,
                     file_name="model_report.png",
                     mime="image/png"