Spaces:

OpenCaptchaWorld
/

captchav2_prototype

Running

File size: 55,560 Bytes

# leaderboard/app.py
import pandas as pd, numpy as np, matplotlib.pyplot as plt, gradio as gr
import pathlib
import json
import csv

CATEGORY_MAP = {
    "Overall": ["Overall Pass Rate"],
    # You can define sets, e.g. "Vision-hard": ["Squiggle", "Shadow_Plausible"]
}

def get_results_path():
    """Get the path to results.csv, resolving relative to this file's location."""
    this_file = pathlib.Path(__file__).resolve()
    results_path = this_file.parent / "results.csv"
    return results_path

def get_runs_path():
    """Get the path to runs directory, resolving relative to this file's location."""
    this_file = pathlib.Path(__file__).resolve()
    runs_path = this_file.parent / "runs"
    runs_path.mkdir(parents=True, exist_ok=True)
    return runs_path

def infer_type(row):
    """Infer model type (Proprietary/Open source) from Provider or Model name."""
    provider = str(row.get("Provider", "")).lower()
    model = str(row.get("Model", "")).lower()
    
    # Open source indicators
    open_source_keywords = [
        "llama", "mistral", "qwen", "phi", "gemma", "falcon", "mpt", 
        "vicuna", "alpaca", "wizard", "openchat", "neural-chat",
        "browser-use", "browseruse", "open source", "opensource"
    ]
    
    # Check if any open source keyword appears
    for keyword in open_source_keywords:
        if keyword in provider or keyword in model:
            return "Open source"
    
    # Default to Proprietary if not found
    return "Proprietary"

def load_df(path=None):
    """Load the results CSV, creating empty dataframe if file doesn't exist."""
    if path is None:
        path = get_results_path()
    
    metadata_cols = ["Model", "Provider", "Agent Framework", "Type"]
    metric_cols = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
    expected_cols = metadata_cols + metric_cols
    
    if not pathlib.Path(path).exists():
        # Return empty dataframe with expected columns
        return pd.DataFrame(columns=expected_cols)
    
    try:
        df = pd.read_csv(path)
        # Handle empty CSV (only headers)
        if len(df) == 0:
            return pd.DataFrame(columns=expected_cols)
        
        # Ensure required columns exist
        if "Agent Framework" not in df.columns:
            # Try legacy "Notes" column
            if "Notes" in df.columns:
                df["Agent Framework"] = df["Notes"]
            else:
                df["Agent Framework"] = ""
        
        # Handle legacy "Overall" column
        if "Overall" in df.columns and "Overall Pass Rate" not in df.columns:
            df["Overall Pass Rate"] = df["Overall"]
        
        # Add Type column if missing, infer from Provider/Model
        if "Type" not in df.columns:
            df["Type"] = df.apply(infer_type, axis=1)
        
        # Convert numeric columns
        numeric_cols = metric_cols + [c for c in df.columns if c not in metadata_cols + metric_cols]
        for c in numeric_cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        
        return df
    except Exception as e:
        print(f"Error loading results.csv: {e}")
        return pd.DataFrame(columns=expected_cols)

def compute_score(df, category):
    # Get columns to compute score from
    # Map "Overall" category to "Overall Pass Rate" column
    if category == "Overall":
        # Use CATEGORY_MAP which maps "Overall" to ["Overall Pass Rate"]
        cols = CATEGORY_MAP.get("Overall", ["Overall Pass Rate"])
    elif category in CATEGORY_MAP:
        # Use predefined category mapping
        cols = CATEGORY_MAP[category]
    elif category in df.columns:
        # Category is a direct column name
        cols = [category]
    else:
        # Fallback: use "Overall Pass Rate" if it exists, otherwise all numeric columns
        if "Overall Pass Rate" in df.columns:
            cols = ["Overall Pass Rate"]
        else:
            numeric_cols = [c for c in df.columns if c not in ["Model", "Provider", "Agent Framework", "Type"]]
            cols = numeric_cols if numeric_cols else []
    
    # Filter to only existing columns
    cols = [c for c in cols if c in df.columns]
    
    # If no valid columns found, use all numeric columns except metadata/metrics
    if not cols:
        exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Avg Duration (s)", "Avg Cost ($)"]
        numeric_cols = [c for c in df.columns if c not in exclude_cols]
        cols = numeric_cols if numeric_cols else []
        # If still no columns, create a zero score
        if not cols:
            df = df.copy()
            df["Category Pass Rate"] = 0.0
            return df
    
    df = df.copy()
    if cols:
        df["Category Pass Rate"] = df[cols].mean(axis=1, skipna=True)
    else:
        df["Category Pass Rate"] = 0.0
    return df

def table_html(df):
    if len(df) == 0:
        return """
        <style>
          .leaderboard-container {
            background: #ffffff;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            overflow: hidden;
            margin: 20px 0;
          }
          table.lb {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            font-size: 14px;
          }
          table.lb thead {
            background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
            color: white;
          }
          table.lb th {
            padding: 16px 20px;
            text-align: left;
            font-weight: 600;
            font-size: 13px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
          }
          table.lb td {
            padding: 16px 20px;
            border-bottom: 1px solid #e5e7eb;
            color: #374151;
          }
          table.lb tbody tr {
            transition: background-color 0.2s ease;
          }
          table.lb tbody tr:hover {
            background: #f9fafb;
          }
          table.lb tbody tr:last-child td {
            border-bottom: none;
          }
          .rank-badge {
            display: inline-block;
            width: 32px;
            height: 32px;
            line-height: 32px;
            text-align: center;
            border-radius: 50%;
            font-weight: 700;
            font-size: 14px;
          }
          .rank-1 { background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }
          .rank-2 { background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }
          .rank-3 { background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }
          .rank-other { background: #f1f5f9; color: #64748b; }
          .pass-rate-cell {
            font-weight: 600;
            font-size: 15px;
          }
          .metric-cell {
            font-weight: 500;
            font-size: 14px;
            color: #6b7280;
          }
        </style>
        <div class="leaderboard-container">
          <table class="lb">
            <thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
            <tbody><tr><td colspan="8" style="text-align:center;padding:40px;color:#9ca3af;font-size:16px;">No results yet. Run evaluations to populate the leaderboard.</td></tr></tbody>
          </table>
        </div>
        """
    rows = []
    for i, r in df.iterrows():
        rank = i + 1
        rank_class = "rank-1" if rank == 1 else "rank-2" if rank == 2 else "rank-3" if rank == 3 else "rank-other"
        pass_rate = r['Category Pass Rate']
        pass_rate_color = "#10b981" if pass_rate >= 0.7 else "#f59e0b" if pass_rate >= 0.4 else "#ef4444"
        
        # Format duration and cost
        duration = r.get('Avg Duration (s)', None)
        duration_str = f"{duration:.2f}" if pd.notna(duration) and duration is not None else "N/A"
        
        cost = r.get('Avg Cost ($)', None)
        cost_str = f"${cost:.4f}" if pd.notna(cost) and cost is not None else "N/A"
        
        type_val = r.get('Type', 'Proprietary')
        type_color = "#10b981" if type_val == "Open source" else "#6366f1"
        
        rows.append(f"""
        <tr>
          <td><span class="rank-badge {rank_class}">{rank}</span></td>
          <td><strong style="color: #111827;">{r['Model']}</strong></td>
          <td style="color: #6b7280;">{r.get('Provider','')}</td>
          <td><span style="color: {type_color}; font-weight: 600;">{type_val}</span></td>
          <td style="color: #6b7280;">{r.get('Agent Framework','')}</td>
          <td class="pass-rate-cell" style="color: {pass_rate_color};">{pass_rate:.3f}</td>
          <td class="metric-cell">{duration_str}</td>
          <td class="metric-cell">{cost_str}</td>
        </tr>""")
    return f"""
    <style>
      .leaderboard-container {{
        background: #ffffff;
        border-radius: 8px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        overflow: hidden;
        margin: 20px 0;
      }}
      table.lb {{
        width: 100%;
        border-collapse: collapse;
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
        font-size: 14px;
      }}
          table.lb thead {{
            background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
            color: white;
          }}
      table.lb th {{
        padding: 16px 20px;
        text-align: left;
        font-weight: 600;
        font-size: 13px;
        text-transform: uppercase;
        letter-spacing: 0.5px;
      }}
      table.lb td {{
        padding: 16px 20px;
        border-bottom: 1px solid #e5e7eb;
        color: #374151;
      }}
      table.lb tbody tr {{
        transition: background-color 0.2s ease;
      }}
      table.lb tbody tr:hover {{
        background: #f9fafb;
      }}
      table.lb tbody tr:last-child td {{
        border-bottom: none;
      }}
      .rank-badge {{
        display: inline-block;
        width: 32px;
        height: 32px;
        line-height: 32px;
        text-align: center;
        border-radius: 50%;
        font-weight: 700;
        font-size: 14px;
      }}
      .rank-1 {{ background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }}
      .rank-2 {{ background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }}
      .rank-3 {{ background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }}
      .rank-other {{ background: #f1f5f9; color: #64748b; }}
      .pass-rate-cell {{
        font-weight: 600;
        font-size: 15px;
      }}
      .metric-cell {{
        font-weight: 500;
        font-size: 14px;
        color: #6b7280;
      }}
    </style>
    <div class="leaderboard-container">
      <table class="lb">
        <thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th><th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th><th>Avg Cost ($)</th></tr></thead>
        <tbody>{''.join(rows)}</tbody>
      </table>
    </div>
    """

def perf_bar(df):
    plt.close("all")
    if len(df) == 0:
        fig, ax = plt.subplots(figsize=(10, 4), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    d = df.sort_values("Category Pass Rate", ascending=True)
    fig, ax = plt.subplots(figsize=(10, max(4, 0.5*len(d))), facecolor='white', dpi=150)
    
    # Create gradient colors based on pass rate - CAPTCHA themed
    colors = []
    for pass_rate in d["Category Pass Rate"]:
        if pass_rate >= 0.7:
            colors.append('#10b981')  # verification green
        elif pass_rate >= 0.4:
            colors.append('#f59e0b')  # warning amber
        else:
            colors.append('#ef4444')  # error red
    
    bars = ax.barh(range(len(d)), d["Category Pass Rate"], color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
    
    # Add value labels on bars
    for i, (bar, pass_rate) in enumerate(zip(bars, d["Category Pass Rate"])):
        width = bar.get_width()
        ax.text(width + 0.01, bar.get_y() + bar.get_height()/2, 
                f'{pass_rate:.3f}', ha='left', va='center', fontsize=11, fontweight='600')
    
    ax.set_yticks(range(len(d)))
    ax.set_yticklabels(d["Model"], fontsize=12)
    ax.set_xlabel("Pass Rate", fontsize=12, fontweight='600', color='#374151')
    ax.set_xlim(0, 1.1)
    ax.set_title("Performance Comparison", fontsize=16, fontweight='700', color='#111827', pad=20)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(axis='x', alpha=0.3, linestyle='--')
    ax.set_facecolor('#fafafa')
    fig.tight_layout()
    return fig

def perf_by_type(df_full, model_filter="Models Avg"):
    """
    Show average performance by puzzle type.
    
    Args:
        df_full: Full dataframe with all models
        model_filter: "Models Avg" for average across all models, or a specific model name
    """
    plt.close("all")
    
    # Filter by model if specified
    if model_filter and model_filter != "Models Avg":
        df_filtered = df_full[df_full["Model"] == model_filter].copy()
        if len(df_filtered) == 0:
            fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
            ax.text(0.5, 0.5, f"No data available for model: {model_filter}", ha="center", va="center", fontsize=14, color="gray")
            ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
            fig.tight_layout(); return fig
        df_plot = df_filtered
        plot_title = f"Performance by Type - {model_filter}"
    else:
        df_plot = df_full
        plot_title = "Average Performance by CAPTCHA Type (All Models)"
    
    if len(df_plot) == 0:
        fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    
    # Average each per-type column across models (exclude metadata and metric columns)
    exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)", "Category Pass Rate"]
    numeric_cols = [c for c in df_plot.columns if c not in exclude_cols]
    type_cols = [c for c in numeric_cols if df_plot[c].notna().any() and df_plot[c].dtype in ['float64', 'int64', 'float32', 'int32']]
    
    if len(type_cols) == 0:
        fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No per-type data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    
    # Calculate means, handling NaN values properly
    if model_filter == "Models Avg":
        # Average across all models
        means = df_plot[type_cols].mean(numeric_only=True)
    else:
        # For a single model, just get its values (should be one row)
        if len(df_plot) == 1:
            means = df_plot[type_cols].iloc[0]
        else:
            # If multiple rows (shouldn't happen), average them
            means = df_plot[type_cols].mean(numeric_only=True)
    
    # Filter out any NaN means
    means = means.dropna()
    
    if len(means) == 0:
        fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No valid per-type data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    
    fig, ax = plt.subplots(figsize=(max(12, len(means) * 0.8), 6), facecolor='white', dpi=150)
    
    # Create gradient colors based on performance - CAPTCHA themed
    colors = []
    for val in means.values:
        if pd.isna(val):
            colors.append('#94a3b8')  # slate gray for NaN
        elif val >= 0.7:
            colors.append('#10b981')  # verification green
        elif val >= 0.4:
            colors.append('#f59e0b')  # warning amber
        else:
            colors.append('#ef4444')  # error red
    
    bars = ax.bar(range(len(means)), means.values, color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)
    
    # Add value labels on bars
    for bar, val in zip(bars, means.values):
        if not pd.isna(val):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                    f'{val:.2f}', ha='center', va='bottom', fontsize=10, fontweight='600')
    
    ax.set_xticks(range(len(means)))
    ax.set_xticklabels(means.index, rotation=45, ha="right", fontsize=10)
    ax.set_ylim(0, max(1.1, means.max() * 1.1) if not means.empty else 1.1)
    ax.set_ylabel("Average Pass Rate", fontsize=12, fontweight='600', color='#374151')
    ax.set_title(plot_title, fontsize=16, fontweight='700', color='#111827', pad=20)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_facecolor('#fafafa')
    fig.tight_layout()
    return fig

def cost_effectiveness_plot(df):
    """
    Create a cost-effectiveness scatter plot: Performance (X) vs Cost (Y).
    Color-coded by Type (Proprietary vs Open source).
    """
    plt.close("all")
    if len(df) == 0:
        fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    
    # Filter to rows with valid performance and cost data
    df_plot = df.copy()
    df_plot = df_plot[df_plot['Category Pass Rate'].notna() & df_plot['Avg Cost ($)'].notna()]
    
    if len(df_plot) == 0:
        fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No data with both performance and cost metrics", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig
    
    # Create figure with higher DPI for better resolution
    fig, ax = plt.subplots(figsize=(14, 9), facecolor='white', dpi=150)
    
    # Separate by type
    proprietary = df_plot[df_plot.get('Type', 'Proprietary') == 'Proprietary']
    open_source = df_plot[df_plot.get('Type', 'Proprietary') == 'Open source']
    
    # Plot points
    if len(proprietary) > 0:
        ax.scatter(proprietary['Category Pass Rate'], proprietary['Avg Cost ($)'], 
                  c='#6366f1', s=200, alpha=0.75, edgecolors='white', linewidth=2.5, 
                  label='Proprietary', zorder=3)
        # Add labels for proprietary models
        for idx, row in proprietary.iterrows():
            ax.annotate(row['Model'], 
                       (row['Category Pass Rate'], row['Avg Cost ($)']),
                       fontsize=10, alpha=0.85, ha='left', va='bottom',
                       bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
    
    if len(open_source) > 0:
        ax.scatter(open_source['Category Pass Rate'], open_source['Avg Cost ($)'], 
                  c='#10b981', s=200, alpha=0.75, edgecolors='white', linewidth=2.5, 
                  label='Open source', zorder=3)
        # Add labels for open source models
        for idx, row in open_source.iterrows():
            ax.annotate(row['Model'], 
                       (row['Category Pass Rate'], row['Avg Cost ($)']),
                       fontsize=10, alpha=0.85, ha='left', va='bottom',
                       bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))
    
    # Calculate thresholds for quadrants (median or fixed thresholds)
    perf_threshold = df_plot['Category Pass Rate'].median() if len(df_plot) > 1 else 0.4
    cost_threshold = df_plot['Avg Cost ($)'].median() if len(df_plot) > 1 else 0.01
    
    # Add quadrant lines
    ax.axvline(x=perf_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
    ax.axhline(y=cost_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
    
    # Add quadrant annotations
    x_range = df_plot['Category Pass Rate'].max() - df_plot['Category Pass Rate'].min()
    y_range = df_plot['Avg Cost ($)'].max() - df_plot['Avg Cost ($)'].min()
    
    # Top-left: Low Performance, High Cost
    ax.text(df_plot['Category Pass Rate'].min() + x_range * 0.05, 
            df_plot['Avg Cost ($)'].max() - y_range * 0.05,
            '▲ Low Performance\nHigh Cost', 
            fontsize=12, color='#ef4444', weight='bold', 
            ha='left', va='top', alpha=0.8,
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#ef4444', linewidth=1.5))
    
    # Bottom-right: High Performance, Low Cost
    ax.text(df_plot['Category Pass Rate'].max() - x_range * 0.05, 
            df_plot['Avg Cost ($)'].min() + y_range * 0.05,
            '▼ High Performance\nLow Cost', 
            fontsize=12, color='#10b981', weight='bold', 
            ha='right', va='bottom', alpha=0.8,
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#10b981', linewidth=1.5))
    
    # Styling
    ax.set_xlabel("Performance (Pass Rate)", fontsize=14, fontweight='600', color='#374151')
    ax.set_ylabel("Avg Cost ($)", fontsize=14, fontweight='600', color='#374151')
    ax.set_title("Cost-Effectiveness Analysis", fontsize=17, fontweight='700', color='#111827', pad=25)
    
    # Add padding to axes (more padding on right for legend space)
    x_pad = x_range * 0.15 if x_range > 0 else 0.1
    y_pad = y_range * 0.15 if y_range > 0 else 0.001
    ax.set_xlim(df_plot['Category Pass Rate'].min() - x_pad * 0.5, df_plot['Category Pass Rate'].max() + x_pad)
    ax.set_ylim(max(0, df_plot['Avg Cost ($)'].min() - y_pad * 0.5), df_plot['Avg Cost ($)'].max() + y_pad)
    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(alpha=0.3, linestyle='--', zorder=0, linewidth=1)
    ax.set_facecolor('#fafafa')
    
    # Add legend - position it outside the plot area to avoid covering data
    # Use bbox_to_anchor to place it outside the plot
    ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1), frameon=True, 
              fancybox=True, shadow=True, fontsize=12, framealpha=0.95,
              edgecolor='#e5e7eb', facecolor='white')
    
    # Adjust layout to make room for legend
    fig.tight_layout(rect=[0, 0, 0.95, 1])
    return fig

def convert_benchmark_results_json(file_path, model_name=None, provider=None, agent_framework=None):
    """
    Convert benchmark_results.json format (per-puzzle results) to aggregated format.
    
    Args:
        file_path: Path to benchmark_results.json file (Path object or string)
        model_name: Model name (if None, will try to infer from filename or use "Unknown")
        provider: Provider name (if None, will try to infer from model_name)
        agent_framework: Agent framework name (if None, will use "browser-use" as default)
    
    Returns:
        dict: Aggregated record with Model, Provider, Agent Framework, Type, metrics, and per-type pass rates
    """
    # Convert to Path object if needed
    file_path = pathlib.Path(file_path) if not isinstance(file_path, pathlib.Path) else file_path
    
    # Read the file - it's a JSONL file (one JSON object per line)
    puzzle_results = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    puzzle_results.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    
    if not puzzle_results:
        raise ValueError("No valid puzzle results found in file")
    
    # Try to extract model/provider from puzzle results first (if they were included)
    extracted_model = None
    extracted_provider = None
    extracted_agent_framework = None
    
    for result in puzzle_results[:10]:  # Check first 10 results
        if 'model' in result and result['model']:
            extracted_model = result['model']
        if 'provider' in result and result['provider']:
            extracted_provider = result['provider']
        if 'agent_framework' in result and result['agent_framework']:
            extracted_agent_framework = result['agent_framework']
        # Also check camelCase variants
        if 'agentFramework' in result and result['agentFramework']:
            extracted_agent_framework = result['agentFramework']
    
    # Use extracted values if available, otherwise use provided parameters
    if model_name is None:
        model_name = extracted_model
    
    if provider is None:
        provider = extracted_provider
    
    if agent_framework is None:
        agent_framework = extracted_agent_framework
    
    # Infer model/provider if still not available
    if model_name is None:
        # Try to infer from filename (e.g., "gpt-4_results.json" -> "gpt-4")
        filename = file_path.stem.lower()
        if 'benchmark_results' in filename:
            model_name = "Unknown Model"
        else:
            # Try to extract model name from filename
            model_name = filename.replace('_results', '').replace('_benchmark', '').replace('-', ' ').title()
    
    if provider is None:
        # Try to infer provider from model name
        model_lower = model_name.lower()
        if any(x in model_lower for x in ['gpt', 'openai']):
            provider = "OpenAI"
        elif any(x in model_lower for x in ['claude', 'anthropic']):
            provider = "Anthropic"
        elif any(x in model_lower for x in ['gemini', 'google']):
            provider = "Google"
        elif any(x in model_lower for x in ['llama', 'mistral', 'qwen', 'phi', 'gemma']):
            provider = "Open Source"
        else:
            provider = "Unknown"
    
    if agent_framework is None:
        agent_framework = "browser-use"  # Default assumption
    
    # Aggregate results
    # Group by puzzle_type
    puzzle_type_stats = {}
    total_correct = 0
    total_attempts = len(puzzle_results)
    total_duration = 0.0
    total_cost = 0.0
    cost_count = 0
    
    for result in puzzle_results:
        puzzle_type = result.get('puzzle_type', 'Unknown')
        
        # Initialize puzzle type stats if needed
        if puzzle_type not in puzzle_type_stats:
            puzzle_type_stats[puzzle_type] = {'correct': 0, 'total': 0}
        
        puzzle_type_stats[puzzle_type]['total'] += 1
        if result.get('correct', False):
            puzzle_type_stats[puzzle_type]['correct'] += 1
            total_correct += 1
        
        # Aggregate duration
        elapsed_time = result.get('elapsed_time')
        if elapsed_time is not None:
            try:
                total_duration += float(elapsed_time)
            except (ValueError, TypeError):
                pass
        
        # Aggregate cost
        cost = result.get('cost')
        if cost is not None:
            try:
                total_cost += float(cost)
                cost_count += 1
            except (ValueError, TypeError):
                pass
    
    # Calculate overall pass rate
    overall_pass_rate = total_correct / total_attempts if total_attempts > 0 else 0.0
    
    # Calculate average duration
    avg_duration = total_duration / total_attempts if total_attempts > 0 else None
    
    # Calculate average cost
    avg_cost = total_cost / cost_count if cost_count > 0 else None
    
    # Build aggregated record
    record = {
        "Model": model_name,
        "Provider": provider,
        "Agent Framework": agent_framework,
        "Overall Pass Rate": overall_pass_rate,
        "Avg Duration (s)": avg_duration,
        "Avg Cost ($)": avg_cost,
    }
    
    # Add per-type pass rates
    for puzzle_type, stats in puzzle_type_stats.items():
        pass_rate = stats['correct'] / stats['total'] if stats['total'] > 0 else 0.0
        record[puzzle_type] = pass_rate
    
    # Infer Type
    record["Type"] = infer_type(record)
    
    return record

def is_benchmark_results_format(data):
    """
    Check if the data is in benchmark_results.json format (per-puzzle results).
    
    Args:
        data: List of dictionaries or single dictionary
    
    Returns:
        bool: True if data appears to be in benchmark_results format
    """
    if isinstance(data, dict):
        data = [data]
    
    if not isinstance(data, list) or len(data) == 0:
        return False
    
    # Check if first record has benchmark_results.json structure
    first = data[0]
    required_fields = ['puzzle_type', 'puzzle_id', 'correct']
    has_required = all(field in first for field in required_fields)
    
    # Check if it's NOT the aggregated format (which would have Model, Provider, etc.)
    aggregated_fields = ['Model', 'Provider', 'Overall Pass Rate']
    is_not_aggregated = not any(field in first for field in aggregated_fields)
    
    return has_required and is_not_aggregated

def process_uploaded_file(file, model_name=None, provider=None, agent_framework=None):
    """
    Process an uploaded CSV or JSON file and merge with existing results.
    
    Args:
        file: File path string (from Gradio File component with type="filepath")
        model_name: Optional model name (for benchmark_results.json conversion)
        provider: Optional provider name (for benchmark_results.json conversion)
        agent_framework: Optional agent framework name (for benchmark_results.json conversion)
    
    Returns:
        tuple: (success_message, error_message)
    """
    if file is None:
        return None, "No file uploaded"
    
    try:
        # Gradio returns a file path string when type="filepath"
        file_path = pathlib.Path(file) if isinstance(file, str) else pathlib.Path(file.name)
        
        # Read the file based on extension
        if file_path.suffix.lower() == '.json':
            # Try reading as JSONL first (benchmark_results.json format)
            try:
                # Read first few lines to detect format
                with open(file_path, 'r') as f:
                    first_lines = [f.readline().strip() for _ in range(5)]
                    f.seek(0)
                    
                    # Try to parse as JSONL (one JSON object per line)
                    puzzle_results = []
                    for line in first_lines:
                        if line:
                            try:
                                puzzle_results.append(json.loads(line))
                            except json.JSONDecodeError:
                                break
                    
                    # Check if it's benchmark_results format
                    if puzzle_results and is_benchmark_results_format(puzzle_results):
                        # Read entire file as JSONL
                        puzzle_results = []
                        with open(file_path, 'r') as f:
                            for line in f:
                                line = line.strip()
                                if line:
                                    try:
                                        puzzle_results.append(json.loads(line))
                                    except json.JSONDecodeError:
                                        continue
                        
                        # Convert to aggregated format
                        record = convert_benchmark_results_json(
                            file_path, 
                            model_name=model_name,
                            provider=provider,
                            agent_framework=agent_framework
                        )
                        records = [record]
                    else:
                        # Try reading as regular JSON
                        f.seek(0)
                        data = json.load(f)
                        
                        # Normalize to list of records
                        if isinstance(data, dict):
                            records = [data]
                        elif isinstance(data, list):
                            records = data
                        else:
                            return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
                        
                        # Check if it's benchmark_results format
                        if is_benchmark_results_format(records):
                            # Convert to aggregated format
                            record = convert_benchmark_results_json(
                                file_path,
                                model_name=model_name,
                                provider=provider,
                                agent_framework=agent_framework
                            )
                            records = [record]
            except Exception as e:
                # Fallback: try reading as regular JSON
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    
                    # Normalize to list of records
                    if isinstance(data, dict):
                        records = [data]
                    elif isinstance(data, list):
                        records = data
                    else:
                        return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"
                    
                    # Check if it's benchmark_results format
                    if is_benchmark_results_format(records):
                        # Convert to aggregated format
                        record = convert_benchmark_results_json(
                            file_path,
                            model_name=model_name,
                            provider=provider,
                            agent_framework=agent_framework
                        )
                        records = [record]
                except Exception as json_err:
                    return None, f"Error reading JSON file: {str(json_err)}"
            
            # Handle legacy column names
            legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
            for record in records:
                for old_key, new_key in legacy_map.items():
                    if old_key in record and new_key not in record:
                        record[new_key] = record.pop(old_key)
                
                # Infer Type if not present
                if "Type" not in record:
                    record["Type"] = infer_type(record)
            
            # Save individual JSON files to runs directory for aggregation
            runs_path = get_runs_path()
            import time
            for record in records:
                run_file = runs_path / f"run_{int(time.time() * 1000)}.json"
                with open(run_file, 'w') as f:
                    json.dump(record, f, indent=2)
            
            num_records = len(records)
            
        elif file_path.suffix.lower() == '.csv':
            # Handle CSV file
            df_uploaded = pd.read_csv(file_path)
            
            # Handle legacy column names
            if "Notes" in df_uploaded.columns and "Agent Framework" not in df_uploaded.columns:
                df_uploaded["Agent Framework"] = df_uploaded["Notes"]
            if "Overall" in df_uploaded.columns and "Overall Pass Rate" not in df_uploaded.columns:
                df_uploaded["Overall Pass Rate"] = df_uploaded["Overall"]
            
            # Add Type column if missing
            if "Type" not in df_uploaded.columns:
                df_uploaded["Type"] = df_uploaded.apply(infer_type, axis=1)
            
            # Convert to records and save as JSON files (for consistency with aggregation script)
            records = df_uploaded.to_dict('records')
            runs_path = get_runs_path()
            import time
            for record in records:
                run_file = runs_path / f"run_{int(time.time() * 1000)}.json"
                with open(run_file, 'w') as f:
                    json.dump(record, f, indent=2)
            
            num_records = len(records)
            
        else:
            return None, f"Unsupported file type: {file_path.suffix}. Please upload a .csv or .json file."
        
        # Aggregate runs into results.csv
        aggregate_runs_to_csv()
        
        return f"✅ Successfully uploaded {num_records} record(s). Leaderboard updated!", None
        
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON file: {str(e)}"
    except pd.errors.EmptyDataError:
        return None, "CSV file is empty"
    except Exception as e:
        return None, f"Error processing file: {str(e)}"

def clean_nan_values(record):
    """Convert NaN values to None for proper CSV serialization."""
    import math
    cleaned = {}
    for key, value in record.items():
        if pd.isna(value) or (isinstance(value, float) and math.isnan(value)):
            cleaned[key] = None
        else:
            cleaned[key] = value
    return cleaned

def aggregate_runs_to_csv():
    """
    Aggregate all JSON files in runs/ directory into results.csv.
    This consolidates all uploaded evaluation results into a single CSV file.
    Deduplicates records based on (Model, Provider, Agent Framework) combination,
    keeping the most recent entry for each unique combination.
    Preserves existing records from results.csv that aren't in runs/ directory.
    """
    runs_path = get_runs_path()
    results_path = get_results_path()
    
    # First, load existing results.csv to preserve models not in new uploads
    existing_records_with_time = []
    if results_path.exists():
        try:
            df_existing = load_df(results_path)
            if len(df_existing) > 0:
                # Convert existing records to dict format
                for _, row in df_existing.iterrows():
                    record = row.to_dict()
                    # Clean NaN values
                    record = clean_nan_values(record)
                    # Use file modification time - 1 day as timestamp (older than new uploads)
                    # This ensures new uploads take precedence, but existing records are preserved
                    existing_mtime = results_path.stat().st_mtime - 86400  # 1 day ago
                    existing_records_with_time.append((existing_mtime, record))
        except Exception as e:
            print(f"Warning: Error loading existing results.csv: {e}")
    
    # Gather all JSON files with their modification times
    records_with_time = []
    for path in runs_path.glob("*.json"):
        try:
            record = json.loads(path.read_text())
            # Store modification time for deduplication (most recent wins)
            mtime = path.stat().st_mtime
            records_with_time.append((mtime, record))
        except Exception as e:
            print(f"Warning: Skipping invalid JSON file {path}: {e}")
    
    # Combine existing records with new records from runs/
    all_records_with_time = existing_records_with_time + records_with_time
    
    if not all_records_with_time:
        # Create empty CSV with headers
        fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
        fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
        with results_path.open("w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
            w.writeheader()
        return
    
    # Sort by modification time (most recent first)
    all_records_with_time.sort(key=lambda x: x[0], reverse=True)
    
    # Handle legacy column names and infer Type
    legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
    processed_records = []
    for mtime, record in all_records_with_time:
        for old_key, new_key in legacy_map.items():
            if old_key in record and new_key not in record:
                record[new_key] = record.pop(old_key)
        
        # Infer Type if not present
        if "Type" not in record:
            record["Type"] = infer_type(record)
        
        processed_records.append(record)
    
    # Deduplicate: keep only the most recent record for each (Model, Provider, Agent Framework) combination
    seen = {}
    deduplicated_records = []
    
    for record in processed_records:
        # Create unique key from Model, Provider, and Agent Framework
        model = str(record.get("Model", "")).strip()
        provider = str(record.get("Provider", "")).strip()
        agent_framework = str(record.get("Agent Framework", "")).strip()
        unique_key = (model, provider, agent_framework)
        
        # Only add if we haven't seen this combination before
        # Since records are sorted by time (most recent first), the first occurrence is kept
        if unique_key not in seen:
            seen[unique_key] = True
            deduplicated_records.append(record)
    
    if not deduplicated_records:
        # Create empty CSV with headers
        fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
        fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
        with results_path.open("w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
            w.writeheader()
        return
    
    # Build header: metadata → metrics → puzzle types
    fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
    fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
    puzzle_types = sorted({k for r in deduplicated_records for k in r.keys() 
                          if k not in fixed_metadata + fixed_metrics})
    header = fixed_metadata + fixed_metrics + puzzle_types
    
    # Write CSV
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=header)
        w.writeheader()
        for r in deduplicated_records:
            w.writerow(r)

def render(category, sort_column, sort_direction, model_filter="Models Avg"):
    df_full = load_df()  # Keep full dataset for perf_by_type
    df = df_full.copy()
    
    df = compute_score(df, category)
    
    # Determine sort column and direction
    ascending = (sort_direction == "Low→High")
    
    # Map sort column names to actual column names (only numeric/metric columns)
    sort_column_map = {
        "Pass Rate": "Category Pass Rate",
        "Avg Duration (s)": "Avg Duration (s)",
        "Avg Cost ($)": "Avg Cost ($)"
    }
    
    actual_sort_column = sort_column_map.get(sort_column, "Category Pass Rate")
    
    # Check if column exists
    if actual_sort_column not in df.columns:
        actual_sort_column = "Category Pass Rate"
    
    # Handle NaN values for numeric sorting
    df = df.copy()
    df['_sort_helper'] = df[actual_sort_column].fillna(float('inf') if ascending else float('-inf'))
    df = df.sort_values('_sort_helper', ascending=ascending).drop(columns=['_sort_helper'])
    df = df.reset_index(drop=True)
    
    # perf_by_type uses full dataset to show all puzzle types, with optional model filter
    # cost_effectiveness_plot needs df with Category Pass Rate computed
    return table_html(df), perf_bar(df), perf_by_type(df_full, model_filter), cost_effectiveness_plot(df)

def app():
    df = load_df()
    
    cats = ["Overall"]
    if len(df) > 0:
        # Get all puzzle type columns (exclude metadata and metric columns)
        exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
        puzzle_cols = [c for c in df.columns if c not in exclude_cols]
        cats = ["Overall"] + puzzle_cols

    with gr.Blocks(title="CAPTCHAv2 Leaderboard", theme=gr.themes.Soft(primary_hue="indigo")) as demo:
        gr.Markdown("""
        <div style="text-align: center; padding: 30px 0;">
          <h1 style="font-size: 42px; font-weight: 700; margin: 0; background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;">
            CAPTCHAv2 Leaderboard
          </h1>
          <p style="font-size: 16px; color: #64748b; margin-top: 10px;">
            Compare model performance across different CAPTCHA types
          </p>
        </div>
        """)
        
        # Upload section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Upload Results")
                
                # Main accordion for the entire guide
                with gr.Accordion("📖 Step-by-Step Guide to Submit Results", open=False):
                    # Step 1: Run Evaluation Protocol
                    with gr.Accordion("Step 1: Run the Evaluation Protocol", open=False):
                        gr.Markdown("""
                        **Option A: Using browser-use Agent Framework**
                        
                        1. Start the CAPTCHA server:
                           ```bash
                           python app.py
                           ```
                           The server will run on `http://127.0.0.1:7860`
                        
                        2. Run the browser-use agent evaluation (default is their in house model BU1.0):
                           ```bash
                           python -m agent_frameworks.browseruse_cli \\
                             --url http://127.0.0.1:7860 \\
                             --llm browser-use \\
                           ```
                           Or with a different LLM:
                           ```bash
                           python -m agent_frameworks.browseruse_cli \\
                             --url http://127.0.0.1:7860 \\
                             --llm openai \\
                             --model gpt-4o 
                           ```
                        
                        3. The evaluation will automatically save results to `benchmark_results.json` in the project root.
                           Each puzzle attempt is logged as a JSON object with fields:
                           - `puzzle_type`, `puzzle_id`, `user_answer`, `correct_answer`, `correct`
                           - `elapsed_time`, `timestamp`
                           - `model`, `provider`, `agent_framework` 
                        
                        **Option B: Using Other Agent Frameworks**
                        
                        Follow your framework's evaluation protocol. Ensure results are saved in `benchmark_results.json` format
                        (JSONL: one JSON object per line) with the same field structure.
                        """)
                    
                    # Step 2: Convert Results
                    with gr.Accordion("Step 2: Convert Results to CSV Format", open=False):
                        gr.Markdown("""
                        **Method 1:  Convert to CSV Format (Recommended)**
                        
                        Use the provided conversion script (`convert_benchmark_to_csv.py` in the project root):
                        ```bash
                        python convert_benchmark_to_csv.py benchmark_results.json leaderboard/results.csv
                        ```
                        
                        **Method 2: Directly Upload to Leaderboard (Auto-conversion)**
                        
                        You can upload `benchmark_results.json` directly here. The system will automatically handle all.
                        
                        Optionally provide metadata below if auto-detection fails:
                        - Model Name (e.g., "gpt-4", "claude-3-sonnet", "bu-1-0")
                        - Provider (e.g., "OpenAI", "Anthropic", "browser-use")
                        - Agent Framework (e.g., "browser-use", "crewai")
                        """)
                    
                    # Step 3: Upload Results
                    with gr.Accordion("Step 3: Upload Results", open=False):
                        gr.Markdown("""
                        **Supported file formats:**
                        - ✅ `benchmark_results.json` - Per-puzzle results (JSONL format) 
                        - ✅ `results.csv` - Aggregated results  **Recommended**
                        - ✅ JSON files - Single object or array of aggregated results
                        
                        **File format requirements:**
                        
                        For `benchmark_results.json` (per-puzzle format):
                        ```json
                        {"puzzle_type": "Dice_Count", "puzzle_id": "dice1.png", "user_answer": "24", "correct_answer": 24, "correct": true, "elapsed_time": "12.5", "timestamp": "2025-01-01T00:00:00Z", "model": "bu-1-0", "provider": "browser-use", "agent_framework": "browser-use"}
                        ```
                        
                        For CSV (aggregated format):
                        - Required columns: `Model`, `Provider`, `Agent Framework`, `Type`, `Overall Pass Rate` , `Avg Duration (s)`, `Avg Cost ($)`, and puzzle type columns (e.g., `Dice_Count`, `Mirror`, etc.)                
                        """)
                
                file_upload = gr.File(
                    label="Upload Results File",
                    file_types=[".csv", ".json"],
                    type="filepath"
                )
                with gr.Row():
                    model_name_input = gr.Textbox(
                        label="Model Name (optional, for benchmark_results.json)",
                        placeholder="e.g., gpt-4, claude-3-sonnet",
                        container=True
                    )
                    provider_input = gr.Textbox(
                        label="Provider (optional, for benchmark_results.json)",
                        placeholder="e.g., OpenAI, Anthropic, Google",
                        container=True
                    )
                    agent_framework_input = gr.Textbox(
                        label="Agent Framework (optional, for benchmark_results.json)",
                        placeholder="e.g., browser-use, crewai",
                        value="browser-use",
                        container=True
                    )
                upload_btn = gr.Button("Upload & Update Leaderboard", variant="primary")
                upload_status = gr.Markdown("")
        
        gr.Markdown("---")
        
        with gr.Row():
            cat = gr.Dropdown(choices=cats, value="Overall", label="Category/Type", container=True)
            sort_col = gr.Dropdown(
                choices=["Pass Rate", "Avg Duration (s)", "Avg Cost ($)"],
                value="Pass Rate",
                label="Sort by",
                container=True
            )
            sort_dir = gr.Radio(
                choices=["High→Low", "Low→High"],
                value="High→Low",
                label="Sort Direction",
                container=True
            )
        
        # Model filter for Performance by Type plot
        model_choices = ["Models Avg"]
        if len(df) > 0 and "Model" in df.columns:
            model_choices.extend(sorted(df["Model"].unique().tolist()))
        
        with gr.Row():
            model_filter = gr.Dropdown(
                choices=model_choices,
                value="Models Avg",
                label="Model Filter (for Performance by Type plot)",
                container=True
            )
        
        out = gr.HTML(elem_classes="leaderboard-table")
        bar = gr.Plot(label="Performance Comparison")
        pertype_plot = gr.Plot(label="Performance by Type")
        cost_eff_plot = gr.Plot(label="Cost-Effectiveness Analysis")

        def handle_upload(file, model_filter_val, model_name_input_val, provider_input_val, agent_framework_input_val):
            if file is None:
                # Return current state if no file
                table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
                return "Please select a file to upload.", table, bar_fig, pertype_fig, cost_fig
            
            # Use provided metadata or None (which will trigger auto-detection)
            model_name_val = model_name_input_val.strip() if model_name_input_val else None
            provider_val = provider_input_val.strip() if provider_input_val else None
            agent_framework_val = agent_framework_input_val.strip() if agent_framework_input_val else None
            
            success_msg, error_msg = process_uploaded_file(
                file,
                model_name=model_name_val,
                provider=provider_val,
                agent_framework=agent_framework_val
            )
            if error_msg:
                # Return current state with error message
                table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
                return f"❌ {error_msg}", table, bar_fig, pertype_fig, cost_fig
            
            # Reload and render after successful upload
            # Re-render with current settings (use Overall as default since we can't access component values directly)
            table, bar_fig, pertype_fig, cost_fig = render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")
            return success_msg, table, bar_fig, pertype_fig, cost_fig

        upload_btn.click(
            handle_upload,
            inputs=[file_upload, model_filter, model_name_input, provider_input, agent_framework_input],
            outputs=[upload_status, out, bar, pertype_plot, cost_eff_plot]
        )

        demo.load(lambda: render("Overall", "Pass Rate", "High→Low", "Models Avg"), outputs=[out, bar, pertype_plot, cost_eff_plot])
        for comp in (cat, sort_col, sort_dir, model_filter):
            comp.change(render, inputs=[cat, sort_col, sort_dir, model_filter], outputs=[out, bar, pertype_plot, cost_eff_plot])
    return demo

if __name__ == "__main__":
    app().launch()