|
|
|
|
|
import csv
import json
import math
import pathlib
import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

CATEGORY_MAP = {
    "Overall": ["Overall Pass Rate"],
}
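
# Additional categories can group several per-type columns, e.g. (hypothetical
# column names): "Counting": ["Dice_Count", "Object_Count"].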
|
|
|
|
|
def get_results_path():
    """Get the path to results.csv, resolving relative to this file's location."""
    this_file = pathlib.Path(__file__).resolve()
    return this_file.parent / "results.csv"


def get_runs_path():
    """Get the path to the runs directory, resolving relative to this file's location."""
    this_file = pathlib.Path(__file__).resolve()
    runs_path = this_file.parent / "runs"
    runs_path.mkdir(parents=True, exist_ok=True)
    return runs_path
|
|
|
|
|
def infer_type(row):
    """Infer model type ("Proprietary" or "Open source") from the Provider or Model name."""
    provider = str(row.get("Provider", "")).lower()
    model = str(row.get("Model", "")).lower()

    open_source_keywords = [
        "llama", "mistral", "qwen", "phi", "gemma", "falcon", "mpt",
        "vicuna", "alpaca", "wizard", "openchat", "neural-chat",
        "browser-use", "browseruse", "open source", "opensource",
    ]

    for keyword in open_source_keywords:
        if keyword in provider or keyword in model:
            return "Open source"

    return "Proprietary"
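
# Illustrative usage:
#   infer_type({"Provider": "OpenAI", "Model": "gpt-4o"})     -> "Proprietary"
#   infer_type({"Provider": "Meta", "Model": "llama-3-70b"})  -> "Open source"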
|
|
|
|
|
def load_df(path=None):
    """Load the results CSV, returning an empty dataframe if the file doesn't exist."""
    if path is None:
        path = get_results_path()

    metadata_cols = ["Model", "Provider", "Agent Framework", "Type"]
    metric_cols = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
    expected_cols = metadata_cols + metric_cols

    if not pathlib.Path(path).exists():
        return pd.DataFrame(columns=expected_cols)

    try:
        df = pd.read_csv(path)
        if len(df) == 0:
            return pd.DataFrame(columns=expected_cols)

        # Migrate legacy column names.
        if "Agent Framework" not in df.columns:
            if "Notes" in df.columns:
                df["Agent Framework"] = df["Notes"]
            else:
                df["Agent Framework"] = ""
        if "Overall" in df.columns and "Overall Pass Rate" not in df.columns:
            df["Overall Pass Rate"] = df["Overall"]

        if "Type" not in df.columns:
            df["Type"] = df.apply(infer_type, axis=1)

        # Coerce metric and per-puzzle-type columns to numeric.
        numeric_cols = metric_cols + [c for c in df.columns if c not in metadata_cols + metric_cols]
        for c in numeric_cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        return df
    except Exception as e:
        print(f"Error loading results.csv: {e}")
        return pd.DataFrame(columns=expected_cols)
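
# Illustrative: a legacy CSV with columns Model,Provider,Notes,Overall is loaded with
# "Notes" mirrored into "Agent Framework", "Overall" into "Overall Pass Rate", and a
# "Type" column inferred per row, so downstream code only sees the current schema.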
|
|
|
|
|
def compute_score(df, category):
    """Attach a "Category Pass Rate" column averaged over the columns mapped to `category`."""
    if category == "Overall":
        cols = CATEGORY_MAP.get("Overall", ["Overall Pass Rate"])
    elif category in CATEGORY_MAP:
        cols = CATEGORY_MAP[category]
    elif category in df.columns:
        cols = [category]
    else:
        # Unknown category: fall back to the overall pass rate, then to any non-metadata column.
        if "Overall Pass Rate" in df.columns:
            cols = ["Overall Pass Rate"]
        else:
            cols = [c for c in df.columns if c not in ["Model", "Provider", "Agent Framework", "Type"]]

    cols = [c for c in cols if c in df.columns]

    if not cols:
        exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Avg Duration (s)", "Avg Cost ($)"]
        cols = [c for c in df.columns if c not in exclude_cols]

    df = df.copy()
    if cols:
        df["Category Pass Rate"] = df[cols].mean(axis=1, skipna=True)
    else:
        df["Category Pass Rate"] = 0.0
    return df
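
# Illustrative: with the default CATEGORY_MAP, compute_score(df, "Overall") averages the
# "Overall Pass Rate" column, while compute_score(df, "Dice_Count") (a hypothetical
# per-type column) averages just that column if it exists in df.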
|
|
|
|
|
LEADERBOARD_CSS = """
<style>
.leaderboard-container {
    background: #ffffff;
    border-radius: 8px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    overflow: hidden;
    margin: 20px 0;
}
table.lb {
    width: 100%;
    border-collapse: collapse;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
    font-size: 14px;
}
table.lb thead {
    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
    color: white;
}
table.lb th {
    padding: 16px 20px;
    text-align: left;
    font-weight: 600;
    font-size: 13px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}
table.lb td {
    padding: 16px 20px;
    border-bottom: 1px solid #e5e7eb;
    color: #374151;
}
table.lb tbody tr {
    transition: background-color 0.2s ease;
}
table.lb tbody tr:hover {
    background: #f9fafb;
}
table.lb tbody tr:last-child td {
    border-bottom: none;
}
.rank-badge {
    display: inline-block;
    width: 32px;
    height: 32px;
    line-height: 32px;
    text-align: center;
    border-radius: 50%;
    font-weight: 700;
    font-size: 14px;
}
.rank-1 { background: linear-gradient(135deg, #ffd700 0%, #ffed4e 100%); color: #000; box-shadow: 0 2px 8px rgba(255, 215, 0, 0.4); }
.rank-2 { background: linear-gradient(135deg, #c0c0c0 0%, #e8e8e8 100%); color: #000; box-shadow: 0 2px 8px rgba(192, 192, 192, 0.4); }
.rank-3 { background: linear-gradient(135deg, #cd7f32 0%, #e6a55d 100%); color: #fff; box-shadow: 0 2px 8px rgba(205, 127, 50, 0.4); }
.rank-other { background: #f1f5f9; color: #64748b; }
.pass-rate-cell {
    font-weight: 600;
    font-size: 15px;
}
.metric-cell {
    font-weight: 500;
    font-size: 14px;
    color: #6b7280;
}
</style>
"""

TABLE_HEAD = ("<thead><tr><th>#</th><th>Model</th><th>Provider</th><th>Type</th>"
              "<th>Agent Framework</th><th>Pass Rate</th><th>Avg Duration (s)</th>"
              "<th>Avg Cost ($)</th></tr></thead>")


def table_html(df):
    """Render the leaderboard dataframe as styled HTML (shared CSS, one row per model)."""
    if len(df) == 0:
        return f"""
{LEADERBOARD_CSS}
<div class="leaderboard-container">
<table class="lb">
{TABLE_HEAD}
<tbody><tr><td colspan="8" style="text-align:center;padding:40px;color:#9ca3af;font-size:16px;">No results yet. Run evaluations to populate the leaderboard.</td></tr></tbody>
</table>
</div>
"""

    rows = []
    for i, r in df.iterrows():
        rank = i + 1
        rank_class = "rank-1" if rank == 1 else "rank-2" if rank == 2 else "rank-3" if rank == 3 else "rank-other"
        pass_rate = r['Category Pass Rate']
        # Traffic-light thresholds: green >= 0.7, amber >= 0.4, red below.
        pass_rate_color = "#10b981" if pass_rate >= 0.7 else "#f59e0b" if pass_rate >= 0.4 else "#ef4444"

        duration = r.get('Avg Duration (s)', None)
        duration_str = f"{duration:.2f}" if pd.notna(duration) else "N/A"

        cost = r.get('Avg Cost ($)', None)
        cost_str = f"${cost:.4f}" if pd.notna(cost) else "N/A"

        type_val = r.get('Type', 'Proprietary')
        type_color = "#10b981" if type_val == "Open source" else "#6366f1"

        rows.append(f"""
<tr>
<td><span class="rank-badge {rank_class}">{rank}</span></td>
<td><strong style="color: #111827;">{r['Model']}</strong></td>
<td style="color: #6b7280;">{r.get('Provider', '')}</td>
<td><span style="color: {type_color}; font-weight: 600;">{type_val}</span></td>
<td style="color: #6b7280;">{r.get('Agent Framework', '')}</td>
<td class="pass-rate-cell" style="color: {pass_rate_color};">{pass_rate:.3f}</td>
<td class="metric-cell">{duration_str}</td>
<td class="metric-cell">{cost_str}</td>
</tr>""")

    return f"""
{LEADERBOARD_CSS}
<div class="leaderboard-container">
<table class="lb">
{TABLE_HEAD}
<tbody>{''.join(rows)}</tbody>
</table>
</div>
"""
|
|
|
|
|
def perf_bar(df):
    """Horizontal bar chart of Category Pass Rate per model."""
    plt.close("all")
    if len(df) == 0:
        fig, ax = plt.subplots(figsize=(10, 4), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, "No data available", ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout(); return fig

    d = df.sort_values("Category Pass Rate", ascending=True)
    fig, ax = plt.subplots(figsize=(10, max(4, 0.5 * len(d))), facecolor='white', dpi=150)

    # Same traffic-light thresholds as the table: green >= 0.7, amber >= 0.4, red below.
    colors = []
    for pass_rate in d["Category Pass Rate"]:
        if pass_rate >= 0.7:
            colors.append('#10b981')
        elif pass_rate >= 0.4:
            colors.append('#f59e0b')
        else:
            colors.append('#ef4444')

    bars = ax.barh(range(len(d)), d["Category Pass Rate"], color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)

    for bar, pass_rate in zip(bars, d["Category Pass Rate"]):
        width = bar.get_width()
        ax.text(width + 0.01, bar.get_y() + bar.get_height() / 2,
                f'{pass_rate:.3f}', ha='left', va='center', fontsize=11, fontweight='600')

    ax.set_yticks(range(len(d)))
    ax.set_yticklabels(d["Model"], fontsize=12)
    ax.set_xlabel("Pass Rate", fontsize=12, fontweight='600', color='#374151')
    ax.set_xlim(0, 1.1)
    ax.set_title("Performance Comparison", fontsize=16, fontweight='700', color='#111827', pad=20)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(axis='x', alpha=0.3, linestyle='--')
    ax.set_facecolor('#fafafa')
    fig.tight_layout()
    return fig
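
# Illustrative: perf_bar expects the "Category Pass Rate" column added by compute_score,
# e.g. perf_bar(compute_score(load_df(), "Overall")).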
|
|
|
|
|
def perf_by_type(df_full, model_filter="Models Avg"):
    """
    Show average performance by puzzle type.

    Args:
        df_full: Full dataframe with all models
        model_filter: "Models Avg" for the average across all models, or a specific model name
    """
    plt.close("all")

    def _empty_fig(message):
        fig, ax = plt.subplots(figsize=(12, 5), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, message, ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout()
        return fig

    if model_filter and model_filter != "Models Avg":
        df_filtered = df_full[df_full["Model"] == model_filter].copy()
        if len(df_filtered) == 0:
            return _empty_fig(f"No data available for model: {model_filter}")
        df_plot = df_filtered
        plot_title = f"Performance by Type - {model_filter}"
    else:
        df_plot = df_full
        plot_title = "Average Performance by CAPTCHA Type (All Models)"

    if len(df_plot) == 0:
        return _empty_fig("No data available")

    # Per-type columns are everything except metadata and aggregate metrics.
    exclude_cols = ["Model", "Provider", "Agent Framework", "Type", "Overall Pass Rate",
                    "Avg Duration (s)", "Avg Cost ($)", "Category Pass Rate"]
    numeric_cols = [c for c in df_plot.columns if c not in exclude_cols]
    type_cols = [c for c in numeric_cols
                 if df_plot[c].notna().any() and df_plot[c].dtype in ['float64', 'int64', 'float32', 'int32']]

    if len(type_cols) == 0:
        return _empty_fig("No per-type data available")

    # A single filtered row is shown as-is; otherwise average across rows.
    if model_filter == "Models Avg" or len(df_plot) > 1:
        means = df_plot[type_cols].mean(numeric_only=True)
    else:
        means = df_plot[type_cols].iloc[0]

    means = means.dropna()

    if len(means) == 0:
        return _empty_fig("No valid per-type data available")

    fig, ax = plt.subplots(figsize=(max(12, len(means) * 0.8), 6), facecolor='white', dpi=150)

    colors = []
    for val in means.values:
        if pd.isna(val):
            colors.append('#94a3b8')
        elif val >= 0.7:
            colors.append('#10b981')
        elif val >= 0.4:
            colors.append('#f59e0b')
        else:
            colors.append('#ef4444')

    bars = ax.bar(range(len(means)), means.values, color=colors, alpha=0.8, edgecolor='white', linewidth=1.5)

    for bar, val in zip(bars, means.values):
        if not pd.isna(val):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                    f'{val:.2f}', ha='center', va='bottom', fontsize=10, fontweight='600')

    ax.set_xticks(range(len(means)))
    ax.set_xticklabels(means.index, rotation=45, ha="right", fontsize=10)
    ax.set_ylim(0, max(1.1, means.max() * 1.1) if not means.empty else 1.1)
    ax.set_ylabel("Average Pass Rate", fontsize=12, fontweight='600', color='#374151')
    ax.set_title(plot_title, fontsize=16, fontweight='700', color='#111827', pad=20)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_facecolor('#fafafa')
    fig.tight_layout()
    return fig
|
|
|
|
|
def cost_effectiveness_plot(df):
    """
    Create a cost-effectiveness scatter plot: Performance (X) vs Cost (Y),
    color-coded by Type (Proprietary vs Open source).
    """
    plt.close("all")

    def _empty_fig(message):
        fig, ax = plt.subplots(figsize=(10, 6), facecolor='white', dpi=150)
        ax.text(0.5, 0.5, message, ha="center", va="center", fontsize=14, color="gray")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1); ax.axis("off")
        fig.tight_layout()
        return fig

    if len(df) == 0:
        return _empty_fig("No data available")

    # Keep only rows that have both a performance and a cost metric.
    df_plot = df.copy()
    df_plot = df_plot[df_plot['Category Pass Rate'].notna() & df_plot['Avg Cost ($)'].notna()]

    if len(df_plot) == 0:
        return _empty_fig("No data with both performance and cost metrics")

    fig, ax = plt.subplots(figsize=(14, 9), facecolor='white', dpi=150)

    proprietary = df_plot[df_plot.get('Type', 'Proprietary') == 'Proprietary']
    open_source = df_plot[df_plot.get('Type', 'Proprietary') == 'Open source']

    def _scatter(group, color, label):
        ax.scatter(group['Category Pass Rate'], group['Avg Cost ($)'],
                   c=color, s=200, alpha=0.75, edgecolors='white', linewidth=2.5,
                   label=label, zorder=3)
        for _, row in group.iterrows():
            ax.annotate(row['Model'],
                        (row['Category Pass Rate'], row['Avg Cost ($)']),
                        fontsize=10, alpha=0.85, ha='left', va='bottom',
                        bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7, edgecolor='none'))

    if len(proprietary) > 0:
        _scatter(proprietary, '#6366f1', 'Proprietary')
    if len(open_source) > 0:
        _scatter(open_source, '#10b981', 'Open source')

    # Quadrant guides at the medians (fixed fallback thresholds for a single point).
    perf_threshold = df_plot['Category Pass Rate'].median() if len(df_plot) > 1 else 0.4
    cost_threshold = df_plot['Avg Cost ($)'].median() if len(df_plot) > 1 else 0.01

    ax.axvline(x=perf_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)
    ax.axhline(y=cost_threshold, color='gray', linestyle='--', linewidth=1.5, alpha=0.5, zorder=1)

    x_range = df_plot['Category Pass Rate'].max() - df_plot['Category Pass Rate'].min()
    y_range = df_plot['Avg Cost ($)'].max() - df_plot['Avg Cost ($)'].min()

    ax.text(df_plot['Category Pass Rate'].min() + x_range * 0.05,
            df_plot['Avg Cost ($)'].max() - y_range * 0.05,
            '▲ Low Performance\nHigh Cost',
            fontsize=12, color='#ef4444', weight='bold',
            ha='left', va='top', alpha=0.8,
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#ef4444', linewidth=1.5))

    ax.text(df_plot['Category Pass Rate'].max() - x_range * 0.05,
            df_plot['Avg Cost ($)'].min() + y_range * 0.05,
            '▼ High Performance\nLow Cost',
            fontsize=12, color='#10b981', weight='bold',
            ha='right', va='bottom', alpha=0.8,
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8, edgecolor='#10b981', linewidth=1.5))

    ax.set_xlabel("Performance (Pass Rate)", fontsize=14, fontweight='600', color='#374151')
    ax.set_ylabel("Avg Cost ($)", fontsize=14, fontweight='600', color='#374151')
    ax.set_title("Cost-Effectiveness Analysis", fontsize=17, fontweight='700', color='#111827', pad=25)

    x_pad = x_range * 0.15 if x_range > 0 else 0.1
    y_pad = y_range * 0.15 if y_range > 0 else 0.001
    ax.set_xlim(df_plot['Category Pass Rate'].min() - x_pad * 0.5, df_plot['Category Pass Rate'].max() + x_pad)
    ax.set_ylim(max(0, df_plot['Avg Cost ($)'].min() - y_pad * 0.5), df_plot['Avg Cost ($)'].max() + y_pad)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#e5e7eb')
    ax.spines['bottom'].set_color('#e5e7eb')
    ax.grid(alpha=0.3, linestyle='--', zorder=0, linewidth=1)
    ax.set_facecolor('#fafafa')

    ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1), frameon=True,
              fancybox=True, shadow=True, fontsize=12, framealpha=0.95,
              edgecolor='#e5e7eb', facecolor='white')

    fig.tight_layout(rect=[0, 0, 0.95, 1])
    return fig
|
|
|
|
|
def convert_benchmark_results_json(file_path, model_name=None, provider=None, agent_framework=None):
    """
    Convert benchmark_results.json format (per-puzzle results, one JSON object per line)
    to a single aggregated record.

    Args:
        file_path: Path to the benchmark_results.json file (Path object or string)
        model_name: Model name (if None, inferred from the file contents or filename)
        provider: Provider name (if None, inferred from model_name)
        agent_framework: Agent framework name (if None, defaults to "browser-use")

    Returns:
        dict: Aggregated record with Model, Provider, Agent Framework, Type,
        overall metrics, and per-type pass rates.
    """
    file_path = pathlib.Path(file_path)

    # Read the JSONL file, skipping any unparsable lines.
    puzzle_results = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    puzzle_results.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

    if not puzzle_results:
        raise ValueError("No valid puzzle results found in file")

    # Try to extract metadata from the first few results.
    extracted_model = None
    extracted_provider = None
    extracted_agent_framework = None

    for result in puzzle_results[:10]:
        if result.get('model'):
            extracted_model = result['model']
        if result.get('provider'):
            extracted_provider = result['provider']
        if result.get('agent_framework'):
            extracted_agent_framework = result['agent_framework']
        # Also accept the camelCase variant.
        if result.get('agentFramework'):
            extracted_agent_framework = result['agentFramework']

    if model_name is None:
        model_name = extracted_model
    if provider is None:
        provider = extracted_provider
    if agent_framework is None:
        agent_framework = extracted_agent_framework

    # Fall back to the filename if the results carried no model name.
    if model_name is None:
        filename = file_path.stem.lower()
        if 'benchmark_results' in filename:
            model_name = "Unknown Model"
        else:
            model_name = filename.replace('_results', '').replace('_benchmark', '').replace('-', ' ').title()

    # Guess the provider from the model name.
    if provider is None:
        model_lower = model_name.lower()
        if any(x in model_lower for x in ['gpt', 'openai']):
            provider = "OpenAI"
        elif any(x in model_lower for x in ['claude', 'anthropic']):
            provider = "Anthropic"
        elif any(x in model_lower for x in ['gemini', 'google']):
            provider = "Google"
        elif any(x in model_lower for x in ['llama', 'mistral', 'qwen', 'phi', 'gemma']):
            provider = "Open Source"
        else:
            provider = "Unknown"

    if agent_framework is None:
        agent_framework = "browser-use"

    # Aggregate per-puzzle-type pass rates plus overall duration and cost.
    puzzle_type_stats = {}
    total_correct = 0
    total_attempts = len(puzzle_results)
    total_duration = 0.0
    total_cost = 0.0
    cost_count = 0

    for result in puzzle_results:
        puzzle_type = result.get('puzzle_type', 'Unknown')

        if puzzle_type not in puzzle_type_stats:
            puzzle_type_stats[puzzle_type] = {'correct': 0, 'total': 0}

        puzzle_type_stats[puzzle_type]['total'] += 1
        if result.get('correct', False):
            puzzle_type_stats[puzzle_type]['correct'] += 1
            total_correct += 1

        elapsed_time = result.get('elapsed_time')
        if elapsed_time is not None:
            try:
                total_duration += float(elapsed_time)
            except (ValueError, TypeError):
                pass

        cost = result.get('cost')
        if cost is not None:
            try:
                total_cost += float(cost)
                cost_count += 1
            except (ValueError, TypeError):
                pass

    overall_pass_rate = total_correct / total_attempts if total_attempts > 0 else 0.0
    avg_duration = total_duration / total_attempts if total_attempts > 0 else None
    avg_cost = total_cost / cost_count if cost_count > 0 else None

    record = {
        "Model": model_name,
        "Provider": provider,
        "Agent Framework": agent_framework,
        "Overall Pass Rate": overall_pass_rate,
        "Avg Duration (s)": avg_duration,
        "Avg Cost ($)": avg_cost,
    }

    # One column per puzzle type.
    for puzzle_type, stats in puzzle_type_stats.items():
        record[puzzle_type] = stats['correct'] / stats['total'] if stats['total'] > 0 else 0.0

    record["Type"] = infer_type(record)

    return record
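
# Illustrative input/output (values hypothetical): given a benchmark_results.json
# containing the single line
#   {"puzzle_type": "Dice_Count", "puzzle_id": "dice1.png", "correct": true,
#    "elapsed_time": "12.5", "model": "bu-1-0", "provider": "browser-use",
#    "agent_framework": "browser-use"}
# convert_benchmark_results_json("benchmark_results.json") would yield
#   {"Model": "bu-1-0", "Provider": "browser-use", "Agent Framework": "browser-use",
#    "Overall Pass Rate": 1.0, "Avg Duration (s)": 12.5, "Avg Cost ($)": None,
#    "Dice_Count": 1.0, "Type": "Open source"}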
|
|
|
|
|
def is_benchmark_results_format(data):
    """
    Check whether the data is in benchmark_results.json format (per-puzzle results).

    Args:
        data: List of dictionaries or a single dictionary

    Returns:
        bool: True if the data appears to be in benchmark_results format
    """
    if isinstance(data, dict):
        data = [data]

    if not isinstance(data, list) or len(data) == 0:
        return False

    # Per-puzzle records carry these fields...
    first = data[0]
    required_fields = ['puzzle_type', 'puzzle_id', 'correct']
    has_required = all(field in first for field in required_fields)

    # ...and none of the aggregated-record fields.
    aggregated_fields = ['Model', 'Provider', 'Overall Pass Rate']
    is_not_aggregated = not any(field in first for field in aggregated_fields)

    return has_required and is_not_aggregated
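
# Illustrative: {"puzzle_type": "Mirror", "puzzle_id": "m1.png", "correct": false} is
# detected as per-puzzle format; {"Model": "gpt-4o", "Overall Pass Rate": 0.5} is not.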
|
|
|
|
|
def process_uploaded_file(file, model_name=None, provider=None, agent_framework=None):
    """
    Process an uploaded CSV or JSON file and merge it with the existing results.

    Args:
        file: File path string (from a Gradio File component with type="filepath")
        model_name: Optional model name (for benchmark_results.json conversion)
        provider: Optional provider name (for benchmark_results.json conversion)
        agent_framework: Optional agent framework name (for benchmark_results.json conversion)

    Returns:
        tuple: (success_message, error_message)
    """
    if file is None:
        return None, "No file uploaded"

    try:
        # Gradio passes a filepath string; older versions pass an object with .name.
        file_path = pathlib.Path(file) if isinstance(file, str) else pathlib.Path(file.name)

        if file_path.suffix.lower() == '.json':
            # Read the file once, then decide between JSONL (benchmark_results.json)
            # and regular JSON. Reading up front avoids seeking a closed file handle.
            raw = file_path.read_text()

            # Probe the first few non-empty lines for the per-puzzle JSONL format.
            sample = []
            for line in [ln.strip() for ln in raw.splitlines() if ln.strip()][:5]:
                try:
                    sample.append(json.loads(line))
                except json.JSONDecodeError:
                    break

            if sample and is_benchmark_results_format(sample):
                # Per-puzzle JSONL: aggregate into a single record.
                record = convert_benchmark_results_json(
                    file_path,
                    model_name=model_name,
                    provider=provider,
                    agent_framework=agent_framework
                )
                records = [record]
            else:
                # Regular JSON: a single aggregated record or an array of them.
                data = json.loads(raw)
                if isinstance(data, dict):
                    records = [data]
                elif isinstance(data, list):
                    records = data
                else:
                    return None, f"Invalid JSON format: expected object or array, got {type(data).__name__}"

                # A whole-file JSON array can still hold per-puzzle records.
                if is_benchmark_results_format(records):
                    record = convert_benchmark_results_json(
                        file_path,
                        model_name=model_name,
                        provider=provider,
                        agent_framework=agent_framework
                    )
                    records = [record]

            # Migrate legacy keys and fill in the Type field.
            legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
            for record in records:
                for old_key, new_key in legacy_map.items():
                    if old_key in record and new_key not in record:
                        record[new_key] = record.pop(old_key)
                if "Type" not in record:
                    record["Type"] = infer_type(record)

            # Persist each record as a run file.
            runs_path = get_runs_path()
            for i, record in enumerate(records):
                # Include the index so records written in the same millisecond don't collide.
                run_file = runs_path / f"run_{int(time.time() * 1000)}_{i}.json"
                with open(run_file, 'w') as f:
                    json.dump(record, f, indent=2)

            num_records = len(records)

        elif file_path.suffix.lower() == '.csv':
            df_uploaded = pd.read_csv(file_path)

            # Migrate legacy column names.
            if "Notes" in df_uploaded.columns and "Agent Framework" not in df_uploaded.columns:
                df_uploaded["Agent Framework"] = df_uploaded["Notes"]
            if "Overall" in df_uploaded.columns and "Overall Pass Rate" not in df_uploaded.columns:
                df_uploaded["Overall Pass Rate"] = df_uploaded["Overall"]

            if "Type" not in df_uploaded.columns:
                df_uploaded["Type"] = df_uploaded.apply(infer_type, axis=1)

            # Persist each row as a run file.
            records = df_uploaded.to_dict('records')
            runs_path = get_runs_path()
            for i, record in enumerate(records):
                run_file = runs_path / f"run_{int(time.time() * 1000)}_{i}.json"
                with open(run_file, 'w') as f:
                    json.dump(record, f, indent=2)

            num_records = len(records)

        else:
            return None, f"Unsupported file type: {file_path.suffix}. Please upload a .csv or .json file."

        # Rebuild results.csv from all stored runs.
        aggregate_runs_to_csv()

        return f"✅ Successfully uploaded {num_records} record(s). Leaderboard updated!", None

    except json.JSONDecodeError as e:
        return None, f"Invalid JSON file: {str(e)}"
    except pd.errors.EmptyDataError:
        return None, "CSV file is empty"
    except Exception as e:
        return None, f"Error processing file: {str(e)}"
|
|
|
|
|
def clean_nan_values(record):
    """Convert NaN values to None for proper CSV serialization."""
    cleaned = {}
    for key, value in record.items():
        if pd.isna(value) or (isinstance(value, float) and math.isnan(value)):
            cleaned[key] = None
        else:
            cleaned[key] = value
    return cleaned
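
# Illustrative: clean_nan_values({"Model": "gpt-4o", "Avg Cost ($)": float("nan")})
# -> {"Model": "gpt-4o", "Avg Cost ($)": None}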
|
|
|
|
|
def aggregate_runs_to_csv():
    """
    Aggregate all JSON files in the runs/ directory into results.csv.

    Consolidates all uploaded evaluation results into a single CSV file.
    Deduplicates records on the (Model, Provider, Agent Framework) combination,
    keeping the most recent entry for each unique combination, and preserves
    existing records from results.csv that aren't in the runs/ directory.
    """
    runs_path = get_runs_path()
    results_path = get_results_path()

    # Load existing results.csv records so they survive re-aggregation.
    existing_records_with_time = []
    if results_path.exists():
        try:
            df_existing = load_df(results_path)
            if len(df_existing) > 0:
                for _, row in df_existing.iterrows():
                    record = clean_nan_values(row.to_dict())
                    # Backdate existing records by a day so fresh run files win deduplication.
                    existing_mtime = results_path.stat().st_mtime - 86400
                    existing_records_with_time.append((existing_mtime, record))
        except Exception as e:
            print(f"Warning: Error loading existing results.csv: {e}")

    # Load every run file, tagged with its modification time.
    records_with_time = []
    for path in runs_path.glob("*.json"):
        try:
            record = json.loads(path.read_text())
            records_with_time.append((path.stat().st_mtime, record))
        except Exception as e:
            print(f"Warning: Skipping invalid JSON file {path}: {e}")

    all_records_with_time = existing_records_with_time + records_with_time

    fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
    fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]

    if not all_records_with_time:
        # No records at all: write a header-only CSV.
        with results_path.open("w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
            w.writeheader()
        return

    # Newest first, so deduplication below keeps the most recent record.
    all_records_with_time.sort(key=lambda x: x[0], reverse=True)

    # Migrate legacy keys and fill in the Type field.
    legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
    processed_records = []
    for mtime, record in all_records_with_time:
        for old_key, new_key in legacy_map.items():
            if old_key in record and new_key not in record:
                record[new_key] = record.pop(old_key)
        if "Type" not in record:
            record["Type"] = infer_type(record)
        processed_records.append(record)

    # Deduplicate on (Model, Provider, Agent Framework); the first hit is the newest.
    seen = set()
    deduplicated_records = []
    for record in processed_records:
        model = str(record.get("Model", "")).strip()
        provider = str(record.get("Provider", "")).strip()
        agent_framework = str(record.get("Agent Framework", "")).strip()
        unique_key = (model, provider, agent_framework)
        if unique_key not in seen:
            seen.add(unique_key)
            deduplicated_records.append(record)

    if not deduplicated_records:
        with results_path.open("w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
            w.writeheader()
        return

    # Puzzle-type columns are whatever keys remain beyond metadata and metrics.
    puzzle_types = sorted({k for r in deduplicated_records for k in r.keys()
                           if k not in fixed_metadata + fixed_metrics})
    header = fixed_metadata + fixed_metrics + puzzle_types

    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=header)
        w.writeheader()
        for r in deduplicated_records:
            w.writerow(r)
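
# Illustrative: if runs/ holds two records keyed ("gpt-4o", "OpenAI", "browser-use"),
# only the newer one (by file mtime) reaches results.csv; records carried over from an
# older results.csv are backdated a day, so any run file for the same key replaces them.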
|
|
|
|
|
def render(category, sort_column, sort_direction, model_filter="Models Avg"):
    """Load results and build the leaderboard table and all plots for the selected filters."""
    df_full = load_df()
    df = compute_score(df_full.copy(), category)

    ascending = (sort_direction == "Low→High")

    # Map UI sort labels to dataframe columns.
    sort_column_map = {
        "Pass Rate": "Category Pass Rate",
        "Avg Duration (s)": "Avg Duration (s)",
        "Avg Cost ($)": "Avg Cost ($)",
    }
    actual_sort_column = sort_column_map.get(sort_column, "Category Pass Rate")
    if actual_sort_column not in df.columns:
        actual_sort_column = "Category Pass Rate"

    # Sort with NaNs last regardless of direction.
    df = df.copy()
    df['_sort_helper'] = df[actual_sort_column].fillna(float('inf') if ascending else float('-inf'))
    df = df.sort_values('_sort_helper', ascending=ascending).drop(columns=['_sort_helper'])
    df = df.reset_index(drop=True)

    return table_html(df), perf_bar(df), perf_by_type(df_full, model_filter), cost_effectiveness_plot(df)
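
# render() returns (table HTML, bar figure, per-type figure, cost figure) in the same
# order as the Gradio outputs list wired up in app() below.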
|
|
|
|
|
def app():
    df = load_df()

    # Category choices: "Overall" plus any per-puzzle-type columns present.
    cats = ["Overall"]
    if len(df) > 0:
        exclude_cols = ["Model", "Provider", "Agent Framework", "Type",
                        "Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
        puzzle_cols = [c for c in df.columns if c not in exclude_cols]
        cats = ["Overall"] + puzzle_cols

    with gr.Blocks(title="CAPTCHAv2 Leaderboard", theme=gr.themes.Soft(primary_hue="indigo")) as demo:
        gr.Markdown("""
        <div style="text-align: center; padding: 30px 0;">
            <h1 style="font-size: 42px; font-weight: 700; margin: 0; background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;">
                CAPTCHAv2 Leaderboard
            </h1>
            <p style="font-size: 16px; color: #64748b; margin-top: 10px;">
                Compare model performance across different CAPTCHA types
            </p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Upload Results")

                with gr.Accordion("📖 Step-by-Step Guide to Submit Results", open=False):

                    with gr.Accordion("Step 1: Run the Evaluation Protocol", open=False):
                        gr.Markdown("""
                        **Option A: Using the browser-use Agent Framework**

                        1. Start the CAPTCHA server:
                        ```bash
                        python app.py
                        ```
                        The server will run on `http://127.0.0.1:7860`

                        2. Run the browser-use agent evaluation (the default is browser-use's in-house model BU1.0):
                        ```bash
                        python -m agent_frameworks.browseruse_cli \\
                            --url http://127.0.0.1:7860 \\
                            --llm browser-use
                        ```
                        Or with a different LLM:
                        ```bash
                        python -m agent_frameworks.browseruse_cli \\
                            --url http://127.0.0.1:7860 \\
                            --llm openai \\
                            --model gpt-4o
                        ```

                        3. The evaluation automatically saves results to `benchmark_results.json` in the project root.
                        Each puzzle attempt is logged as a JSON object with fields:
                        - `puzzle_type`, `puzzle_id`, `user_answer`, `correct_answer`, `correct`
                        - `elapsed_time`, `timestamp`
                        - `model`, `provider`, `agent_framework`

                        **Option B: Using Other Agent Frameworks**

                        Follow your framework's evaluation protocol. Ensure results are saved in `benchmark_results.json` format
                        (JSONL: one JSON object per line) with the same field structure.
                        """)

                    with gr.Accordion("Step 2: Convert Results to CSV Format", open=False):
                        gr.Markdown("""
                        **Method 1: Convert to CSV Format (Recommended)**

                        Use the provided conversion script (`convert_benchmark_to_csv.py` in the project root):
                        ```bash
                        python convert_benchmark_to_csv.py benchmark_results.json leaderboard/results.csv
                        ```

                        **Method 2: Upload Directly to the Leaderboard (Auto-conversion)**

                        You can upload `benchmark_results.json` directly here; the system handles the conversion automatically.

                        Optionally provide metadata below if auto-detection fails:
                        - Model Name (e.g., "gpt-4", "claude-3-sonnet", "bu-1-0")
                        - Provider (e.g., "OpenAI", "Anthropic", "browser-use")
                        - Agent Framework (e.g., "browser-use", "crewai")
                        """)

                    with gr.Accordion("Step 3: Upload Results", open=False):
                        gr.Markdown("""
                        **Supported file formats:**
                        - ✅ `benchmark_results.json` - Per-puzzle results (JSONL format)
                        - ✅ `results.csv` - Aggregated results (**recommended**)
                        - ✅ JSON files - Single object or array of aggregated results

                        **File format requirements:**

                        For `benchmark_results.json` (per-puzzle format):
                        ```json
                        {"puzzle_type": "Dice_Count", "puzzle_id": "dice1.png", "user_answer": "24", "correct_answer": 24, "correct": true, "elapsed_time": "12.5", "timestamp": "2025-01-01T00:00:00Z", "model": "bu-1-0", "provider": "browser-use", "agent_framework": "browser-use"}
                        ```

                        For CSV (aggregated format):
                        - Required columns: `Model`, `Provider`, `Agent Framework`, `Type`, `Overall Pass Rate`, `Avg Duration (s)`, `Avg Cost ($)`, and puzzle type columns (e.g., `Dice_Count`, `Mirror`, etc.)
                        """)

                file_upload = gr.File(
                    label="Upload Results File",
                    file_types=[".csv", ".json"],
                    type="filepath"
                )
                with gr.Row():
                    model_name_input = gr.Textbox(
                        label="Model Name (optional, for benchmark_results.json)",
                        placeholder="e.g., gpt-4, claude-3-sonnet",
                        container=True
                    )
                    provider_input = gr.Textbox(
                        label="Provider (optional, for benchmark_results.json)",
                        placeholder="e.g., OpenAI, Anthropic, Google",
                        container=True
                    )
                    agent_framework_input = gr.Textbox(
                        label="Agent Framework (optional, for benchmark_results.json)",
                        placeholder="e.g., browser-use, crewai",
                        value="browser-use",
                        container=True
                    )
                upload_btn = gr.Button("Upload & Update Leaderboard", variant="primary")
                upload_status = gr.Markdown("")

        gr.Markdown("---")

        with gr.Row():
            cat = gr.Dropdown(choices=cats, value="Overall", label="Category/Type", container=True)
            sort_col = gr.Dropdown(
                choices=["Pass Rate", "Avg Duration (s)", "Avg Cost ($)"],
                value="Pass Rate",
                label="Sort by",
                container=True
            )
            sort_dir = gr.Radio(
                choices=["High→Low", "Low→High"],
                value="High→Low",
                label="Sort Direction",
                container=True
            )

        # Model filter for the per-type plot: the all-model average, or any single model.
        model_choices = ["Models Avg"]
        if len(df) > 0 and "Model" in df.columns:
            model_choices.extend(sorted(df["Model"].unique().tolist()))

        with gr.Row():
            model_filter = gr.Dropdown(
                choices=model_choices,
                value="Models Avg",
                label="Model Filter (for Performance by Type plot)",
                container=True
            )

        out = gr.HTML(elem_classes="leaderboard-table")
        bar = gr.Plot(label="Performance Comparison")
        pertype_plot = gr.Plot(label="Performance by Type")
        cost_eff_plot = gr.Plot(label="Cost-Effectiveness Analysis")

        def handle_upload(file, model_filter_val, model_name_val, provider_val, agent_framework_val):
            def refresh():
                return render("Overall", "Pass Rate", "High→Low", model_filter_val or "Models Avg")

            if file is None:
                return ("Please select a file to upload.", *refresh())

            success_msg, error_msg = process_uploaded_file(
                file,
                model_name=model_name_val.strip() if model_name_val else None,
                provider=provider_val.strip() if provider_val else None,
                agent_framework=agent_framework_val.strip() if agent_framework_val else None
            )
            if error_msg:
                return (f"❌ {error_msg}", *refresh())
            return (success_msg, *refresh())

        upload_btn.click(
            handle_upload,
            inputs=[file_upload, model_filter, model_name_input, provider_input, agent_framework_input],
            outputs=[upload_status, out, bar, pertype_plot, cost_eff_plot]
        )

        demo.load(lambda: render("Overall", "Pass Rate", "High→Low", "Models Avg"),
                  outputs=[out, bar, pertype_plot, cost_eff_plot])
        for comp in (cat, sort_col, sort_dir, model_filter):
            comp.change(render, inputs=[cat, sort_col, sort_dir, model_filter],
                        outputs=[out, bar, pertype_plot, cost_eff_plot])
    return demo


if __name__ == "__main__":
    app().launch()
|
|
|