Commit: a1c511d
Parent(s): 441dd2f
debug duplications in leaderboard
Files changed:
- app.py: +44 -7
- requirements.txt: +3 -3

app.py
CHANGED
@@ -904,19 +904,24 @@ def aggregate_runs_to_csv():
     """
     Aggregate all JSON files in runs/ directory into results.csv.
     This consolidates all uploaded evaluation results into a single CSV file.
+    Deduplicates records based on (Model, Provider, Agent Framework) combination,
+    keeping the most recent entry for each unique combination.
     """
     runs_path = get_runs_path()
     results_path = get_results_path()
 
-    # Gather all JSON files
+    # Gather all JSON files with their modification times
+    records_with_time = []
     for path in runs_path.glob("*.json"):
         try:
+            record = json.loads(path.read_text())
+            # Store modification time for deduplication (most recent wins)
+            mtime = path.stat().st_mtime
+            records_with_time.append((mtime, record))
         except Exception as e:
             print(f"Warning: Skipping invalid JSON file {path}: {e}")
 
-    if not
+    if not records_with_time:
         # Create empty CSV with headers
         fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
         fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
@@ -925,9 +930,13 @@ def aggregate_runs_to_csv():
         w.writeheader()
         return
 
+    # Sort by modification time (most recent first)
+    records_with_time.sort(key=lambda x: x[0], reverse=True)
+
     # Handle legacy column names and infer Type
     legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
+    processed_records = []
+    for mtime, record in records_with_time:
         for old_key, new_key in legacy_map.items():
             if old_key in record and new_key not in record:
                 record[new_key] = record.pop(old_key)
@@ -935,11 +944,39 @@ def aggregate_runs_to_csv():
         # Infer Type if not present
         if "Type" not in record:
             record["Type"] = infer_type(record)
+
+        processed_records.append(record)
+
+    # Deduplicate: keep only the most recent record for each (Model, Provider, Agent Framework) combination
+    seen = {}
+    deduplicated_records = []
+
+    for record in processed_records:
+        # Create unique key from Model, Provider, and Agent Framework
+        model = str(record.get("Model", "")).strip()
+        provider = str(record.get("Provider", "")).strip()
+        agent_framework = str(record.get("Agent Framework", "")).strip()
+        unique_key = (model, provider, agent_framework)
+
+        # Only add if we haven't seen this combination before
+        # Since records are sorted by time (most recent first), the first occurrence is kept
+        if unique_key not in seen:
+            seen[unique_key] = True
+            deduplicated_records.append(record)
+
+    if not deduplicated_records:
+        # Create empty CSV with headers
+        fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
+        fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
+        with results_path.open("w", newline="") as f:
+            w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
+            w.writeheader()
+        return
 
     # Build header: metadata → metrics → puzzle types
     fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
     fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
-    puzzle_types = sorted({k for r in
+    puzzle_types = sorted({k for r in deduplicated_records for k in r.keys()
                            if k not in fixed_metadata + fixed_metrics})
     header = fixed_metadata + fixed_metrics + puzzle_types
@@ -948,7 +985,7 @@ def aggregate_runs_to_csv():
     with results_path.open("w", newline="") as f:
         w = csv.DictWriter(f, fieldnames=header)
         w.writeheader()
-        for r in
+        for r in deduplicated_records:
             w.writerow(r)
 
 def render(category, sort_column, sort_direction, model_filter="Models Avg"):

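The core of this commit is the most-recent-wins deduplication keyed on (Model, Provider, Agent Framework). Below is a minimal, self-contained sketch of the same idea outside of app.py; the `dedupe_most_recent` helper and the toy records (names, pass rates, timestamps) are made up for illustration and are not part of the Space's code.

```python
# Illustrative sketch only, not part of app.py: most-recent-wins deduplication
# keyed on (Model, Provider, Agent Framework), mirroring the logic this commit adds.

def dedupe_most_recent(records_with_time):
    """Take (mtime, record) pairs and keep the newest record per unique key."""
    # Newest first, so the first record seen for each key is the most recent one.
    records_with_time = sorted(records_with_time, key=lambda x: x[0], reverse=True)
    seen = set()
    deduplicated = []
    for _mtime, record in records_with_time:
        key = (
            str(record.get("Model", "")).strip(),
            str(record.get("Provider", "")).strip(),
            str(record.get("Agent Framework", "")).strip(),
        )
        if key not in seen:
            seen.add(key)
            deduplicated.append(record)
    return deduplicated

# Two uploads for the same (Model, Provider, Agent Framework): the newer one wins.
runs = [
    (1_700_000_000.0, {"Model": "model-a", "Provider": "provider-x",
                       "Agent Framework": "framework-1", "Overall Pass Rate": 0.62}),
    (1_700_500_000.0, {"Model": "model-a", "Provider": "provider-x",
                       "Agent Framework": "framework-1", "Overall Pass Rate": 0.71}),
]
assert dedupe_most_recent(runs)[0]["Overall Pass Rate"] == 0.71
```

Sorting newest-first before the loop is what makes "first occurrence kept" equivalent to "most recent upload kept"; the function in app.py does the same thing, using a dict as its seen-set.
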
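The final writing step relies on a csv.DictWriter behaviour that makes the aggregation tolerant of heterogeneous runs: the header is the union of keys across all records, and writerow fills any column a record lacks with an empty string (the restval default). A short standalone sketch, with a made-up output file name, toy records, and a hypothetical "Sudoku" puzzle column:

```python
import csv

# Made-up records for illustration; only the first one has a "Sudoku" column.
records = [
    {"Model": "model-a", "Provider": "provider-x", "Agent Framework": "framework-1",
     "Type": "agent", "Overall Pass Rate": 0.70, "Sudoku": 0.80},
    {"Model": "model-b", "Provider": "provider-y", "Agent Framework": "framework-2",
     "Type": "agent", "Overall Pass Rate": 0.50},
]

fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
# Union of the remaining keys, as in the puzzle_types computation above.
puzzle_types = sorted({k for r in records for k in r.keys()
                       if k not in fixed_metadata + fixed_metrics})
header = fixed_metadata + fixed_metrics + puzzle_types

with open("results_demo.csv", "w", newline="") as f:
    w = csv.DictWriter(f, fieldnames=header)  # missing keys are written as "" by default
    w.writeheader()
    for r in records:
        w.writerow(r)
```
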
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-gradio
-pandas
-matplotlib
+gradio
+pandas
+matplotlib
 numpy