OpenCaptchaWorld committed on
Commit
a1c511d
Β·
1 Parent(s): 441dd2f

debug duplications in leaderboard

Browse files
Files changed (2) hide show
  1. app.py +44 -7
  2. requirements.txt +3 -3
app.py CHANGED
@@ -904,19 +904,24 @@ def aggregate_runs_to_csv():
904
  """
905
  Aggregate all JSON files in runs/ directory into results.csv.
906
  This consolidates all uploaded evaluation results into a single CSV file.
 
 
907
  """
908
  runs_path = get_runs_path()
909
  results_path = get_results_path()
910
 
911
- # Gather all JSON files
912
- records = []
913
  for path in runs_path.glob("*.json"):
914
  try:
915
- records.append(json.loads(path.read_text()))
 
 
 
916
  except Exception as e:
917
  print(f"Warning: Skipping invalid JSON file {path}: {e}")
918
 
919
- if not records:
920
  # Create empty CSV with headers
921
  fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
922
  fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
@@ -925,9 +930,13 @@ def aggregate_runs_to_csv():
925
  w.writeheader()
926
  return
927
 
 
 
 
928
  # Handle legacy column names and infer Type
929
  legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
930
- for record in records:
 
931
  for old_key, new_key in legacy_map.items():
932
  if old_key in record and new_key not in record:
933
  record[new_key] = record.pop(old_key)
@@ -935,11 +944,39 @@ def aggregate_runs_to_csv():
935
  # Infer Type if not present
936
  if "Type" not in record:
937
  record["Type"] = infer_type(record)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
 
939
  # Build header: metadata β†’ metrics β†’ puzzle types
940
  fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
941
  fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
942
- puzzle_types = sorted({k for r in records for k in r.keys()
943
  if k not in fixed_metadata + fixed_metrics})
944
  header = fixed_metadata + fixed_metrics + puzzle_types
945
 
@@ -948,7 +985,7 @@ def aggregate_runs_to_csv():
948
  with results_path.open("w", newline="") as f:
949
  w = csv.DictWriter(f, fieldnames=header)
950
  w.writeheader()
951
- for r in records:
952
  w.writerow(r)
953
 
954
  def render(category, sort_column, sort_direction, model_filter="Models Avg"):
 
904
  """
905
  Aggregate all JSON files in runs/ directory into results.csv.
906
  This consolidates all uploaded evaluation results into a single CSV file.
907
+ Deduplicates records based on (Model, Provider, Agent Framework) combination,
908
+ keeping the most recent entry for each unique combination.
909
  """
910
  runs_path = get_runs_path()
911
  results_path = get_results_path()
912
 
913
+ # Gather all JSON files with their modification times
914
+ records_with_time = []
915
  for path in runs_path.glob("*.json"):
916
  try:
917
+ record = json.loads(path.read_text())
918
+ # Store modification time for deduplication (most recent wins)
919
+ mtime = path.stat().st_mtime
920
+ records_with_time.append((mtime, record))
921
  except Exception as e:
922
  print(f"Warning: Skipping invalid JSON file {path}: {e}")
923
 
924
+ if not records_with_time:
925
  # Create empty CSV with headers
926
  fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
927
  fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
 
930
  w.writeheader()
931
  return
932
 
933
+ # Sort by modification time (most recent first)
934
+ records_with_time.sort(key=lambda x: x[0], reverse=True)
935
+
936
  # Handle legacy column names and infer Type
937
  legacy_map = {"Notes": "Agent Framework", "Overall": "Overall Pass Rate"}
938
+ processed_records = []
939
+ for mtime, record in records_with_time:
940
  for old_key, new_key in legacy_map.items():
941
  if old_key in record and new_key not in record:
942
  record[new_key] = record.pop(old_key)
 
944
  # Infer Type if not present
945
  if "Type" not in record:
946
  record["Type"] = infer_type(record)
947
+
948
+ processed_records.append(record)
949
+
950
+ # Deduplicate: keep only the most recent record for each (Model, Provider, Agent Framework) combination
951
+ seen = {}
952
+ deduplicated_records = []
953
+
954
+ for record in processed_records:
955
+ # Create unique key from Model, Provider, and Agent Framework
956
+ model = str(record.get("Model", "")).strip()
957
+ provider = str(record.get("Provider", "")).strip()
958
+ agent_framework = str(record.get("Agent Framework", "")).strip()
959
+ unique_key = (model, provider, agent_framework)
960
+
961
+ # Only add if we haven't seen this combination before
962
+ # Since records are sorted by time (most recent first), the first occurrence is kept
963
+ if unique_key not in seen:
964
+ seen[unique_key] = True
965
+ deduplicated_records.append(record)
966
+
967
+ if not deduplicated_records:
968
+ # Create empty CSV with headers
969
+ fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
970
+ fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
971
+ with results_path.open("w", newline="") as f:
972
+ w = csv.DictWriter(f, fieldnames=fixed_metadata + fixed_metrics)
973
+ w.writeheader()
974
+ return
975
 
976
  # Build header: metadata β†’ metrics β†’ puzzle types
977
  fixed_metadata = ["Model", "Provider", "Agent Framework", "Type"]
978
  fixed_metrics = ["Overall Pass Rate", "Avg Duration (s)", "Avg Cost ($)"]
979
+ puzzle_types = sorted({k for r in deduplicated_records for k in r.keys()
980
  if k not in fixed_metadata + fixed_metrics})
981
  header = fixed_metadata + fixed_metrics + puzzle_types
982
 
 
985
  with results_path.open("w", newline="") as f:
986
  w = csv.DictWriter(f, fieldnames=header)
987
  w.writeheader()
988
+ for r in deduplicated_records:
989
  w.writerow(r)
990
 
991
  def render(category, sort_column, sort_direction, model_filter="Models Avg"):
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio>=5.49.1
2
- pandas>=2.3.3
3
- matplotlib>=3.10.7
4
  numpy
 
1
+ gradio
2
+ pandas
3
+ matplotlib
4
  numpy