tathagataraha committed
Commit b7600d0 · Parents: 3faf231, fc21df8

[ADD] Initial results and frontend for EHRSQL, MedCalc, MedEC
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
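The added file is a PEM-encoded X.509 certificate, most likely the CA bundle that Gradio caches under .gradio/ for verifying share-link connections, committed here as a side effect of running the app locally. To check what was actually committed, the blob can be decoded with the cryptography package (an inspection sketch, not part of this commit):

from cryptography import x509

# Load the committed PEM file and print its subject and validity window.
with open(".gradio/certificate.pem", "rb") as f:
    cert = x509.load_pem_x509_certificate(f.read())
print(cert.subject.rfc4514_string())
print(cert.not_valid_before, cert.not_valid_after)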
app.py CHANGED
@@ -1,145 +1,71 @@
- import subprocess
-
  import gradio as gr
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download
  import time

  from src.about import (
- CITATION_BUTTON_LABEL,
- CITATION_BUTTON_TEXT,
- EVALUATION_QUEUE_TEXT,
- INTRODUCTION_TEXT,
- LLM_BENCHMARKS_TEXT_1,
- LLM_BENCHMARKS_TEXT_2,
- CROSS_EVALUATION_METRICS,
- NOTE_GENERATION_METRICS,
- HEALTHBENCH_METRICS,
- # EVALUATION_EXAMPLE_IMG,
- # LLM_BENCHMARKS_TEXT_2,
- # ENTITY_DISTRIBUTION_IMG,
- # LLM_BENCHMARKS_TEXT_3,
- TITLE,
- LOGO,
- FIVE_PILLAR_DIAGRAM
  )
  from src.display.css_html_js import custom_css
- # changes to be made here
  from src.display.utils import (
- DATASET_BENCHMARK_COLS,
- OPEN_ENDED_BENCHMARK_COLS,
- MED_SAFETY_BENCHMARK_COLS,
- MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
- ACI_BENCHMARK_COLS,
- SOAP_BENCHMARK_COLS,
- HEALTHBENCH_BENCHMARK_COLS,
- HEALTHBENCH_HARD_BENCHMARK_COLS,
- DATASET_COLS,
- OPEN_ENDED_COLS,
- MED_SAFETY_COLS,
- MEDICAL_SUMMARIZATION_COLS,
- ACI_COLS,
- SOAP_COLS,
- HEALTHBENCH_COLS,
- HEALTHBENCH_HARD_COLS,
- EVAL_COLS,
- EVAL_TYPES,
- NUMERIC_INTERVALS,
- TYPES,
- AutoEvalColumn,
- ModelType,
- ModelArch,
- PromptTemplateName,
- Precision,
- WeightType,
- fields,
- render_generation_templates,
- OpenEndedArabic_COLS,
- OpenEndedArabic_BENCHMARK_COLS,
- OpenEndedFrench_COLS,
- OpenEndedFrench_BENCHMARK_COLS,
- OpenEndedPortuguese_COLS,
- OpenEndedPortuguese_BENCHMARK_COLS,
- OpenEndedRomanian_COLS,
- OpenEndedRomanian_BENCHMARK_COLS,
- OpenEndedGreek_COLS,
- OpenEndedGreek_BENCHMARK_COLS,
- OpenEndedSpanish_COLS,
- OpenEndedSpanish_BENCHMARK_COLS,
- ClosedEndedMultilingual_COLS,
- ClosedEndedMultilingual_BENCHMARK_COLS,
  )
-
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG

  def restart_space():
  API.restart_space(repo_id=REPO_ID)


- print(f"QUEUE_REPO: {QUEUE_REPO}")
- print(f"RESULTS_REPO: {RESULTS_REPO}")
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
- print(f"TOKEN: {TOKEN}")
-
- try:
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
- snapshot_download(
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- print(f"EVAL_REQUESTS_PATH downloaded")
- except Exception:
- print("An error occurred while downloading EVAL_REQUESTS_PATH. Please check the connection or the repository settings.")
- restart_space()
  try:
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
- snapshot_download(
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
- )
- print(f"EVAL_RESULTS_PATH downloaded")
- except Exception:
- print("An error occurred while downloading EVAL_RESULTS_PATH. Please check the connection or the repository settings.")
  restart_space()

- # Span based results
- # changes to be made here
-
  start_time = time.time()

  _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
- harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
- print("Closed ended English results loaded")
-
  _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
- open_ended_leaderboard_df = open_ended_original_df.copy()
- print("Open ended English results loaded")
-
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
- med_safety_leaderboard_df = med_safety_original_df.copy()
- print("Med safety results loaded")
-
  _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
- medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
- print("Medical summarization results loaded")
-
  _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
- aci_leaderboard_df = aci_original_df.copy()
- print("ACI results loaded")
-
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
- soap_leaderboard_df = soap_original_df.copy()
- print("SOAP results loaded")
-
  _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
- healthbench_leaderboard_df = healthbench_original_df.copy()
-
  _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
- healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
- print("Healthbench results loaded")
-
  _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
  _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
  _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
@@ -147,129 +73,53 @@ _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_
  _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
  _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
  _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
-
-
- open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
- open_ended_french_leaderboard_df = open_ended_french_df.copy()
- open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
- open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
- open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
- open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
- print("Open ended multilingual results loaded")
-
- closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
- print("Closed ended multilingual results loaded")
-
163
  end_time = time.time()
164
- total_time = end_time - start_time
165
- print(f"Total time taken to load all results: {total_time:.2f} seconds")
166
-
167
- # breakpoint()
168
- # # Token based results
169
- # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
170
- # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
171
 
172
- # _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
173
- # token_based_types_leaderboard_df = token_based_types_original_df.copy()
174
-
175
-
176
- (
177
- finished_eval_queue_df,
178
- running_eval_queue_df,
179
- pending_eval_queue_df,
180
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
181
-
182
- # breakpoint()
183
- def update_df(shown_columns, subset="datasets"):
184
- # changes to be made here
185
- if subset == "datasets":
186
- leaderboard_table_df = harness_datasets_leaderboard_df.copy()
187
- hidden_leader_board_df = harness_datasets_original_df
188
- elif subset == "open_ended":
189
- leaderboard_table_df = open_ended_leaderboard_df.copy()
190
- hidden_leader_board_df = open_ended_original_df
191
- elif subset == "med_safety":
192
- leaderboard_table_df = med_safety_leaderboard_df.copy()
193
- hidden_leader_board_df = med_safety_original_df
194
- elif subset == "medical_summarization":
195
- leaderboard_table_df = medical_summarization_leaderboard_df.copy()
196
- hidden_leader_board_df = medical_summarization_original_df
197
- elif subset == "aci":
198
- leaderboard_table_df = aci_leaderboard_df.copy()
199
- hidden_leader_board_df = aci_original_df
200
- elif subset == "soap":
201
- leaderboard_table_df = soap_leaderboard_df.copy()
202
- hidden_leader_board_df = soap_original_df
203
- elif subset == "healthbench":
204
- leaderboard_table_df = healthbench_leaderboard_df.copy()
205
- hidden_leader_board_df = healthbench_original_df
206
- elif subset == "healthbench_hard":
207
- leaderboard_table_df = healthbench_hard_leaderboard_df.copy()
208
- hidden_leader_board_df = healthbench_hard_original_df
209
- elif subset == "open_ended_arabic":
210
- leaderboard_table_df = open_ended_arabic_df.copy()
211
- hidden_leader_board_df = open_ended_arabic_df
212
- elif subset == "open_ended_french":
213
- leaderboard_table_df = open_ended_french_df.copy()
214
- hidden_leader_board_df = open_ended_french_df
215
- elif subset == "open_ended_portuguese":
216
- leaderboard_table_df = open_ended_portuguese_df.copy()
217
- hidden_leader_board_df = open_ended_portuguese_df
218
- elif subset == "open_ended_romanian":
219
- leaderboard_table_df = open_ended_romanian_df.copy()
220
- hidden_leader_board_df = open_ended_romanian_df
221
- elif subset == "open_ended_greek":
222
- leaderboard_table_df = open_ended_greek_df.copy()
223
- hidden_leader_board_df = open_ended_greek_df
224
- elif subset == "open_ended_spanish":
225
- leaderboard_table_df = open_ended_spanish_df.copy()
226
- hidden_leader_board_df = open_ended_spanish_df
227
- elif subset == "closed_ended_multilingual":
228
- leaderboard_table_df = closed_ended_multilingual_df.copy()
229
- hidden_leader_board_df = closed_ended_multilingual_df
230
-
231
-
232
- value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
233
- # breakpoint()
234
- return leaderboard_table_df[value_cols], hidden_leader_board_df
235
-
236
-
237
- # Searching and filtering
238
- def update_table(
239
- hidden_df: pd.DataFrame,
240
- columns: list,
241
- query: str = "",
242
- # type_query: list = None,
243
- domain_specific_query: list = None,
244
- size_query: list = None,
245
- precision_query: str = None,
246
- show_deleted: bool = False,
247
- ):
248
- # breakpoint()
249
- type_query = None
250
- filtered_df = filter_models(hidden_df, type_query, domain_specific_query, size_query, precision_query, show_deleted)
251
- # breakpoint()
252
- filtered_df = filter_queries(query, filtered_df)
253
- # breakpoint()
254
- df = select_columns(filtered_df, columns, list(hidden_df.columns))
255
- # breakpoint()
256
- return df
257
 
 
 
 
258
 
259
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
260
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
261
 
262
-
263
- def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
264
- always_here_cols = [
265
- # AutoEvalColumn.model_type_symbol.name,
266
- AutoEvalColumn.model.name,
267
- ]
268
- # We use COLS to maintain sorting
269
- filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
270
- return filtered_df
271
-
272
-
273
  def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
274
  final_df = []
275
  if query != "":
@@ -285,8 +135,6 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
285
  filtered_df = filtered_df.drop_duplicates(
286
  subset=[
287
  AutoEvalColumn.model.name,
288
- # AutoEvalColumn.precision.name,
289
- # AutoEvalColumn.revision.name,
290
  ]
291
  )
292
 
@@ -296,11 +144,6 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
296
  def filter_models(
297
  df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
298
  ) -> pd.DataFrame:
299
- # Show all models
300
- # if show_deleted:
301
- # filtered_df = df
302
- # else: # Show only still on the hub models
303
- # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
304
 
305
  filtered_df = df
306
 
@@ -310,17 +153,12 @@ def filter_models(
310
 
311
  if domain_specific_query is not None:
312
  domain_specifics = []
313
- if "🏥 Clinical models" in domain_specific_query:
314
  domain_specifics.append(True)
315
  if "Generic models" in domain_specific_query:
316
  domain_specifics.append(False)
317
  filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
318
-
319
- # if architecture_query is not None:
320
- # arch_types = [t for t in architecture_query]
321
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
322
- # # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
323
-
324
  if precision_query is not None:
325
  if AutoEvalColumn.precision.name in df.columns:
326
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -333,1075 +171,276 @@ def filter_models(
333
 
334
  return filtered_df
335
 
336
-
337
  demo = gr.Blocks(css=custom_css)
 
338
  with demo:
339
- print("hello")
340
  gr.HTML(LOGO)
341
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
342
- filter_columns_type = None
343
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
344
  with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
345
  with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
346
  LANGUAGES = {
347
- "🇺🇸 English": "open_ended",
348
- "🇦🇪 Arabic": "open_ended_arabic",
349
- "🇫🇷 French": "open_ended_french",
350
- "🇪🇸 Spanish": "open_ended_spanish",
351
- "🇵🇹 Portuguese": "open_ended_portuguese",
352
- "🇷🇴 Romanian": "open_ended_romanian",
353
  "🇬🇷 Greek": "open_ended_greek",
354
  }
355
-
356
  for idx, (label, subset) in enumerate(LANGUAGES.items()):
357
  with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
358
- # Custom judge information for each language
359
- if label == "🇺🇸 English":
360
- judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
361
- else:
362
- judge_text = "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
363
-
364
  gr.Markdown(judge_text, elem_classes="markdown-text")
365
 
366
- with gr.Row():
367
- with gr.Column():
368
- with gr.Row():
369
- search_bar = gr.Textbox(
370
- placeholder=f"🔍 Search for your model in {label}...",
371
- show_label=False,
372
- elem_id=f"search-bar-{subset}",
373
- )
374
- with gr.Row():
375
- shown_columns = gr.CheckboxGroup(
376
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
377
- value=[
378
- c.name
379
- for c in fields(AutoEvalColumn)
380
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
381
- ],
382
- label="Select columns to show",
383
- elem_id=f"column-select-{subset}",
384
- interactive=True,
385
- )
386
- with gr.Column(min_width=320):
387
- # filter_columns_type = gr.CheckboxGroup(
388
- # label="Model Types",
389
- # choices=[t.to_str() for t in ModelType],
390
- # value=[t.to_str() for t in ModelType],
391
- # interactive=True,
392
- # elem_id=f"filter-columns-type-{subset}",
393
- # )
394
-
395
- filter_domain_specific = gr.CheckboxGroup(
396
- label="Domain Specificity",
397
- choices=["🏥 Clinical models", "Generic models"],
398
- value=["🏥 Clinical models", "Generic models"],
399
- interactive=True,
400
- elem_id=f"filter-columns-domain-{subset}",
401
- )
402
- filter_columns_size = gr.CheckboxGroup(
403
- label="Model sizes (in billions of parameters)",
404
- choices=list(NUMERIC_INTERVALS.keys()),
405
- value=list(NUMERIC_INTERVALS.keys()),
406
- interactive=True,
407
- elem_id=f"filter-columns-size-{subset}",
408
- )
409
-
410
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset=subset)
411
-
412
- leaderboard_table = gr.Dataframe(
413
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
414
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
415
- datatype=TYPES,
416
- elem_id=f"leaderboard-table-{subset}",
417
- interactive=False,
418
- visible=True,
419
  )
420
-
421
- hidden_leaderboard_table_for_search = gr.Dataframe(
422
- value=datasets_original_df[OPEN_ENDED_COLS],
423
- headers=OPEN_ENDED_COLS,
424
- datatype=TYPES,
425
- visible=False,
426
- )
427
-
428
- search_bar.submit(
429
- update_table,
430
- [
431
- hidden_leaderboard_table_for_search,
432
- shown_columns,
433
- search_bar,
434
- # filter_columns_type,
435
- filter_domain_specific,
436
- filter_columns_size
437
- ],
438
- leaderboard_table,
439
- )
440
-
441
- for selector in [
442
- shown_columns,
443
- # filter_columns_type,
444
- filter_domain_specific,
445
- filter_columns_size,
446
- ]:
447
- selector.change(
448
- update_table,
449
- [
450
- hidden_leaderboard_table_for_search,
451
- shown_columns,
452
- search_bar,
453
- # filter_columns_type,
454
- filter_domain_specific,
455
- filter_columns_size
456
- ],
457
- leaderboard_table,
458
- queue=True,
459
- )
460
-
461
  with gr.Accordion("💬 Generation templates", open=False):
462
  with gr.Accordion("Response generation", open=False):
463
  render_generation_templates(task="open_ended", generation_type="response_generation")
464
  with gr.Accordion("Scoring Rubric", open=False):
465
  render_generation_templates(task="open_ended", generation_type="scoring_rubric")
466
-
467
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
468
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
469
- with gr.Row():
470
- with gr.Column():
471
- with gr.Row():
472
- search_bar = gr.Textbox(
473
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
474
- show_label=False,
475
- elem_id="search-bar",
476
- )
477
- with gr.Row():
478
- shown_columns = gr.CheckboxGroup(
479
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
480
- value=[
481
- c.name
482
- for c in fields(AutoEvalColumn)
483
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
484
- ],
485
- label="Select columns to show",
486
- elem_id="column-select",
487
- interactive=True,
488
- )
489
- # with gr.Row():
490
- # deleted_models_visibility = gr.Checkbox(
491
- # value=False, label="Show gated/private/deleted models", interactive=True
492
- # )
493
- with gr.Column(min_width=320):
494
- # with gr.Box(elem_id="box-filter"):
495
- # filter_columns_type = gr.CheckboxGroup(
496
- # label="Model Types",
497
- # choices=[t.to_str() for t in ModelType],
498
- # value=[t.to_str() for t in ModelType],
499
- # interactive=True,
500
- # elem_id="filter-columns-type",
501
- # )
502
- # filter_columns_architecture = gr.CheckboxGroup(
503
- # label="Architecture Types",
504
- # choices=[i.value.name for i in ModelArch],
505
- # value=[i.value.name for i in ModelArch],
506
- # interactive=True,
507
- # elem_id="filter-columns-architecture",
508
- # )
509
- filter_domain_specific = gr.CheckboxGroup(
510
- label="Domain Specificity",
511
- choices=["🏥 Clinical models", "Generic models"],
512
- value=["🏥 Clinical models", "Generic models"],
513
- interactive=True,
514
- elem_id="filter-columns-type",
515
- )
516
- filter_columns_size = gr.CheckboxGroup(
517
- label="Model sizes (in billions of parameters)",
518
- choices=list(NUMERIC_INTERVALS.keys()),
519
- value=list(NUMERIC_INTERVALS.keys()),
520
- interactive=True,
521
- elem_id="filter-columns-size",
522
- )
523
-
524
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
525
-
526
- leaderboard_table = gr.components.Dataframe(
527
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
528
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
529
- datatype=TYPES,
530
- elem_id="leaderboard-table",
531
- interactive=False,
532
- visible=True,
533
- )
534
-
535
- # Dummy leaderboard for handling the case when the user uses backspace key
536
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
537
- value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
538
- headers=MEDICAL_SUMMARIZATION_COLS,
539
- datatype=TYPES,
540
- visible=False,
541
- )
542
-
543
-
544
- search_bar.submit(
545
- update_table,
546
- [
547
- hidden_leaderboard_table_for_search,
548
- shown_columns,
549
- search_bar,
550
- # filter_columns_type,
551
- filter_domain_specific,
552
- filter_columns_size
553
- # filter_columns_architecture
554
- ],
555
- leaderboard_table,
556
  )
557
- for selector in [
558
- shown_columns,
559
- # filter_columns_type,
560
- filter_domain_specific,
561
- filter_columns_size,
562
- # deleted_models_visibility,
563
- ]:
564
- selector.change(
565
- update_table,
566
- [
567
- hidden_leaderboard_table_for_search,
568
- shown_columns,
569
- search_bar,
570
- # filter_columns_type,
571
- filter_domain_specific,
572
- filter_columns_size
573
- ],
574
- leaderboard_table,
575
- queue=True,
576
- )
577
  with gr.Accordion("💬 Generation templates", open=False):
578
  with gr.Accordion("Response generation", open=False):
579
- system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
580
  with gr.Accordion("Question generation", open=False):
581
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
582
  with gr.Accordion("Cross Examination", open=False):
583
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
584
-
585
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
586
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
587
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
588
- with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-table2", id=0):
589
- with gr.Row():
590
- with gr.Column():
591
- with gr.Row():
592
- search_bar = gr.Textbox(
593
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
594
- show_label=False,
595
- elem_id="search-bar",
596
- )
597
- with gr.Row():
598
- shown_columns = gr.CheckboxGroup(
599
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
600
- value=[
601
- c.name
602
- for c in fields(AutoEvalColumn)
603
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
604
- ],
605
- label="Select columns to show",
606
- elem_id="column-select",
607
- interactive=True,
608
- )
609
- # with gr.Row():
610
- # deleted_models_visibility = gr.Checkbox(
611
- # value=False, label="Show gated/private/deleted models", interactive=True
612
- # )
613
- with gr.Column(min_width=320):
614
- # with gr.Box(elem_id="box-filter"):
615
- # filter_columns_type = gr.CheckboxGroup(
616
- # label="Model Types",
617
- # choices=[t.to_str() for t in ModelType],
618
- # value=[t.to_str() for t in ModelType],
619
- # interactive=True,
620
- # elem_id="filter-columns-type",
621
- # )
622
- # filter_columns_architecture = gr.CheckboxGroup(
623
- # label="Architecture Types",
624
- # choices=[i.value.name for i in ModelArch],
625
- # value=[i.value.name for i in ModelArch],
626
- # interactive=True,
627
- # elem_id="filter-columns-architecture",
628
- # )
629
- filter_domain_specific = gr.CheckboxGroup(
630
- label="Domain Specificity",
631
- choices=["🏥 Clinical models", "Generic models"],
632
- value=["🏥 Clinical models", "Generic models"],
633
- interactive=True,
634
- elem_id="filter-columns-type",
635
- )
636
- filter_columns_size = gr.CheckboxGroup(
637
- label="Model sizes (in billions of parameters)",
638
- choices=list(NUMERIC_INTERVALS.keys()),
639
- value=list(NUMERIC_INTERVALS.keys()),
640
- interactive=True,
641
- elem_id="filter-columns-size",
642
- )
643
-
644
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
645
-
646
- leaderboard_table = gr.components.Dataframe(
647
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
648
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
649
- datatype=TYPES,
650
- elem_id="leaderboard-table",
651
- interactive=False,
652
- visible=True,
653
  )
654
-
655
- # Dummy leaderboard for handling the case when the user uses backspace key
656
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
657
- value=datasets_original_df[ACI_COLS],
658
- headers=ACI_COLS,
659
- datatype=TYPES,
660
- visible=False,
661
- )
662
-
663
-
664
- search_bar.submit(
665
- update_table,
666
- [
667
- hidden_leaderboard_table_for_search,
668
- shown_columns,
669
- search_bar,
670
- # filter_columns_type,
671
- filter_domain_specific,
672
- filter_columns_size
673
- # filter_columns_architecture
674
- ],
675
- leaderboard_table,
676
- )
677
- for selector in [
678
- shown_columns,
679
- # filter_columns_type,
680
- filter_domain_specific,
681
- filter_columns_size,
682
- # deleted_models_visibility,
683
- ]:
684
- selector.change(
685
- update_table,
686
- [
687
- hidden_leaderboard_table_for_search,
688
- shown_columns,
689
- search_bar,
690
- # filter_columns_type,
691
- filter_domain_specific,
692
- filter_columns_size
693
- ],
694
- leaderboard_table,
695
- queue=True,
696
- )
697
- with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-table2", id=1):
698
- with gr.Row():
699
- with gr.Column():
700
- with gr.Row():
701
- search_bar = gr.Textbox(
702
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
703
- show_label=False,
704
- elem_id="search-bar",
705
- )
706
- with gr.Row():
707
- shown_columns = gr.CheckboxGroup(
708
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
709
- value=[
710
- c.name
711
- for c in fields(AutoEvalColumn)
712
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
713
- ],
714
- label="Select columns to show",
715
- elem_id="column-select",
716
- interactive=True,
717
- )
718
- # with gr.Row():
719
- # deleted_models_visibility = gr.Checkbox(
720
- # value=False, label="Show gated/private/deleted models", interactive=True
721
- # )
722
- with gr.Column(min_width=320):
723
- # with gr.Box(elem_id="box-filter"):
724
- # filter_columns_type = gr.CheckboxGroup(
725
- # label="Model Types",
726
- # choices=[t.to_str() for t in ModelType],
727
- # value=[t.to_str() for t in ModelType],
728
- # interactive=True,
729
- # elem_id="filter-columns-type",
730
- # )
731
- # filter_columns_architecture = gr.CheckboxGroup(
732
- # label="Architecture Types",
733
- # choices=[i.value.name for i in ModelArch],
734
- # value=[i.value.name for i in ModelArch],
735
- # interactive=True,
736
- # elem_id="filter-columns-architecture",
737
- # )
738
- filter_domain_specific = gr.CheckboxGroup(
739
- label="Domain Specificity",
740
- choices=["🏥 Clinical models", "Generic models"],
741
- value=["🏥 Clinical models", "Generic models"],
742
- interactive=True,
743
- elem_id="filter-columns-type",
744
- )
745
- filter_columns_size = gr.CheckboxGroup(
746
- label="Model sizes (in billions of parameters)",
747
- choices=list(NUMERIC_INTERVALS.keys()),
748
- value=list(NUMERIC_INTERVALS.keys()),
749
- interactive=True,
750
- elem_id="filter-columns-size",
751
- )
752
-
753
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
754
-
755
- leaderboard_table = gr.components.Dataframe(
756
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
757
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
758
- datatype=TYPES,
759
- elem_id="leaderboard-table",
760
- interactive=False,
761
- visible=True,
762
  )
763
-
764
- # Dummy leaderboard for handling the case when the user uses backspace key
765
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
766
- value=datasets_original_df[SOAP_COLS],
767
- headers=SOAP_COLS,
768
- datatype=TYPES,
769
- visible=False,
770
- )
771
-
772
-
773
- search_bar.submit(
774
- update_table,
775
- [
776
- hidden_leaderboard_table_for_search,
777
- shown_columns,
778
- search_bar,
779
- # filter_columns_type,
780
- filter_domain_specific,
781
- filter_columns_size
782
- # filter_columns_architecture
783
- ],
784
- leaderboard_table,
785
- )
786
- for selector in [
787
- shown_columns,
788
- # filter_columns_type,
789
- filter_domain_specific,
790
- filter_columns_size,
791
- # deleted_models_visibility,
792
- ]:
793
- selector.change(
794
- update_table,
795
- [
796
- hidden_leaderboard_table_for_search,
797
- shown_columns,
798
- search_bar,
799
- # filter_columns_type,
800
- filter_domain_specific,
801
- filter_columns_size
802
- ],
803
- leaderboard_table,
804
- queue=True,
805
- )
806
- with gr.Accordion("💬 Generation templates", open=False):
807
- with gr.Accordion("ACI-Bench Response generation", open=False):
808
- system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
809
- with gr.Accordion("SOAP Notes Response generation", open=False):
810
- system_prompt, user_prompt = render_generation_templates(task="soap", generation_type="response_generation")
811
- with gr.Accordion("Question generation", open=False):
812
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
813
- with gr.Accordion("Cross Examination", open=False):
814
- system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
815
 
816
  with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
817
  gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
818
- with gr.Tabs(elem_classes="tab-buttons2") as tabs:
819
- with gr.TabItem("HealthBench", elem_id="llm-benchmark-tab-table3", id=0):
820
- with gr.Row():
821
- with gr.Column():
822
- with gr.Row():
823
- search_bar = gr.Textbox(
824
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
825
- show_label=False,
826
- elem_id="search-bar",
827
- )
828
- with gr.Row():
829
- shown_columns = gr.CheckboxGroup(
830
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
831
- value=[
832
- c.name
833
- for c in fields(AutoEvalColumn)
834
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)
835
- ],
836
- label="Select columns to show",
837
- elem_id="column-select",
838
- interactive=True,
839
- )
840
- # with gr.Row():
841
- # deleted_models_visibility = gr.Checkbox(
842
- # value=False, label="Show gated/private/deleted models", interactive=True
843
- # )
844
- with gr.Column(min_width=320):
845
- # with gr.Box(elem_id="box-filter"):
846
- # filter_columns_type = gr.CheckboxGroup(
847
- # label="Model Types",
848
- # choices=[t.to_str() for t in ModelType],
849
- # value=[t.to_str() for t in ModelType],
850
- # interactive=True,
851
- # elem_id="filter-columns-type",
852
- # )
853
- # filter_columns_architecture = gr.CheckboxGroup(
854
- # label="Architecture Types",
855
- # choices=[i.value.name for i in ModelArch],
856
- # value=[i.value.name for i in ModelArch],
857
- # interactive=True,
858
- # elem_id="filter-columns-architecture",
859
- # )
860
- filter_domain_specific = gr.CheckboxGroup(
861
- label="Domain Specificity",
862
- choices=["🏥 Clinical models", "Generic models"],
863
- value=["🏥 Clinical models", "Generic models"],
864
- interactive=True,
865
- elem_id="filter-columns-type",
866
- )
867
- filter_columns_size = gr.CheckboxGroup(
868
- label="Model sizes (in billions of parameters)",
869
- choices=list(NUMERIC_INTERVALS.keys()),
870
- value=list(NUMERIC_INTERVALS.keys()),
871
- interactive=True,
872
- elem_id="filter-columns-size",
873
- )
874
-
875
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench")
876
-
877
- leaderboard_table = gr.components.Dataframe(
878
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
879
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
880
- datatype=TYPES,
881
- elem_id="leaderboard-table",
882
- interactive=False,
883
- visible=True,
884
  )
885
-
886
- # Dummy leaderboard for handling the case when the user uses backspace key
887
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
888
- value=datasets_original_df[HEALTHBENCH_COLS],
889
- headers=HEALTHBENCH_COLS,
890
- datatype=TYPES,
891
- visible=False,
892
- )
893
-
894
-
895
- search_bar.submit(
896
- update_table,
897
- [
898
- hidden_leaderboard_table_for_search,
899
- shown_columns,
900
- search_bar,
901
- # filter_columns_type,
902
- filter_domain_specific,
903
- filter_columns_size
904
- # filter_columns_architecture
905
- ],
906
- leaderboard_table,
907
  )
908
- for selector in [
909
- shown_columns,
910
- # filter_columns_type,
911
- filter_domain_specific,
912
- filter_columns_size,
913
- # deleted_models_visibility,
914
- ]:
915
- selector.change(
916
- update_table,
917
- [
918
- hidden_leaderboard_table_for_search,
919
- shown_columns,
920
- search_bar,
921
- # filter_columns_type,
922
- filter_domain_specific,
923
- filter_columns_size
924
- ],
925
- leaderboard_table,
926
- queue=True,
927
- )
928
- with gr.TabItem("HealthBench-Hard", elem_id="llm-benchmark-tab-table3", id=1):
929
- with gr.Row():
930
- with gr.Column():
931
- with gr.Row():
932
- search_bar = gr.Textbox(
933
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
934
- show_label=False,
935
- elem_id="search-bar",
936
- )
937
- with gr.Row():
938
- shown_columns = gr.CheckboxGroup(
939
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
940
- value=[
941
- c.name
942
- for c in fields(AutoEvalColumn)
943
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)
944
- ],
945
- label="Select columns to show",
946
- elem_id="column-select",
947
- interactive=True,
948
- )
949
- # with gr.Row():
950
- # deleted_models_visibility = gr.Checkbox(
951
- # value=False, label="Show gated/private/deleted models", interactive=True
952
- # )
953
- with gr.Column(min_width=320):
954
- # with gr.Box(elem_id="box-filter"):
955
- # filter_columns_type = gr.CheckboxGroup(
956
- # label="Model Types",
957
- # choices=[t.to_str() for t in ModelType],
958
- # value=[t.to_str() for t in ModelType],
959
- # interactive=True,
960
- # elem_id="filter-columns-type",
961
- # )
962
- # filter_columns_architecture = gr.CheckboxGroup(
963
- # label="Architecture Types",
964
- # choices=[i.value.name for i in ModelArch],
965
- # value=[i.value.name for i in ModelArch],
966
- # interactive=True,
967
- # elem_id="filter-columns-architecture",
968
- # )
969
- filter_domain_specific = gr.CheckboxGroup(
970
- label="Domain Specificity",
971
- choices=["🏥 Clinical models", "Generic models"],
972
- value=["🏥 Clinical models", "Generic models"],
973
- interactive=True,
974
- elem_id="filter-columns-type",
975
- )
976
- filter_columns_size = gr.CheckboxGroup(
977
- label="Model sizes (in billions of parameters)",
978
- choices=list(NUMERIC_INTERVALS.keys()),
979
- value=list(NUMERIC_INTERVALS.keys()),
980
- interactive=True,
981
- elem_id="filter-columns-size",
982
- )
983
-
984
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="healthbench_hard")
985
-
986
- leaderboard_table = gr.components.Dataframe(
987
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
988
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
989
- datatype=TYPES,
990
- elem_id="leaderboard-table",
991
- interactive=False,
992
- visible=True,
993
- )
994
-
995
- # Dummy leaderboard for handling the case when the user uses backspace key
996
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
997
- value=datasets_original_df[HEALTHBENCH_HARD_COLS],
998
- headers=HEALTHBENCH_HARD_COLS,
999
- datatype=TYPES,
1000
- visible=False,
1001
- )
1002
-
1003
-
1004
- search_bar.submit(
1005
- update_table,
1006
- [
1007
- hidden_leaderboard_table_for_search,
1008
- shown_columns,
1009
- search_bar,
1010
- # filter_columns_type,
1011
- filter_domain_specific,
1012
- filter_columns_size
1013
- # filter_columns_architecture
1014
- ],
1015
- leaderboard_table,
1016
- )
1017
- for selector in [
1018
- shown_columns,
1019
- # filter_columns_type,
1020
- filter_domain_specific,
1021
- filter_columns_size,
1022
- # deleted_models_visibility,
1023
- ]:
1024
- selector.change(
1025
- update_table,
1026
- [
1027
- hidden_leaderboard_table_for_search,
1028
- shown_columns,
1029
- search_bar,
1030
- # filter_columns_type,
1031
- filter_domain_specific,
1032
- filter_columns_size
1033
- ],
1034
- leaderboard_table,
1035
- queue=True,
1036
- )
1037
 
1038
  with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=5):
1039
- with gr.Row():
1040
- with gr.Column():
1041
- with gr.Row():
1042
- search_bar = gr.Textbox(
1043
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1044
- show_label=False,
1045
- elem_id="search-bar",
1046
- )
1047
- with gr.Row():
1048
- shown_columns = gr.CheckboxGroup(
1049
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
1050
- value=[
1051
- c.name
1052
- for c in fields(AutoEvalColumn)
1053
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
1054
- ],
1055
- label="Select columns to show",
1056
- elem_id="column-select",
1057
- interactive=True,
1058
- )
1059
- # with gr.Row():
1060
- # deleted_models_visibility = gr.Checkbox(
1061
- # value=False, label="Show gated/private/deleted models", interactive=True
1062
- # )
1063
- with gr.Column(min_width=320):
1064
- # with gr.Box(elem_id="box-filter"):
1065
- # filter_columns_type = gr.CheckboxGroup(
1066
- # label="Model Types",
1067
- # choices=[t.to_str() for t in ModelType],
1068
- # value=[t.to_str() for t in ModelType],
1069
- # interactive=True,
1070
- # elem_id="filter-columns-type",
1071
- # )
1072
- # filter_columns_architecture = gr.CheckboxGroup(
1073
- # label="Architecture Types",
1074
- # choices=[i.value.name for i in ModelArch],
1075
- # value=[i.value.name for i in ModelArch],
1076
- # interactive=True,
1077
- # elem_id="filter-columns-architecture",
1078
- # )
1079
- filter_domain_specific = gr.CheckboxGroup(
1080
- label="Domain Specificity",
1081
- choices=["🏥 Clinical models", "Generic models"],
1082
- value=["🏥 Clinical models", "Generic models"],
1083
- interactive=True,
1084
- elem_id="filter-columns-type",
1085
- )
1086
- filter_columns_size = gr.CheckboxGroup(
1087
- label="Model sizes (in billions of parameters)",
1088
- choices=list(NUMERIC_INTERVALS.keys()),
1089
- value=list(NUMERIC_INTERVALS.keys()),
1090
- interactive=True,
1091
- elem_id="filter-columns-size",
1092
- )
1093
-
1094
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
1095
-
1096
- leaderboard_table = gr.components.Dataframe(
1097
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1098
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1099
- datatype=TYPES,
1100
- elem_id="leaderboard-table",
1101
- interactive=False,
1102
- visible=True,
1103
  )
1104
-
1105
- # Dummy leaderboard for handling the case when the user uses backspace key
1106
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1107
- value=datasets_original_df[MED_SAFETY_COLS],
1108
- headers=MED_SAFETY_COLS,
1109
- datatype=TYPES,
1110
- visible=False,
1111
- )
1112
-
1113
-
1114
- search_bar.submit(
1115
- update_table,
1116
- [
1117
- hidden_leaderboard_table_for_search,
1118
- shown_columns,
1119
- search_bar,
1120
- # filter_columns_type,
1121
- filter_domain_specific,
1122
- filter_columns_size
1123
- # filter_columns_architecture
1124
- ],
1125
- leaderboard_table,
1126
- )
1127
- for selector in [
1128
- shown_columns,
1129
- # filter_columns_type,
1130
- filter_domain_specific,
1131
- filter_columns_size,
1132
- # deleted_models_visibility,
1133
- ]:
1134
- selector.change(
1135
- update_table,
1136
- [
1137
- hidden_leaderboard_table_for_search,
1138
- shown_columns,
1139
- search_bar,
1140
- # filter_columns_type,
1141
- filter_domain_specific,
1142
- filter_columns_size
1143
- ],
1144
- leaderboard_table,
1145
- queue=True,
1146
- )
1147
  with gr.Accordion("💬 Generation templates", open=False):
1148
  with gr.Accordion("Response generation", open=False):
1149
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
1150
  with gr.Accordion("Scoring Rubric", open=False):
1151
- system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
1152
-
1153
  with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
1154
- with gr.Tabs(elem_classes="tab-buttons2") as closed_tabs:
1155
- # ENGLISH TAB
1156
- with gr.TabItem("English", elem_id="llm-benchmark-tab-closed-english", id=0):
1157
- with gr.Row():
1158
- with gr.Column():
1159
- with gr.Row():
1160
- search_bar = gr.Textbox(
1161
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1162
- show_label=False,
1163
- elem_id="search-bar-closed-english",
1164
- )
1165
- with gr.Row():
1166
- shown_columns = gr.CheckboxGroup(
1167
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
1168
- value=[
1169
- c.name
1170
- for c in fields(AutoEvalColumn)
1171
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
1172
- ],
1173
- label="Select columns to show",
1174
- elem_id="column-select-closed-english",
1175
- interactive=True,
1176
- )
1177
- with gr.Column(min_width=320):
1178
- # filter_columns_type = gr.CheckboxGroup(
1179
- # label="Model Types",
1180
- # choices=[t.to_str() for t in ModelType],
1181
- # value=[t.to_str() for t in ModelType],
1182
- # interactive=True,
1183
- # elem_id="filter-columns-type-closed-english",
1184
- # )
1185
- filter_domain_specific = gr.CheckboxGroup(
1186
- label="Domain Specificity",
1187
- choices=["🏥 Clinical models", "Generic models"],
1188
- value=["🏥 Clinical models", "Generic models"],
1189
- interactive=True,
1190
- elem_id="filter-domain-specific-closed-english",
1191
- )
1192
- filter_columns_size = gr.CheckboxGroup(
1193
- label="Model sizes (in billions of parameters)",
1194
- choices=list(NUMERIC_INTERVALS.keys()),
1195
- value=list(NUMERIC_INTERVALS.keys()),
1196
- interactive=True,
1197
- elem_id="filter-columns-size-closed-english",
1198
- )
1199
-
1200
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
1201
- leaderboard_table = gr.components.Dataframe(
1202
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1203
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1204
- datatype=TYPES,
1205
- elem_id="leaderboard-table-english",
1206
- interactive=False,
1207
- visible=True,
1208
  )
1209
 
1210
- # Dummy leaderboard for handling the case when the user uses backspace key
1211
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1212
- value=datasets_original_df[DATASET_COLS],
1213
- headers=DATASET_COLS,
1214
- datatype=TYPES,
1215
- visible=False,
 
1216
  )
1217
-
1218
- search_bar.submit(
1219
- update_table,
1220
- [
1221
- hidden_leaderboard_table_for_search,
1222
- shown_columns,
1223
- search_bar,
1224
- # filter_columns_type,
1225
- filter_domain_specific,
1226
- filter_columns_size
1227
- ],
1228
- leaderboard_table,
1229
  )
1230
-
1231
- for selector in [
1232
- shown_columns,
1233
- # filter_columns_type,
1234
- filter_domain_specific,
1235
- filter_columns_size,
1236
- ]:
1237
- selector.change(
1238
- update_table,
1239
- [
1240
- hidden_leaderboard_table_for_search,
1241
- shown_columns,
1242
- search_bar,
1243
- # filter_columns_type,
1244
- filter_domain_specific,
1245
- filter_columns_size
1246
- ],
1247
- leaderboard_table,
1248
- queue=True,
1249
- )
1250
-
1251
- #MULTILINGUAL TAB - Same level as English tab
1252
- with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
1253
- with gr.Row():
1254
- gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
1255
-
1256
- with gr.Row():
1257
- with gr.Column():
1258
- with gr.Row():
1259
- search_bar = gr.Textbox(
1260
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1261
- show_label=False,
1262
- elem_id="search-bar",
1263
- )
1264
-
1265
- with gr.Row():
1266
- shown_columns = gr.CheckboxGroup(
1267
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
1268
- value=[
1269
- c.name
1270
- for c in fields(AutoEvalColumn)
1271
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
1272
- ],
1273
- label="Select columns to show",
1274
- elem_id="column-select",
1275
- interactive=True,
1276
- )
1277
- with gr.Column(min_width=320):
1278
- # with gr.Box(elem_id="box-filter"):
1279
- # filter_columns_type = gr.CheckboxGroup(
1280
- # label="Model Types",
1281
- # choices=[t.to_str() for t in ModelType],
1282
- # value=[t.to_str() for t in ModelType],
1283
- # interactive=True,
1284
- # elem_id="filter-columns-type",
1285
- # )
1286
- filter_domain_specific = gr.CheckboxGroup(
1287
- label="Domain Specificity",
1288
- choices=["🏥 Clinical models", "Generic models"],
1289
- value=["🏥 Clinical models", "Generic models"],
1290
- interactive=True,
1291
- elem_id="filter-columns-type",
1292
- )
1293
- filter_columns_size = gr.CheckboxGroup(
1294
- label="Model sizes (in billions of parameters)",
1295
- choices=list(NUMERIC_INTERVALS.keys()),
1296
- value=list(NUMERIC_INTERVALS.keys()),
1297
- interactive=True,
1298
- elem_id="filter-columns-size",
1299
- )
1300
 
1301
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1302
- leaderboard_table = gr.components.Dataframe(
1303
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1304
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1305
- datatype=TYPES,
1306
- elem_id="leaderboard-table",
1307
- interactive=False,
1308
- visible=True,
1309
  )
1310
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
1311
- value=datasets_original_df[ClosedEndedMultilingual_COLS],
1312
- headers=ClosedEndedMultilingual_COLS,
1313
- datatype=TYPES,
1314
- visible=False,
1315
  )
1316
-
1317
- search_bar.submit(
1318
- update_table,
1319
- [
1320
- hidden_leaderboard_table_for_search,
1321
- shown_columns,
1322
- search_bar,
1323
- # filter_columns_type,
1324
- filter_domain_specific,
1325
- filter_columns_size
1326
- # filter_columns_architecture
1327
- ],
1328
- leaderboard_table,
1329
  )
1330
- for selector in [
1331
- shown_columns,
1332
- # filter_columns_type,
1333
- filter_domain_specific,
1334
- # filter_columns_architecture,
1335
- filter_columns_size,
1336
- # deleted_models_visibility,
1337
- ]:
1338
- selector.change(
1339
- update_table,
1340
- [
1341
- hidden_leaderboard_table_for_search,
1342
- shown_columns,
1343
- search_bar,
1344
- # filter_columns_type,
1345
- filter_domain_specific,
1346
- filter_columns_size
1347
- # filter_columns_architecture,
1348
- ],
1349
- leaderboard_table,
1350
- queue=True,
1351
- )
1352
-
1353
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
1354
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
1355
  gr.HTML(FIVE_PILLAR_DIAGRAM)
1356
  gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1357
- # gr.HTML(EVALUATION_EXAMPLE_IMG, elem_classes="logo")
1358
- # gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
1359
- # gr.HTML(ENTITY_DISTRIBUTION_IMG, elem_classes="logo")
1360
- # gr.Markdown(LLM_BENCHMARKS_TEXT_3, elem_classes="markdown-text")
1361
 
1362
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=8):
 
1363
  with gr.Column():
1364
- with gr.Row():
1365
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1366
-
1367
- with gr.Column():
1368
- with gr.Accordion(
1369
- f" Finished Evaluations ({len(finished_eval_queue_df)})",
1370
- open=False,
1371
- ):
1372
- with gr.Row():
1373
- finished_eval_table = gr.components.Dataframe(
1374
- value=finished_eval_queue_df,
1375
- headers=EVAL_COLS,
1376
- datatype=EVAL_TYPES,
1377
- row_count=5,
1378
- )
1379
- with gr.Accordion(
1380
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
1381
- open=False,
1382
- ):
1383
- with gr.Row():
1384
- running_eval_table = gr.components.Dataframe(
1385
- value=running_eval_queue_df,
1386
- headers=EVAL_COLS,
1387
- datatype=EVAL_TYPES,
1388
- row_count=5,
1389
- )
1390
-
1391
- with gr.Accordion(
1392
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
1393
- open=False,
1394
- ):
1395
- with gr.Row():
1396
- pending_eval_table = gr.components.Dataframe(
1397
- value=pending_eval_queue_df,
1398
- headers=EVAL_COLS,
1399
- datatype=EVAL_TYPES,
1400
- row_count=5,
1401
- )
1402
  with gr.Row():
1403
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
1404
-
1405
  with gr.Row():
1406
  with gr.Column():
1407
  model_name_textbox = gr.Textbox(label="Model name")
@@ -1459,10 +498,9 @@ with demo:
1459
  submission_result,
1460
  )
1461
 
1462
-
1463
  with gr.Row():
1464
  with gr.Accordion("📙 Citation", open=False):
1465
- citation_button = gr.Textbox(
1466
  value=CITATION_BUTTON_TEXT,
1467
  label=CITATION_BUTTON_LABEL,
1468
  lines=20,
@@ -1470,7 +508,9 @@ with demo:
1470
  show_copy_button=True,
1471
  )
1472
 
 
1473
  scheduler = BackgroundScheduler()
1474
- scheduler.add_job(restart_space, "interval", seconds=1800)
1475
  scheduler.start()
1476
- demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
  import time
6
+ import functools
7
+ import gc
8
+
9
+ import os
10
 
11
  from src.about import (
12
+ CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT,
13
+ LLM_BENCHMARKS_TEXT_1, LLM_BENCHMARKS_TEXT_2, CROSS_EVALUATION_METRICS,
14
+ NOTE_GENERATION_METRICS, HEALTHBENCH_METRICS, TITLE, LOGO, FIVE_PILLAR_DIAGRAM
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  )
16
  from src.display.css_html_js import custom_css
 
17
  from src.display.utils import (
18
+ DATASET_BENCHMARK_COLS, OPEN_ENDED_BENCHMARK_COLS, MED_SAFETY_BENCHMARK_COLS,
19
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS, ACI_BENCHMARK_COLS, SOAP_BENCHMARK_COLS,
20
+ HEALTHBENCH_BENCHMARK_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, DATASET_COLS,
21
+ OPEN_ENDED_COLS, MED_SAFETY_COLS, MEDICAL_SUMMARIZATION_COLS, ACI_COLS, SOAP_COLS,
22
+ HEALTHBENCH_COLS, HEALTHBENCH_HARD_COLS, EVAL_COLS, EVAL_TYPES, NUMERIC_INTERVALS,
23
+ TYPES, AutoEvalColumn, ModelType, Precision, WeightType, fields, render_generation_templates,
24
+ OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, OpenEndedFrench_COLS,
25
+ OpenEndedFrench_BENCHMARK_COLS, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS,
26
+ OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, OpenEndedGreek_COLS,
27
+ OpenEndedGreek_BENCHMARK_COLS, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS,
28
+ ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS,
29
+ EHRSQL_ZERO_SHOT_COLS, EHRSQL_ZERO_SHOT_BENCHMARK_COLS,
30
+ EHRSQL_FEW_SHOT_COLS, EHRSQL_FEW_SHOT_BENCHMARK_COLS,
31
+ MEDCALC_DIRECT_ANSWER_COLS, MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS,
32
+ MEDCALC_ONE_SHOT_COT_COLS, MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS,
33
+ MEDCALC_ZERO_SHOT_COT_COLS, MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS,
34
+ MEDEC_ZERO_SHOT_COLS, MEDEC_ZERO_SHOT_BENCHMARK_COLS,
35
+ MEDEC_ONE_SHOT_COLS, MEDEC_ONE_SHOT_BENCHMARK_COLS,
 
 
 
 
36
  )
37
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
38
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
39
+ from src.submission.submit import add_new_eval
40
+
41
+ # =====================================================================================
42
+ # 1. SETUP AND DATA LOADING
43
+ # =====================================================================================
44
 
45
  def restart_space():
46
  API.restart_space(repo_id=REPO_ID)
47
 
48
 
49
+ print("Downloading evaluation data...")
 
 
50
  try:
51
+ snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
52
+ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
53
+ print("Downloads complete.")
54
+ except Exception as e:
55
+ print(f"An error occurred during download: {e}")
 
 
56
  restart_space()
57
 
58
+ print("Loading all dataframes into a central dictionary...")
 
 
59
  start_time = time.time()
60
 
61
  _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 
 
 
62
  _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
 
 
 
63
  _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
 
 
 
64
  _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
 
 
 
65
  _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
 
 
 
66
  _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
 
 
 
67
  _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
 
 
68
  _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
 
 
 
69
  _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
70
  _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
71
  _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
72
  _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
73
  _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
74
  _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
75
  _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
76
+ _, ehrsql_zero_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EHRSQL_ZERO_SHOT_COLS, EHRSQL_ZERO_SHOT_BENCHMARK_COLS, "score", "ehrsql_zero_shot")
77
+ _, ehrsql_few_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EHRSQL_FEW_SHOT_COLS, EHRSQL_FEW_SHOT_BENCHMARK_COLS, "score", "ehrsql_few_shot")
78
+ _, medcalc_direct_answer_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_DIRECT_ANSWER_COLS, MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS, "score", "medcalc_direct_answer")
79
+ _, medcalc_one_shot_cot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_ONE_SHOT_COT_COLS, MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS, "score", "medcalc_one_shot_cot")
80
+ _, medcalc_zero_shot_cot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDCALC_ZERO_SHOT_COT_COLS, MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS, "score", "medcalc_zero_shot_cot")
81
+ _, medec_zero_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDEC_ZERO_SHOT_COLS, MEDEC_ZERO_SHOT_BENCHMARK_COLS, "score", "medec_zero_shot")
82
+ _, medec_one_shot_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDEC_ONE_SHOT_COLS, MEDEC_ONE_SHOT_BENCHMARK_COLS, "score", "medec_one_shot")
83
+
84
+ # Debug check for openai/gpt-oss-20b
85
+
86
+ ALL_DATASETS = {
87
+ "datasets": harness_datasets_original_df,
88
+ "open_ended": open_ended_original_df,
89
+ "med_safety": med_safety_original_df,
90
+ "medical_summarization": medical_summarization_original_df,
91
+ "aci": aci_original_df,
92
+ "soap": soap_original_df,
93
+ "healthbench": healthbench_original_df,
94
+ "healthbench_hard": healthbench_hard_original_df,
95
+ "open_ended_arabic": open_ended_arabic_df,
96
+ "open_ended_french": open_ended_french_df,
97
+ "open_ended_portuguese": open_ended_portuguese_df,
98
+ "open_ended_romanian": open_ended_romanian_df,
99
+ "open_ended_greek": open_ended_greek_df,
100
+ "open_ended_spanish": open_ended_spanish_df,
101
+ "closed_ended_multilingual": closed_ended_multilingual_df,
102
+ "ehrsql_zero_shot": ehrsql_zero_shot_df,
103
+ "ehrsql_few_shot": ehrsql_few_shot_df,
104
+ "medcalc_direct_answer": medcalc_direct_answer_df,
105
+ "medcalc_one_shot_cot": medcalc_one_shot_cot_df,
106
+ "medcalc_zero_shot_cot": medcalc_zero_shot_cot_df,
107
+ "medec_zero_shot": medec_zero_shot_df,
108
+ "medec_one_shot": medec_one_shot_df,
109
+ }
110
  end_time = time.time()
111
+ print(f"Dataframes loaded in {end_time - start_time:.2f} seconds.")
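Every subset above is loaded with the same call shape, so a small spec table can keep the `get_leaderboard_df` calls and the `ALL_DATASETS` keys from drifting apart. A minimal sketch, not part of this commit, reusing the names already imported at the top of app.py (only a few subsets shown):

```python
# Sketch only: drive the per-subset loads from one table instead of ~20 near-identical lines.
# Assumes the imports above (get_leaderboard_df, the *_COLS / *_BENCHMARK_COLS lists, env paths).
SUBSET_SPECS = {
    "ehrsql_zero_shot": (EHRSQL_ZERO_SHOT_COLS, EHRSQL_ZERO_SHOT_BENCHMARK_COLS, "score"),
    "ehrsql_few_shot": (EHRSQL_FEW_SHOT_COLS, EHRSQL_FEW_SHOT_BENCHMARK_COLS, "score"),
    "medec_zero_shot": (MEDEC_ZERO_SHOT_COLS, MEDEC_ZERO_SHOT_BENCHMARK_COLS, "score"),
    # ... remaining subsets follow the same (cols, benchmark_cols, metric) pattern
}

ALL_DATASETS = {
    name: get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, cols, bench, metric, name)[1]
    for name, (cols, bench, metric) in SUBSET_SPECS.items()
}
```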
 
 
 
 
 
 
112
 
113
+ # Evaluation Queue DataFrames
114
+ (finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 
 
115
 
116
+ # =====================================================================================
117
+ # 2. EFFICIENT FILTERING LOGIC
118
+ # =====================================================================================
119
 
120
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
121
  return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
122
 
 
 
 
 
 
 
 
 
 
 
 
123
  def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
124
  final_df = []
125
  if query != "":
 
135
  filtered_df = filtered_df.drop_duplicates(
136
  subset=[
137
  AutoEvalColumn.model.name,
 
 
138
  ]
139
  )
140
 
 
144
  def filter_models(
145
  df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
146
  ) -> pd.DataFrame:
 
 
 
 
 
147
 
148
  filtered_df = df
149
 
 
153
 
154
  if domain_specific_query is not None:
155
  domain_specifics = []
156
+ if "🏥 Clinical models" in domain_specific_query:
157
  domain_specifics.append(True)
158
  if "Generic models" in domain_specific_query:
159
  domain_specifics.append(False)
160
  filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
161
+
 
 
 
 
 
162
  if precision_query is not None:
163
  if AutoEvalColumn.precision.name in df.columns:
164
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
171
 
172
  return filtered_df
173
 
174
+ def get_filtered_table(
175
+ shown_columns: list,
176
+ query: str,
177
+ domain_specific_query: list,
178
+ size_query: list,
179
+ *, # force subset_name to be a keyword-only argument
180
+ subset_name: str
181
+ ):
182
+ original_df = ALL_DATASETS[subset_name]
183
+
184
+ type_query = None
185
+ filtered_df = filter_models(original_df, type_query, domain_specific_query, size_query, None, False)
186
+ filtered_df = filter_queries(query, filtered_df)
187
+
188
+ always_here_cols = [AutoEvalColumn.model.name]
189
+ available_cols = [c for c in shown_columns if c in filtered_df.columns]
190
+ final_df = filtered_df[always_here_cols + available_cols]
191
+
192
+ del filtered_df
193
+ gc.collect()
194
+
195
+
196
+ return final_df
197
+
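Because `get_filtered_table` is a plain function over the preloaded dataframes, the filtering path can be smoke-tested without launching the UI. A rough sketch, assuming `ALL_DATASETS` has been populated and `"ehrsql_zero_shot"` is one of its keys:

```python
# Illustrative only: exercise the filter pipeline outside of any Gradio callback.
sample = get_filtered_table(
    shown_columns=["RS (0)"],                                      # subset-specific columns to display
    query="llama",                                                 # substring match against the model column
    domain_specific_query=["🏥 Clinical models", "Generic models"],
    size_query=list(NUMERIC_INTERVALS.keys()),
    subset_name="ehrsql_zero_shot",                                # keyword-only, selects the dataframe
)
print(sample.head())
```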
198
+ # =====================================================================================
199
+ # 3. REUSABLE UI CREATION FUNCTION
200
+ # =====================================================================================
201
+
202
+ def create_leaderboard_ui(subset_name: str, column_choices: list, default_columns: list):
203
+ """Creates a full leaderboard UI block for a given subset."""
204
+ with gr.Row():
205
+ with gr.Column():
206
+ with gr.Row():
207
+ search_bar = gr.Textbox(
208
+ placeholder="🔍 Search for models...",
209
+ show_label=False,
210
+ elem_id=f"search-bar-{subset_name}",
211
+ )
212
+ with gr.Row():
213
+ shown_columns = gr.CheckboxGroup(
214
+ choices=column_choices,
215
+ value=default_columns,
216
+ label="Select columns to show",
217
+ elem_id=f"column-select-{subset_name}",
218
+ interactive=True,
219
+ )
220
+ with gr.Column(min_width=320):
221
+ filter_domain_specific = gr.CheckboxGroup(
222
+ label="Domain Specificity",
223
+ choices=["🏥 Clinical models", "Generic models"],
224
+ value=["🏥 Clinical models", "Generic models"],
225
+ interactive=True,
226
+ elem_id=f"filter-domain-{subset_name}",
227
+ )
228
+ filter_columns_size = gr.CheckboxGroup(
229
+ label="Model sizes (in billions of parameters)",
230
+ choices=list(NUMERIC_INTERVALS.keys()),
231
+ value=list(NUMERIC_INTERVALS.keys()),
232
+ interactive=True,
233
+ elem_id=f"filter-size-{subset_name}",
234
+ )
235
+
236
+ update_fn = functools.partial(get_filtered_table, subset_name=subset_name)
237
+
238
+ initial_df = update_fn(
239
+ shown_columns=default_columns,
240
+ query="",
241
+ domain_specific_query=["🏥 Clinical models", "Generic models"],
242
+ size_query=list(NUMERIC_INTERVALS.keys())
243
+ )
244
+
245
+ leaderboard_table = gr.Dataframe(
246
+ value=initial_df,
247
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + default_columns,
248
+ datatype=TYPES,
249
+ elem_id=f"leaderboard-table-{subset_name}",
250
+ interactive=False,
251
+ )
252
+
253
+ inputs = [shown_columns, search_bar, filter_domain_specific, filter_columns_size]
254
+
255
+ # Attach listeners to all input components
256
+ for component in inputs:
257
+ if isinstance(component, gr.Textbox):
258
+ component.submit(update_fn, inputs, leaderboard_table)
259
+ else:
260
+ component.change(update_fn, inputs, leaderboard_table)
261
+
262
+ return leaderboard_table
263
+
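Each tab below wires itself up with a single `create_leaderboard_ui` call. For reference, the call pattern for a hypothetical future subset would look like this (sketch only; `new_task` and the `new_task_col` flag are assumptions, not part of this commit):

```python
# Sketch of the per-tab idiom used throughout the layout below.
# "new_task" would need an entry in ALL_DATASETS and a `new_task_col` flag on ColumnContent.
with gr.TabItem("🏅 New Task"):
    create_leaderboard_ui(
        subset_name="new_task",
        column_choices=[c.name for c in fields(AutoEvalColumn)
                        if not c.hidden and not c.never_hidden and (c.invariant or c.new_task_col)],
        default_columns=[c.name for c in fields(AutoEvalColumn)
                         if c.displayed_by_default and not c.hidden and not c.never_hidden
                         and (c.invariant or c.new_task_col)],
    )
```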
264
+ # =====================================================================================
265
+ # 4. GRADIO DEMO UI (Main application layout)
266
+ # =====================================================================================
267
+
268
  demo = gr.Blocks(css=custom_css)
269
+
270
  with demo:
 
271
  gr.HTML(LOGO)
272
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
273
+
274
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
275
  with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
276
  with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
277
  LANGUAGES = {
278
+ "🇺🇸 English": "open_ended", "🇦🇪 Arabic": "open_ended_arabic",
279
+ "🇫🇷 French": "open_ended_french", "🇪🇸 Spanish": "open_ended_spanish",
280
+ "🇵🇹 Portuguese": "open_ended_portuguese", "🇷🇴 Romanian": "open_ended_romanian",
 
 
 
281
  "🇬🇷 Greek": "open_ended_greek",
282
  }
 
283
  for idx, (label, subset) in enumerate(LANGUAGES.items()):
284
  with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
285
+ judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English." if label == "🇺🇸 English" else "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
 
 
 
 
 
286
  gr.Markdown(judge_text, elem_classes="markdown-text")
287
 
288
+ create_leaderboard_ui(
289
+ subset_name=subset,
290
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
291
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)]
 
 
 
 
292
  )
 
 
 
 
293
  with gr.Accordion("💬 Generation templates", open=False):
294
  with gr.Accordion("Response generation", open=False):
295
  render_generation_templates(task="open_ended", generation_type="response_generation")
296
  with gr.Accordion("Scoring Rubric", open=False):
297
  render_generation_templates(task="open_ended", generation_type="scoring_rubric")
298
+
299
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
300
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
301
+ create_leaderboard_ui(
302
+ subset_name="medical_summarization",
303
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
304
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)]
 
 
 
 
305
  )
 
 
 
 
306
  with gr.Accordion("💬 Generation templates", open=False):
307
  with gr.Accordion("Response generation", open=False):
308
+ render_generation_templates(task="medical_summarization", generation_type="response_generation")
309
  with gr.Accordion("Question generation", open=False):
310
+ render_generation_templates(task="ce", generation_type="question_generation")
311
  with gr.Accordion("Cross Examination", open=False):
312
+ render_generation_templates(task="ce", generation_type="cross_examination")
313
+
314
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
315
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
316
+ with gr.Tabs(elem_classes="tab-buttons2"):
317
+ with gr.TabItem("ACI Bench", id=0):
318
+ create_leaderboard_ui(
319
+ subset_name="aci",
320
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
321
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)]
 
 
 
 
322
  )
323
+ with gr.TabItem("SOAP Notes", id=1):
324
+ create_leaderboard_ui(
325
+ subset_name="soap",
326
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
327
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)]
 
 
 
 
328
  )
329
+ # Add accordions for this section if needed, similar to other tabs
 
 
 
 
330
 
331
  with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
332
  gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
333
+ with gr.Tabs(elem_classes="tab-buttons2"):
334
+ with gr.TabItem("HealthBench", id=0):
335
+ create_leaderboard_ui(
336
+ subset_name="healthbench",
337
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
338
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)]
 
 
 
 
339
  )
340
+ with gr.TabItem("HealthBench-Hard", id=1):
341
+ create_leaderboard_ui(
342
+ subset_name="healthbench_hard",
343
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
344
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)]
 
 
 
 
345
  )
 
 
 
 
346
 
347
  with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=5):
348
+ create_leaderboard_ui(
349
+ subset_name="med_safety",
350
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
351
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)]
 
 
 
 
352
  )
 
 
 
 
353
  with gr.Accordion("💬 Generation templates", open=False):
354
  with gr.Accordion("Response generation", open=False):
355
+ render_generation_templates(task="med_safety", generation_type="response_generation")
356
  with gr.Accordion("Scoring Rubric", open=False):
357
+ render_generation_templates(task="med_safety", generation_type="scoring_rubric")
358
+
359
  with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
360
+ with gr.Tabs(elem_classes="tab-buttons2"):
361
+ with gr.TabItem("English", id=0):
362
+ create_leaderboard_ui(
363
+ subset_name="datasets",
364
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
365
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)]
366
+ )
367
+ with gr.TabItem("🌍 Multilingual", id=1):
368
+ gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset, filtered to the medical subcategory (10.7% of the dataset)")
369
+ create_leaderboard_ui(
370
+ subset_name="closed_ended_multilingual",
371
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
372
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)]
 
 
 
 
373
  )
374
 
375
+ with gr.TabItem("🏅 EHRSQL", elem_id="llm-benchmark-tab-table", id=7):
376
+ with gr.Tabs(elem_classes="tab-buttons2"):
377
+ with gr.TabItem("Zero Shot", id=0):
378
+ create_leaderboard_ui(
379
+ subset_name="ehrsql_zero_shot",
380
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.ehrsql_zero_shot_col)],
381
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.ehrsql_zero_shot_col)]
382
  )
383
+ with gr.TabItem("Few Shot", id=1):
384
+ create_leaderboard_ui(
385
+ subset_name="ehrsql_few_shot",
386
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.ehrsql_few_shot_col)],
387
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.ehrsql_few_shot_col)]
 
 
 
 
 
 
 
388
  )
 
 
 
 
389
 
390
+ with gr.TabItem("🏅 MedCalc", elem_id="llm-benchmark-tab-table", id=8):
391
+ with gr.Tabs(elem_classes="tab-buttons2"):
392
+ with gr.TabItem("Direct Answer", id=0):
393
+ create_leaderboard_ui(
394
+ subset_name="medcalc_direct_answer",
395
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_direct_answer_col)],
396
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_direct_answer_col)]
 
397
  )
398
+ with gr.TabItem("One Shot CoT", id=1):
399
+ create_leaderboard_ui(
400
+ subset_name="medcalc_one_shot_cot",
401
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_one_shot_cot_col)],
402
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_one_shot_cot_col)]
403
  )
404
+ with gr.TabItem("Zero Shot CoT", id=2):
405
+ create_leaderboard_ui(
406
+ subset_name="medcalc_zero_shot_cot",
407
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_zero_shot_cot_col)],
408
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medcalc_zero_shot_cot_col)]
 
 
 
 
 
 
 
 
409
  )
410
+
411
+ with gr.TabItem("🏅 MedEC", elem_id="llm-benchmark-tab-table", id=9):
412
+ with gr.Tabs(elem_classes="tab-buttons2"):
413
+ with gr.TabItem("Zero Shot", id=0):
414
+ create_leaderboard_ui(
415
+ subset_name="medec_zero_shot",
416
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medec_zero_shot_col)],
417
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medec_zero_shot_col)]
418
+ )
419
+ with gr.TabItem("One Shot", id=1):
420
+ create_leaderboard_ui(
421
+ subset_name="medec_one_shot",
422
+ column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medec_one_shot_col)],
423
+ default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medec_one_shot_col)]
424
+ )
425
+
426
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=10):
 
 
 
 
 
 
 
427
  gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
428
  gr.HTML(FIVE_PILLAR_DIAGRAM)
429
  gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
 
 
 
 
430
 
431
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=11):
432
+
433
  with gr.Column():
434
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
435
+ with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
436
+ gr.Dataframe(value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
437
+ with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
438
+ gr.Dataframe(value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
439
+ with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
440
+ gr.Dataframe(value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
441
+
 
 
 
 
442
  with gr.Row():
443
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
444
  with gr.Row():
445
  with gr.Column():
446
  model_name_textbox = gr.Textbox(label="Model name")
 
498
  submission_result,
499
  )
500
 
 
501
  with gr.Row():
502
  with gr.Accordion("📙 Citation", open=False):
503
+ gr.Textbox(
504
  value=CITATION_BUTTON_TEXT,
505
  label=CITATION_BUTTON_LABEL,
506
  lines=20,
 
508
  show_copy_button=True,
509
  )
510
 
511
+
512
  scheduler = BackgroundScheduler()
513
+ scheduler.add_job(restart_space, "interval", seconds=86400)
514
  scheduler.start()
515
+
516
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'], share=True, ssr_mode=False)
requirements.txt CHANGED
@@ -11,8 +11,6 @@ pandas
11
  python-dateutil
12
  requests
13
  tqdm
14
- git+https://github.com/huggingface/transformers.git
15
  tokenizers>=0.15.0
16
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
- accelerate
18
  sentencepiece
 
11
  python-dateutil
12
  requests
13
  tqdm
14
+ transformers
15
  tokenizers>=0.15.0
 
 
16
  sentencepiece
src/about.py CHANGED
@@ -209,6 +209,100 @@ class HealthbenchHardColumns(Enum):
209
  healthbench_hard_column12 = HealthbenchHardColumn("Axis: Instruction following", "score", "Axis: Instruction following")
210
  healthbench_hard_column13 = HealthbenchHardColumn("Axis: Communication quality", "score", "Axis: Communication quality")
211
 
 
 
 
 
212
  NUM_FEWSHOT = 0 # Change with your few shot
213
  # ---------------------------------------------------
214
 
 
209
  healthbench_hard_column12 = HealthbenchHardColumn("Axis: Instruction following", "score", "Axis: Instruction following")
210
  healthbench_hard_column13 = HealthbenchHardColumn("Axis: Communication quality", "score", "Axis: Communication quality")
211
 
212
+ @dataclass
213
+ class EHRSQLZeroShotColumn:
214
+ benchmark: str
215
+ metric: str
216
+ col_name: str
217
+
218
+ class EHRSQLZeroShotColumns(Enum):
219
+ ehrsql_zero_shot_column0 = EHRSQLZeroShotColumn("RS (0)", "score", "RS (0)")
220
+ ehrsql_zero_shot_column1 = EHRSQLZeroShotColumn("Abstains correct %", "score", "Abstains correct %")
221
+ ehrsql_zero_shot_column2 = EHRSQLZeroShotColumn("Abstains incorrect %", "score", "Abstains incorrect %")
222
+ ehrsql_zero_shot_column3 = EHRSQLZeroShotColumn("Abstains failed %", "score", "Abstains failed %")
223
+
224
+ @dataclass
225
+ class EHRSQLFewShotColumn:
226
+ benchmark: str
227
+ metric: str
228
+ col_name: str
229
+
230
+ class EHRSQLFewShotColumns(Enum):
231
+ ehrsql_few_shot_column0 = EHRSQLFewShotColumn("RS (0)", "score", "RS (0)")
232
+ ehrsql_few_shot_column1 = EHRSQLFewShotColumn("Abstains correct %", "score", "Abstains correct %")
233
+ ehrsql_few_shot_column2 = EHRSQLFewShotColumn("Abstains incorrect %", "score", "Abstains incorrect %")
234
+ ehrsql_few_shot_column3 = EHRSQLFewShotColumn("Abstains failed %", "score", "Abstains failed %")
235
+
236
+ @dataclass
237
+ class MedCalcDirectAnswerColumn:
238
+ benchmark: str
239
+ metric: str
240
+ col_name: str
241
+
242
+ class MedCalcDirectAnswerColumns(Enum):
243
+ medcalc_direct_answer_column0 = MedCalcDirectAnswerColumn("lab", "score", "Lab")
244
+ medcalc_direct_answer_column1 = MedCalcDirectAnswerColumn("risk", "score", "Risk")
245
+ medcalc_direct_answer_column2 = MedCalcDirectAnswerColumn("physical", "score", "Physical")
246
+ medcalc_direct_answer_column3 = MedCalcDirectAnswerColumn("severity", "score", "Severity")
247
+ medcalc_direct_answer_column4 = MedCalcDirectAnswerColumn("diagnosis", "score", "Diagnosis")
248
+ medcalc_direct_answer_column5 = MedCalcDirectAnswerColumn("date", "score", "Date")
249
+ medcalc_direct_answer_column6 = MedCalcDirectAnswerColumn("dosage", "score", "Dosage")
250
+ medcalc_direct_answer_column7 = MedCalcDirectAnswerColumn("overall", "score", "Overall")
251
+
252
+ @dataclass
253
+ class MedCalcOneShotCotColumn:
254
+ benchmark: str
255
+ metric: str
256
+ col_name: str
257
+
258
+ class MedCalcOneShotCotColumns(Enum):
259
+ medcalc_one_shot_cot_column0 = MedCalcOneShotCotColumn("lab", "score", "Lab")
260
+ medcalc_one_shot_cot_column1 = MedCalcOneShotCotColumn("risk", "score", "Risk")
261
+ medcalc_one_shot_cot_column2 = MedCalcOneShotCotColumn("physical", "score", "Physical")
262
+ medcalc_one_shot_cot_column3 = MedCalcOneShotCotColumn("severity", "score", "Severity")
263
+ medcalc_one_shot_cot_column4 = MedCalcOneShotCotColumn("diagnosis", "score", "Diagnosis")
264
+ medcalc_one_shot_cot_column5 = MedCalcOneShotCotColumn("date", "score", "Date")
265
+ medcalc_one_shot_cot_column6 = MedCalcOneShotCotColumn("dosage", "score", "Dosage")
266
+ medcalc_one_shot_cot_column7 = MedCalcOneShotCotColumn("overall", "score", "Overall")
267
+
268
+ @dataclass
269
+ class MedCalcZeroShotCotColumn:
270
+ benchmark: str
271
+ metric: str
272
+ col_name: str
273
+
274
+ class MedCalcZeroShotCotColumns(Enum):
275
+ medcalc_zero_shot_cot_column0 = MedCalcZeroShotCotColumn("lab", "score", "Lab")
276
+ medcalc_zero_shot_cot_column1 = MedCalcZeroShotCotColumn("risk", "score", "Risk")
277
+ medcalc_zero_shot_cot_column2 = MedCalcZeroShotCotColumn("physical", "score", "Physical")
278
+ medcalc_zero_shot_cot_column3 = MedCalcZeroShotCotColumn("severity", "score", "Severity")
279
+ medcalc_zero_shot_cot_column4 = MedCalcZeroShotCotColumn("diagnosis", "score", "Diagnosis")
280
+ medcalc_zero_shot_cot_column5 = MedCalcZeroShotCotColumn("date", "score", "Date")
281
+ medcalc_zero_shot_cot_column6 = MedCalcZeroShotCotColumn("dosage", "score", "Dosage")
282
+ medcalc_zero_shot_cot_column7 = MedCalcZeroShotCotColumn("overall", "score", "Overall")
283
+
284
+ @dataclass
285
+ class MedECZeroShotColumn:
286
+ benchmark: str
287
+ metric: str
288
+ col_name: str
289
+
290
+ class MedECZeroShotColumns(Enum):
291
+ medec_zero_shot_column0 = MedECZeroShotColumn("Error Flag Accuracy (%)", "score", "Error Flag Accuracy (%)")
292
+ medec_zero_shot_column1 = MedECZeroShotColumn("Error Sentence ID Accuracy (%)", "score", "Error Sentence ID Accuracy (%)")
293
+ medec_zero_shot_column2 = MedECZeroShotColumn("Invalid Responses (%)", "score", "Invalid Responses (%)")
294
+
295
+ @dataclass
296
+ class MedECOneShotColumn:
297
+ benchmark: str
298
+ metric: str
299
+ col_name: str
300
+
301
+ class MedECOneShotColumns(Enum):
302
+ medec_one_shot_column0 = MedECOneShotColumn("Error Flag Accuracy (%)", "score", "Error Flag Accuracy (%)")
303
+ medec_one_shot_column1 = MedECOneShotColumn("Error Sentence ID Accuracy (%)", "score", "Error Sentence ID Accuracy (%)")
304
+ medec_one_shot_column2 = MedECOneShotColumn("Invalid Responses (%)", "score", "Invalid Responses (%)")
305
+
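The three MedCalc column Enums above differ only in their class names. If more prompt styles are added later, they could be generated from one category list; a sketch only, not part of this commit:

```python
# Sketch: build the per-prompt-style MedCalc column Enums from a single category list.
# Assumes a column dataclass with (benchmark, metric, col_name), as defined above.
from enum import Enum

MEDCALC_CATEGORIES = ["lab", "risk", "physical", "severity", "diagnosis", "date", "dosage", "overall"]

def make_medcalc_columns(enum_name: str, column_cls):
    members = {
        f"medcalc_column{i}": column_cls(cat, "score", cat.capitalize())
        for i, cat in enumerate(MEDCALC_CATEGORIES)
    }
    return Enum(enum_name, members)

# e.g. MedCalcZeroShotCotColumns = make_medcalc_columns("MedCalcZeroShotCotColumns", MedCalcZeroShotCotColumn)
```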
306
  NUM_FEWSHOT = 0 # Change with your few shot
307
  # ---------------------------------------------------
308
 
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  # changes to be made here
7
- from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
@@ -41,6 +41,13 @@ class ColumnContent:
41
  open_ended_romanian_col: bool = False
42
  open_ended_greek_col: bool = False
43
  closed_ended_multilingual_col: bool = False
 
 
 
 
 
 
 
44
 
45
 
46
  ## Leaderboard columns
@@ -78,6 +85,20 @@ for column in HealthbenchHardColumns:
78
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
79
  else:
80
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
 
 
 
 
81
  for column in OpenEndedArabicColumn:
82
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_arabic_col=True, invariant=False)])
83
  for column in OpenEndedFrenchColumn:
@@ -261,7 +282,14 @@ OpenEndedPortuguese_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidd
261
  OpenEndedRomanian_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_romanian_col or c.invariant)]
262
  OpenEndedGreek_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_greek_col or c.invariant)]
263
 
 
 
264
 
 
 
 
 
 
265
 
266
  ClosedEndedMultilingual_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_multilingual_col or c.invariant)]
267
 
@@ -301,6 +329,14 @@ OpenEndedSpanish_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedSpanishCol
301
  OpenEndedRomanian_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedRomanianColumn]
302
  OpenEndedGreek_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedGreekColumn]
303
 
 
 
 
 
 
 
 
 
304
 
305
  ClosedEndedMultilingual_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedMultilingualColumns]
306
 
 
4
  import pandas as pd
5
 
6
  # changes to be made here
7
+ from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, EHRSQLZeroShotColumns, EHRSQLFewShotColumns, MedCalcDirectAnswerColumns, MedCalcOneShotCotColumns, MedCalcZeroShotCotColumns, MedECZeroShotColumns, MedECOneShotColumns
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
 
41
  open_ended_romanian_col: bool = False
42
  open_ended_greek_col: bool = False
43
  closed_ended_multilingual_col: bool = False
44
+ ehrsql_zero_shot_col: bool = False
45
+ ehrsql_few_shot_col: bool = False
46
+ medcalc_direct_answer_col: bool = False
47
+ medcalc_one_shot_cot_col: bool = False
48
+ medcalc_zero_shot_cot_col: bool = False
49
+ medec_zero_shot_col: bool = False
50
+ medec_one_shot_col: bool = False
51
 
52
 
53
  ## Leaderboard columns
 
85
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
86
  else:
87
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
88
+ for column in EHRSQLZeroShotColumns:
89
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, ehrsql_zero_shot_col=True, invariant=False)])
90
+ for column in EHRSQLFewShotColumns:
91
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, ehrsql_few_shot_col=True, invariant=False)])
92
+ for column in MedCalcDirectAnswerColumns:
93
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medcalc_direct_answer_col=True, invariant=False)])
94
+ for column in MedCalcOneShotCotColumns:
95
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medcalc_one_shot_cot_col=True, invariant=False)])
96
+ for column in MedCalcZeroShotCotColumns:
97
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medcalc_zero_shot_cot_col=True, invariant=False)])
98
+ for column in MedECZeroShotColumns:
99
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medec_zero_shot_col=True, invariant=False)])
100
+ for column in MedECOneShotColumns:
101
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medec_one_shot_col=True, invariant=False)])
102
  for column in OpenEndedArabicColumn:
103
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_arabic_col=True, invariant=False)])
104
  for column in OpenEndedFrenchColumn:
 
282
  OpenEndedRomanian_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_romanian_col or c.invariant)]
283
  OpenEndedGreek_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_greek_col or c.invariant)]
284
 
285
+ EHRSQL_ZERO_SHOT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.ehrsql_zero_shot_col or c.invariant)]
286
+ EHRSQL_FEW_SHOT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.ehrsql_few_shot_col or c.invariant)]
287
 
288
+ MEDCALC_DIRECT_ANSWER_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medcalc_direct_answer_col or c.invariant)]
289
+ MEDCALC_ONE_SHOT_COT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medcalc_one_shot_cot_col or c.invariant)]
290
+ MEDCALC_ZERO_SHOT_COT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medcalc_zero_shot_cot_col or c.invariant)]
291
+ MEDEC_ZERO_SHOT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medec_zero_shot_col or c.invariant)]
292
+ MEDEC_ONE_SHOT_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.medec_one_shot_col or c.invariant)]
293
 
294
  ClosedEndedMultilingual_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_multilingual_col or c.invariant)]
295
 
 
329
  OpenEndedRomanian_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedRomanianColumn]
330
  OpenEndedGreek_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedGreekColumn]
331
 
332
+ EHRSQL_ZERO_SHOT_BENCHMARK_COLS = [t.value.col_name for t in EHRSQLZeroShotColumns]
333
+ EHRSQL_FEW_SHOT_BENCHMARK_COLS = [t.value.col_name for t in EHRSQLFewShotColumns]
334
+
335
+ MEDCALC_DIRECT_ANSWER_BENCHMARK_COLS = [t.value.col_name for t in MedCalcDirectAnswerColumns]
336
+ MEDCALC_ONE_SHOT_COT_BENCHMARK_COLS = [t.value.col_name for t in MedCalcOneShotCotColumns]
337
+ MEDCALC_ZERO_SHOT_COT_BENCHMARK_COLS = [t.value.col_name for t in MedCalcZeroShotCotColumns]
338
+ MEDEC_ZERO_SHOT_BENCHMARK_COLS = [t.value.col_name for t in MedECZeroShotColumns]
339
+ MEDEC_ONE_SHOT_BENCHMARK_COLS = [t.value.col_name for t in MedECOneShotColumns]
340
 
341
  ClosedEndedMultilingual_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedMultilingualColumns]
342
 
src/leaderboard/read_evals.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
- from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
@@ -38,6 +38,13 @@ class EvalResult:
38
  open_ended_romanian_results: dict
39
  open_ended_greek_results: dict
40
  closed_ended_multilingual_results: dict
 
 
 
 
 
 
 
41
  is_domain_specific: bool
42
  use_chat_template: bool
43
  # clinical_type_results:dict
@@ -269,6 +276,55 @@ class EvalResult:
269
  task = task.value
270
  accs = data["results"]["closed-ended-multilingual"][task.benchmark]["accuracy"] if task.benchmark in data["results"]["closed-ended-multilingual"] else None
271
  closed_ended_multilingual_results[task.benchmark] = accs
 
 
 
 
272
 
273
  # #add the
274
  # closed_ended_arabic_results = {}
@@ -327,6 +383,13 @@ class EvalResult:
327
  open_ended_romanian_results=open_ended_romanian_results,
328
  open_ended_greek_results=open_ended_greek_results,
329
  closed_ended_multilingual_results=closed_ended_multilingual_results,
 
 
 
 
 
 
 
330
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
331
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
332
  precision=precision,
@@ -476,6 +539,41 @@ class EvalResult:
476
  for task in ClosedEndedMultilingualColumns:
477
  data_dict[task.value.col_name] = self.closed_ended_multilingual_results[task.value.benchmark]
478
  return data_dict
 
 
 
 
479
 
480
  def get_request_file_for_model(requests_path, model_name, precision):
481
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
@@ -539,12 +637,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metri
539
  results = []
540
  # clinical_type_results = []
541
  for v in eval_results.values():
542
- try:
543
- v.to_dict(subset="dataset") # we test if the dict version is complete
544
- if not v.display_result:
545
- continue
546
- results.append(v)
547
- except KeyError: # not all eval values present
548
  continue
 
549
  # breakpoint()
550
  return results
 
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, EHRSQLZeroShotColumns, EHRSQLFewShotColumns, MedCalcDirectAnswerColumns, MedCalcOneShotCotColumns, MedCalcZeroShotCotColumns, MedECZeroShotColumns, MedECOneShotColumns
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
 
38
  open_ended_romanian_results: dict
39
  open_ended_greek_results: dict
40
  closed_ended_multilingual_results: dict
41
+ ehrsql_zero_shot_results: dict
42
+ ehrsql_few_shot_results: dict
43
+ medcalc_direct_answer_results: dict
44
+ medcalc_one_shot_cot_results: dict
45
+ medcalc_zero_shot_cot_results: dict
46
+ medec_zero_shot_results: dict
47
+ medec_one_shot_results: dict
48
  is_domain_specific: bool
49
  use_chat_template: bool
50
  # clinical_type_results:dict
 
276
  task = task.value
277
  accs = data["results"]["closed-ended-multilingual"][task.benchmark]["accuracy"] if task.benchmark in data["results"]["closed-ended-multilingual"] else None
278
  closed_ended_multilingual_results[task.benchmark] = accs
279
+
280
+ ehrsql_zero_shot_results = {}
281
+ if "ehrsql" in data["results"] and "zero_shot" in data["results"]["ehrsql"]:
282
+ for task in EHRSQLZeroShotColumns:
283
+ task = task.value
284
+ accs = data["results"]["ehrsql"]["zero_shot"][task.benchmark] if task.benchmark in data["results"]["ehrsql"]["zero_shot"] else None
285
+ ehrsql_zero_shot_results[task.benchmark] = accs
286
+
287
+ ehrsql_few_shot_results = {}
288
+ if "ehrsql" in data["results"] and "few_shot" in data["results"]["ehrsql"]:
289
+ for task in EHRSQLFewShotColumns:
290
+ task = task.value
291
+ accs = data["results"]["ehrsql"]["few_shot"][task.benchmark] if task.benchmark in data["results"]["ehrsql"]["few_shot"] else None
292
+ ehrsql_few_shot_results[task.benchmark] = accs
293
+
294
+ medcalc_direct_answer_results = {}
295
+ if "medcalc" in data["results"] and "direct_answer" in data["results"]["medcalc"]:
296
+ for task in MedCalcDirectAnswerColumns:
297
+ task = task.value
298
+ accs = data["results"]["medcalc"]["direct_answer"].get(task.benchmark, {}).get("average", None)
299
+ medcalc_direct_answer_results[task.benchmark] = accs
300
+
301
+ medcalc_one_shot_cot_results = {}
302
+ if "medcalc" in data["results"] and "one_shot_cot" in data["results"]["medcalc"]:
303
+ for task in MedCalcOneShotCotColumns:
304
+ task = task.value
305
+ accs = data["results"]["medcalc"]["one_shot_cot"].get(task.benchmark, {}).get("average", None)
306
+ medcalc_one_shot_cot_results[task.benchmark] = accs
307
+
308
+ medcalc_zero_shot_cot_results = {}
309
+ if "medcalc" in data["results"] and "zero_shot_cot" in data["results"]["medcalc"]:
310
+ for task in MedCalcZeroShotCotColumns:
311
+ task = task.value
312
+ accs = data["results"]["medcalc"]["zero_shot_cot"].get(task.benchmark, {}).get("average", None)
313
+ medcalc_zero_shot_cot_results[task.benchmark] = accs
314
+
315
+ medec_zero_shot_results = {}
316
+ if "medec" in data["results"] and "zero_shot" in data["results"]["medec"]:
317
+ for task in MedECZeroShotColumns:
318
+ task = task.value
319
+ accs = data["results"]["medec"]["zero_shot"].get(task.benchmark, None)
320
+ medec_zero_shot_results[task.benchmark] = accs
321
+
322
+ medec_one_shot_results = {}
323
+ if "medec" in data["results"] and "one_shot" in data["results"]["medec"]:
324
+ for task in MedECOneShotColumns:
325
+ task = task.value
326
+ accs = data["results"]["medec"]["one_shot"].get(task.benchmark, None)
327
+ medec_one_shot_results[task.benchmark] = accs
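The parsing above implies a specific layout for the new sections of each results file: EHRSQL and MedEC metrics are read as flat key/value pairs, while MedCalc categories carry an "average" sub-key. A sketch of the assumed shape (keys taken from the code above, numbers purely illustrative):

```python
# Illustrative fragment of a results JSON that the EHRSQL / MedCalc / MedEC parsing above would accept.
example_results = {
    "results": {
        "ehrsql": {
            "zero_shot": {"RS (0)": 41.2, "Abstains correct %": 12.0,
                          "Abstains incorrect %": 3.1, "Abstains failed %": 0.4},
        },
        "medcalc": {
            "direct_answer": {"lab": {"average": 18.5}, "overall": {"average": 22.0}},
        },
        "medec": {
            "zero_shot": {"Error Flag Accuracy (%)": 61.3,
                          "Error Sentence ID Accuracy (%)": 44.8,
                          "Invalid Responses (%)": 2.1},
        },
    }
}
```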
328
 
329
  # #add the
330
  # closed_ended_arabic_results = {}
 
383
  open_ended_romanian_results=open_ended_romanian_results,
384
  open_ended_greek_results=open_ended_greek_results,
385
  closed_ended_multilingual_results=closed_ended_multilingual_results,
386
+ ehrsql_zero_shot_results=ehrsql_zero_shot_results,
387
+ ehrsql_few_shot_results=ehrsql_few_shot_results,
388
+ medcalc_direct_answer_results=medcalc_direct_answer_results,
389
+ medcalc_one_shot_cot_results=medcalc_one_shot_cot_results,
390
+ medcalc_zero_shot_cot_results=medcalc_zero_shot_cot_results,
391
+ medec_zero_shot_results=medec_zero_shot_results,
392
+ medec_one_shot_results=medec_one_shot_results,
393
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
394
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
395
  precision=precision,
 
539
  for task in ClosedEndedMultilingualColumns:
540
  data_dict[task.value.col_name] = self.closed_ended_multilingual_results[task.value.benchmark]
541
  return data_dict
542
+ if subset == "ehrsql_zero_shot":
543
+ if len(self.ehrsql_zero_shot_results) > 0:
544
+ for task in EHRSQLZeroShotColumns:
545
+ data_dict[task.value.col_name] = self.ehrsql_zero_shot_results[task.value.benchmark]
546
+ return data_dict
547
+ if subset == "ehrsql_few_shot":
548
+ if len(self.ehrsql_few_shot_results) > 0:
549
+ for task in EHRSQLFewShotColumns:
550
+ data_dict[task.value.col_name] = self.ehrsql_few_shot_results[task.value.benchmark]
551
+ return data_dict
552
+ if subset == "medcalc_direct_answer":
553
+ if len(self.medcalc_direct_answer_results) > 0:
554
+ for task in MedCalcDirectAnswerColumns:
555
+ data_dict[task.value.col_name] = self.medcalc_direct_answer_results[task.value.benchmark]
556
+ return data_dict
557
+ if subset == "medcalc_one_shot_cot":
558
+ if len(self.medcalc_one_shot_cot_results) > 0:
559
+ for task in MedCalcOneShotCotColumns:
560
+ data_dict[task.value.col_name] = self.medcalc_one_shot_cot_results[task.value.benchmark]
561
+ return data_dict
562
+ if subset == "medcalc_zero_shot_cot":
563
+ if len(self.medcalc_zero_shot_cot_results) > 0:
564
+ for task in MedCalcZeroShotCotColumns:
565
+ data_dict[task.value.col_name] = self.medcalc_zero_shot_cot_results[task.value.benchmark]
566
+ return data_dict
567
+ if subset == "medec_zero_shot":
568
+ if len(self.medec_zero_shot_results) > 0:
569
+ for task in MedECZeroShotColumns:
570
+ data_dict[task.value.col_name] = self.medec_zero_shot_results[task.value.benchmark]
571
+ return data_dict
572
+ if subset == "medec_one_shot":
573
+ if len(self.medec_one_shot_results) > 0:
574
+ for task in MedECOneShotColumns:
575
+ data_dict[task.value.col_name] = self.medec_one_shot_results[task.value.benchmark]
576
+ return data_dict
577
 
578
  def get_request_file_for_model(requests_path, model_name, precision):
579
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
 
637
  results = []
638
  # clinical_type_results = []
639
  for v in eval_results.values():
640
+ if not v.display_result:
 
 
 
 
 
641
  continue
642
+ results.append(v)
643
  # breakpoint()
644
  return results
src/populate.py CHANGED
@@ -5,17 +5,13 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, ClosedEndedMultilingualColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
15
- # print(raw_data)
16
- # raise Exception("stop")
17
- # if subset.startswith("healthbench"):
18
- # breakpoint()
19
  all_data_json = [v.to_dict(subset=subset) for v in raw_data if not v.full_model.startswith("/models_llm")]
20
  df = pd.DataFrame.from_records(all_data_json)
21
  # changes to be made here
@@ -39,25 +35,43 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
39
  df = df.sort_values(by=["Overall Score"], ascending=False)
40
  elif subset == "closed_ended_multilingual":
41
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  cols = list(set(df.columns).intersection(set(cols)))
43
  df = df[cols].round(decimals=2)
44
  # filter out if any of the benchmarks have not been produced
45
  df = df[has_no_nan_values(df, benchmark_cols)]
46
  return raw_data, df
47
 
48
-
49
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
50
  """Creates the different dataframes for the evaluation queues requestes"""
51
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
52
  all_evals = []
 
53
  for entry in entries:
54
- if ".json" in entry:
55
- file_path = os.path.join(save_path, entry)
56
- with open(file_path) as fp:
 
 
 
 
57
  data = json.load(fp)
 
58
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"]) if not data["private"] else data["model_name"]
59
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
60
- # changes to be made here
61
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
62
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
63
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
@@ -66,42 +80,24 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
66
  if PRIVATE_REPO:
67
  data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
68
  all_evals.append(data)
69
- elif ".md" not in entry:
70
- # this is a folder
71
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
72
- for sub_entry in sub_entries:
73
- file_path = os.path.join(save_path, entry, sub_entry)
74
- with open(file_path) as fp:
75
- data = json.load(fp)
76
- # print(data)
77
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"]) if not data["private"] else data["model_name"]
78
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
79
- data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
80
- data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
81
- data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
82
- data[EvalQueueColumn.medical_summarization_status.name] = data["status"]["medical-summarization"]
83
- data[EvalQueueColumn.note_generation_status.name] = data["status"]["note-generation"]
84
- if PRIVATE_REPO:
85
- data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
86
- all_evals.append(data)
87
- # breakpoint()
88
  pending_list = []
89
  running_list = []
90
  finished_list = []
91
  for run in all_evals:
92
- # changes to be made here
93
  status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
94
  if PRIVATE_REPO:
95
  status_list.append(run["status"]["closed-ended-arabic"])
96
- # status_list = status_list
97
  if "RUNNING" in status_list:
98
  running_list.append(run)
99
  elif "PENDING" in status_list or "RERUN" in status_list:
100
  pending_list.append(run)
101
  else:
102
  finished_list.append(run)
103
- # breakpoint()
104
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
105
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
106
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
 
107
  return df_finished[cols], df_running[cols], df_pending[cols]
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, ClosedEndedMultilingualColumns, EHRSQLZeroShotColumns, EHRSQLFewShotColumns, MedCalcDirectAnswerColumns, MedCalcOneShotCotColumns, MedCalcZeroShotCotColumns, MedECZeroShotColumns, MedECOneShotColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
 
 
 
 
15
  all_data_json = [v.to_dict(subset=subset) for v in raw_data if not v.full_model.startswith("/models_llm")]
16
  df = pd.DataFrame.from_records(all_data_json)
17
  # changes to be made here
 
35
  df = df.sort_values(by=["Overall Score"], ascending=False)
36
  elif subset == "closed_ended_multilingual":
37
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
38
+ elif subset == "ehrsql_zero_shot":
39
+ df = df.sort_values(by=["RS (0)"], ascending=False)
40
+ elif subset == "ehrsql_few_shot":
41
+ df = df.sort_values(by=["RS (0)"], ascending=False)
42
+ elif subset == "medcalc_direct_answer":
43
+ df = df.sort_values(by=["Overall"], ascending=False)
44
+ elif subset == "medcalc_one_shot_cot":
45
+ df = df.sort_values(by=["Overall"], ascending=False)
46
+ elif subset == "medcalc_zero_shot_cot":
47
+ df = df.sort_values(by=["Overall"], ascending=False)
48
+ elif subset == "medec_zero_shot":
49
+ df = df.sort_values(by=["Error Flag Accuracy (%)"], ascending=False)
50
+ elif subset == "medec_one_shot":
51
+ df = df.sort_values(by=["Error Flag Accuracy (%)"], ascending=False)
52
  cols = list(set(df.columns).intersection(set(cols)))
53
  df = df[cols].round(decimals=2)
54
  # filter out if any of the benchmarks have not been produced
55
  df = df[has_no_nan_values(df, benchmark_cols)]
56
  return raw_data, df
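The `elif` chain above grows by two lines for every new subset; an equivalent lookup table keeps the sort keys in one place. Sketch only, not part of this commit:

```python
# Sketch: same sorting behaviour as the elif chain, expressed as a lookup table.
SORT_KEY_BY_SUBSET = {
    "ehrsql_zero_shot": "RS (0)",
    "ehrsql_few_shot": "RS (0)",
    "medcalc_direct_answer": "Overall",
    "medcalc_one_shot_cot": "Overall",
    "medcalc_zero_shot_cot": "Overall",
    "medec_zero_shot": "Error Flag Accuracy (%)",
    "medec_one_shot": "Error Flag Accuracy (%)",
}

sort_col = SORT_KEY_BY_SUBSET.get(subset)
if sort_col is not None:
    df = df.sort_values(by=[sort_col], ascending=False)
```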
57
 
 
58
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
59
  """Creates the different dataframes for the evaluation queue requests"""
60
  entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
61
  all_evals = []
62
+
63
  for entry in entries:
64
+ full_path = os.path.join(save_path, entry)
65
+
66
+ if os.path.isdir(full_path):
67
+ continue
68
+
69
+ if entry.endswith(".json"):
70
+ with open(full_path) as fp:
71
  data = json.load(fp)
72
+
73
  data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"]) if not data["private"] else data["model_name"]
74
  data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
75
  data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
76
  data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
77
  data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
 
80
  if PRIVATE_REPO:
81
  data[EvalQueueColumn.closed_ended_arabic_status.name] = data["status"]["closed-ended-arabic"]
82
  all_evals.append(data)
83
+
 
 
 
 
84
  pending_list = []
85
  running_list = []
86
  finished_list = []
87
  for run in all_evals:
 
88
  status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["medical-summarization"], run["status"]["note-generation"]]
89
  if PRIVATE_REPO:
90
  status_list.append(run["status"]["closed-ended-arabic"])
91
+
92
  if "RUNNING" in status_list:
93
  running_list.append(run)
94
  elif "PENDING" in status_list or "RERUN" in status_list:
95
  pending_list.append(run)
96
  else:
97
  finished_list.append(run)
98
+
99
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
100
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
101
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
102
+
103
  return df_finished[cols], df_running[cols], df_pending[cols]
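One behavioural note on the rework above: the old loop recursed one level into sub-folders of the requests directory, while the new loop skips directories entirely. If nested request files still exist in the queue repo, a recursive scan along these lines would keep picking them up (a sketch under that assumption, not part of this commit):

```python
# Sketch: recursive request-file discovery, in case the queue repo still nests files one level deep.
import glob
import os

def iter_request_files(save_path: str):
    """Yield every non-hidden .json request file under save_path, including nested ones."""
    pattern = os.path.join(save_path, "**", "*.json")
    for file_path in glob.glob(pattern, recursive=True):
        if not os.path.basename(file_path).startswith("."):
            yield file_path
```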