Quentin Mace commited on
Commit
856deb1
·
1 Parent(s): 2b95769
Files changed (3) hide show
  1. app.py +17 -10
  2. app/utils.py +45 -18
  3. data/pipeline_handler.py +34 -8
app.py CHANGED
@@ -52,7 +52,7 @@ def main():
52
  data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
53
  data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
54
 
55
- num_datasets_pipeline = len(data_pipeline.columns) - 4 # Excluding Rank, Model, QPS, Average
56
  num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
57
  num_pipelines = len(data_pipeline)
58
 
@@ -115,6 +115,13 @@ def main():
115
  min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
116
  max-width: 120px !important;
117
  }
 
 
 
 
 
 
 
118
  """
119
 
120
  with gr.Blocks(css=css) as block:
@@ -187,7 +194,7 @@ def main():
187
 
188
  if len(data_pipeline) > 0:
189
  datasets_columns_pipeline = [
190
- col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average"]
191
  ]
192
 
193
  with gr.Row():
@@ -212,7 +219,7 @@ def main():
212
  )
213
 
214
  with gr.Row():
215
- # Datatype: Rank, Model, Total retrieval time (s), Indexing latency (s/doc), Search latency (s/query), Average + datasets
216
  datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline)
217
  dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table")
218
 
@@ -223,9 +230,9 @@ def main():
223
  data = filter_models(data, search_term)
224
  if selected_columns:
225
  # Include core columns plus selected dataset columns
226
- core_cols = ["Rank", "Model", "Indexing latency (s/doc)", "Search latency (s/query)"]
227
- if "Average" in data.columns:
228
- core_cols.insert(4, "Average")
229
  data = data[core_cols + selected_columns]
230
  return data
231
 
@@ -338,7 +345,7 @@ def main():
338
  data = filter_models(data, search_term)
339
  # data = remove_duplicates(data) # Add this line
340
  if selected_columns:
341
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
342
  return data
343
 
344
  with gr.Row():
@@ -440,7 +447,7 @@ def main():
440
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
441
  data = filter_models(data, search_term)
442
  if selected_columns:
443
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
444
  return data
445
 
446
  with gr.Row():
@@ -562,7 +569,7 @@ def main():
562
  data = filter_models(data, search_term)
563
  # data = remove_duplicates(data) # Add this line
564
  if selected_columns:
565
- data = data[["Rank", "Model", "Average"] + selected_columns]
566
  return data
567
 
568
  with gr.Row():
@@ -665,7 +672,7 @@ def main():
665
  data = filter_models(data, search_term)
666
  # data = remove_duplicates(data) # Add this line
667
  if selected_columns:
668
- data = data[["Rank", "Model", "Average"] + selected_columns]
669
  return data
670
 
671
  with gr.Row():
 
52
  data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
53
  data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
54
 
55
+ num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Pipeline, Indexing latency, Search latency, Average Score
56
  num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
57
  num_pipelines = len(data_pipeline)
58
 
 
115
  min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
116
  max-width: 120px !important;
117
  }
118
+
119
+ /* 3. Make the Pipeline column (2nd column) wider for pipeline table */
120
+ #pipeline-table table th:nth-child(2),
121
+ #pipeline-table table td:nth-child(2) {
122
+ min-width: 400px !important;
123
+ max-width: 500px !important;
124
+ }
125
  """
126
 
127
  with gr.Blocks(css=css) as block:
 
194
 
195
  if len(data_pipeline) > 0:
196
  datasets_columns_pipeline = [
197
+ col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
198
  ]
199
 
200
  with gr.Row():
 
219
  )
220
 
221
  with gr.Row():
222
+ # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
223
  datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline)
224
  dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table")
225
 
 
230
  data = filter_models(data, search_term)
231
  if selected_columns:
232
  # Include core columns plus selected dataset columns
233
+ core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
234
+ if "Average Score" in data.columns:
235
+ core_cols.insert(4, "Average Score")
236
  data = data[core_cols + selected_columns]
237
  return data
238
 
 
345
  data = filter_models(data, search_term)
346
  # data = remove_duplicates(data) # Add this line
347
  if selected_columns:
348
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns]
349
  return data
350
 
351
  with gr.Row():
 
447
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
448
  data = filter_models(data, search_term)
449
  if selected_columns:
450
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns]
451
  return data
452
 
453
  with gr.Row():
 
569
  data = filter_models(data, search_term)
570
  # data = remove_duplicates(data) # Add this line
571
  if selected_columns:
572
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
573
  return data
574
 
575
  with gr.Row():
 
672
  data = filter_models(data, search_term)
673
  # data = remove_duplicates(data) # Add this line
674
  if selected_columns:
675
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
676
  return data
677
 
678
  with gr.Row():
app/utils.py CHANGED
@@ -1,12 +1,17 @@
 
 
1
 
2
-
3
- def make_clickable_model(model_name, link=None, is_pipeline=False):
4
  if is_pipeline:
5
- # For pipelines: keep underscores as-is, only process __ and -thisisapoint-
6
- desanitized_model_name = model_name.replace("__", "/")
7
- desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
 
 
8
  if link is None:
9
- link = f"https://github.com/illuin-tech/vidore-benchmark/blob/vidore_v3_pipeline/results/pipeline_descriptions/{desanitized_model_name}/description.json"
 
 
10
  else:
11
  # For regular models: replace __ and _ with /, and -thisisapoint- with .
12
  desanitized_model_name = model_name.replace("__", "/")
@@ -25,7 +30,15 @@ def make_clickable_model(model_name, link=None, is_pipeline=False):
25
 
26
 
27
  def add_rank(df, benchmark_version=1, selected_columns=None):
28
- df.fillna(0.0, inplace=True)
 
 
 
 
 
 
 
 
29
  if selected_columns is None:
30
  cols_to_rank = [
31
  col
@@ -33,12 +46,14 @@ def add_rank(df, benchmark_version=1, selected_columns=None):
33
  if col
34
  not in [
35
  "Model",
 
36
  "Model Size (Million Parameters)",
37
  "Memory Usage (GB, fp32)",
38
  "Embedding Dimensions",
39
  "Max Tokens",
40
  "Compute Cost ($)",
41
  "Queries per Second",
 
42
  ]
43
  ]
44
  else:
@@ -48,24 +63,34 @@ def add_rank(df, benchmark_version=1, selected_columns=None):
48
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
49
  else:
50
  # Only add Average column if it doesn't already exist
51
- if "Average" not in df.columns:
52
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
53
- df.sort_values("Average", ascending=False, inplace=True)
54
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
55
- # multiply values by 100 if they are floats and round to 1 decimal place
56
  for col in df.columns:
57
- if df[col].dtype == "float64" and col not in ["Model Size (Million Parameters)", "Compute Cost ($)", "Queries per Second", "Indexing latency (s/doc)", "Search latency (s/query)"]:
58
- df[col] = df[col].apply(lambda x: round(x * 100, 1))
59
- elif df[col].dtype == "float64" and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
60
- df[col] = df[col].apply(lambda x: round(x, 2))
61
  return df
62
 
63
 
64
  def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
65
  df = df.reset_index()
66
- df = df.rename(columns={"index": "Model"})
 
67
  df = add_rank(df, benchmark_version, selected_columns)
68
- df["Model"] = df["Model"].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
 
 
 
 
 
 
 
 
 
69
  # df = remove_duplicates(df)
70
  return df
71
 
@@ -110,5 +135,7 @@ def get_pipeline_refresh_function(pipeline_handler):
110
 
111
  def filter_models(data, search_term):
112
  if search_term:
113
- data = data[data["Model"].str.contains(search_term, case=False, na=False)]
 
 
114
  return data
 
1
+ import pandas as pd
2
+ import math
3
 
4
+ def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
 
5
  if is_pipeline:
6
+ # For pipelines: use folder_name for link, model_name (alias) for display
7
+ link_folder = folder_name if folder_name else model_name
8
+ # Process folder name for link: only handle __ and -thisisapoint-
9
+ desanitized_folder = link_folder.replace("__", "/")
10
+ desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
11
  if link is None:
12
+ link = f"https://github.com/illuin-tech/vidore-benchmark/blob/vidore_v3_pipeline/results/pipeline_descriptions/{desanitized_folder}/description.json"
13
+ # Use word-wrap styling for potentially long pipeline aliases
14
+ return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
15
  else:
16
  # For regular models: replace __ and _ with /, and -thisisapoint- with .
17
  desanitized_model_name = model_name.replace("__", "/")
 
30
 
31
 
32
  def add_rank(df, benchmark_version=1, selected_columns=None):
33
+ # Convert numeric columns to proper float type (they may be 'object' dtype due to mixed data)
34
+ for col in df.columns:
35
+ if col not in ["Model", "Pipeline", "_folder_name"]:
36
+ df[col] = pd.to_numeric(df[col], errors="coerce")
37
+
38
+ # Only fill NaN for numeric columns to avoid issues with string columns like _folder_name
39
+ numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
40
+ df[numeric_cols] = df[numeric_cols].fillna(0.0)
41
+
42
  if selected_columns is None:
43
  cols_to_rank = [
44
  col
 
46
  if col
47
  not in [
48
  "Model",
49
+ "Pipeline", # For pipeline tables
50
  "Model Size (Million Parameters)",
51
  "Memory Usage (GB, fp32)",
52
  "Embedding Dimensions",
53
  "Max Tokens",
54
  "Compute Cost ($)",
55
  "Queries per Second",
56
+ "_folder_name", # Hidden column for pipeline link generation
57
  ]
58
  ]
59
  else:
 
63
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
64
  else:
65
  # Only add Average column if it doesn't already exist
66
+ if "Average Score" not in df.columns:
67
+ df.insert(len(df.columns) - len(cols_to_rank), "Average Score", df[cols_to_rank].mean(axis=1, skipna=False))
68
+ df.sort_values("Average Score", ascending=False, inplace=True)
69
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
70
+ # multiply values by 100 if they are floats and round to 3 significant figures
71
  for col in df.columns:
72
+ if pd.api.types.is_numeric_dtype(df[col]) and col not in ["Model Size (Million Parameters)", "Compute Cost ($)", "Queries per Second", "Indexing latency (s/doc)", "Search latency (s/query)", "Rank"]:
73
+ df[col] = df[col].apply(lambda x: round(x*100, 3 - int(math.floor(math.log10(abs(x*100)))) - 1))
74
+ elif pd.api.types.is_numeric_dtype(df[col]) and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
75
+ df[col] = df[col].apply(lambda x: round(x, 3 - int(math.floor(math.log10(abs(x)))) - 1))
76
  return df
77
 
78
 
79
  def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
80
  df = df.reset_index()
81
+ column_name = "Pipeline" if is_pipeline else "Model"
82
+ df = df.rename(columns={"index": column_name})
83
  df = add_rank(df, benchmark_version, selected_columns)
84
+
85
+ if is_pipeline and "_folder_name" in df.columns:
86
+ # For pipelines, use folder_name for link generation
87
+ df[column_name] = df.apply(
88
+ lambda row: make_clickable_model(row[column_name], is_pipeline=True, folder_name=row["_folder_name"]),
89
+ axis=1
90
+ )
91
+ df = df.drop(columns=["_folder_name"])
92
+ else:
93
+ df[column_name] = df[column_name].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
94
  # df = remove_duplicates(df)
95
  return df
96
 
 
135
 
136
  def filter_models(data, search_term):
137
  if search_term:
138
+ # Use "Pipeline" column for pipeline tables, "Model" for others
139
+ col_name = "Pipeline" if "Pipeline" in data.columns else "Model"
140
+ data = data[data[col_name].str.contains(search_term, case=False, na=False)]
141
  return data
data/pipeline_handler.py CHANGED
@@ -10,10 +10,12 @@ class PipelineHandler:
10
 
11
  def __init__(self):
12
  self.pipeline_infos = {}
 
13
  self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/metrics"
 
14
  self.available_datasets = []
15
  self.available_languages = ["english"] # Default languages available
16
-
17
  # Setup GitHub authentication if token is available
18
  self.github_token = os.environ.get("GITHUB_TOKEN")
19
  self.headers = {}
@@ -62,7 +64,7 @@ class PipelineHandler:
62
  def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
63
  """Fetch a JSON file from GitHub raw content."""
64
  url = f"{self.github_base_url}/{pipeline_name}/{filename}"
65
-
66
  try:
67
  response = requests.get(url, headers=self.headers)
68
  response.raise_for_status()
@@ -71,6 +73,22 @@ class PipelineHandler:
71
  print(f"Error fetching {filename} from {pipeline_name}: {e}")
72
  return None
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def get_pipeline_data(self):
75
  """Fetch all pipeline data from GitHub."""
76
  pipeline_folders = self.get_pipeline_folders_from_github()
@@ -99,6 +117,10 @@ class PipelineHandler:
99
 
100
  if pipeline_data:
101
  self.pipeline_infos[pipeline_name] = pipeline_data
 
 
 
 
102
 
103
  self.available_datasets = sorted(list(datasets_set))
104
  self.available_languages = sorted(list(languages_set))
@@ -226,21 +248,25 @@ class PipelineHandler:
226
 
227
  # Calculate average across datasets if there are multiple
228
  if dataset_metrics:
229
- row_data["Average"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
230
-
231
- pipeline_res[pipeline_name] = row_data
 
 
 
 
232
 
233
  if pipeline_res:
234
  df = pd.DataFrame(pipeline_res).T
235
  # Reorder columns to have Average right after timing metrics
236
  cols = list(df.columns)
237
- if "Average" in cols:
238
- cols.remove("Average")
239
  # Insert Average after Search latency (s/query)
240
  insert_pos = (
241
  cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
242
  )
243
- cols.insert(insert_pos, "Average")
244
  df = df[cols]
245
  return df
246
 
 
10
 
11
  def __init__(self):
12
  self.pipeline_infos = {}
13
+ self.pipeline_aliases = {} # Maps folder_name -> pipeline_alias for display
14
  self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/metrics"
15
+ self.github_descriptions_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/pipeline_descriptions"
16
  self.available_datasets = []
17
  self.available_languages = ["english"] # Default languages available
18
+
19
  # Setup GitHub authentication if token is available
20
  self.github_token = os.environ.get("GITHUB_TOKEN")
21
  self.headers = {}
 
64
  def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
65
  """Fetch a JSON file from GitHub raw content."""
66
  url = f"{self.github_base_url}/{pipeline_name}/{filename}"
67
+
68
  try:
69
  response = requests.get(url, headers=self.headers)
70
  response.raise_for_status()
 
73
  print(f"Error fetching {filename} from {pipeline_name}: {e}")
74
  return None
75
 
76
+ def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
77
+ """Fetch the pipeline_alias from description.json for a pipeline.
78
+
79
+ Uses raw.githubusercontent.com to avoid API rate limits.
80
+ """
81
+ url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"
82
+
83
+ try:
84
+ response = requests.get(url, headers=self.headers)
85
+ response.raise_for_status()
86
+ description = response.json()
87
+ return description.get("pipeline_alias")
88
+ except Exception as e:
89
+ print(f"Error fetching description for {pipeline_name}: {e}")
90
+ return None
91
+
92
  def get_pipeline_data(self):
93
  """Fetch all pipeline data from GitHub."""
94
  pipeline_folders = self.get_pipeline_folders_from_github()
 
117
 
118
  if pipeline_data:
119
  self.pipeline_infos[pipeline_name] = pipeline_data
120
+ # Fetch the pipeline alias for display (uses raw URL, not API)
121
+ alias = self.fetch_pipeline_alias(pipeline_name)
122
+ if alias:
123
+ self.pipeline_aliases[pipeline_name] = alias
124
 
125
  self.available_datasets = sorted(list(datasets_set))
126
  self.available_languages = sorted(list(languages_set))
 
248
 
249
  # Calculate average across datasets if there are multiple
250
  if dataset_metrics:
251
+ row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
252
+
253
+ # Use pipeline_alias for display if available, otherwise fallback to folder name
254
+ display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
255
+ # Store folder name for link generation (will be used in utils.py)
256
+ row_data["_folder_name"] = pipeline_name
257
+ pipeline_res[display_name] = row_data
258
 
259
  if pipeline_res:
260
  df = pd.DataFrame(pipeline_res).T
261
  # Reorder columns to have Average right after timing metrics
262
  cols = list(df.columns)
263
+ if "Average Score" in cols:
264
+ cols.remove("Average Score")
265
  # Insert Average after Search latency (s/query)
266
  insert_pos = (
267
  cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
268
  )
269
+ cols.insert(insert_pos, "Average Score")
270
  df = df[cols]
271
  return df
272