Quentin Mace commited on
Commit
856deb1
·
1 Parent(s): 2b95769
Files changed (3) hide show
  1. app.py +17 -10
  2. app/utils.py +45 -18
  3. data/pipeline_handler.py +34 -8
app.py CHANGED
@@ -52,7 +52,7 @@ def main():
52
  data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
53
  data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
54
 
55
- num_datasets_pipeline = len(data_pipeline.columns) - 4 # Excluding Rank, Model, QPS, Average
56
  num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
57
  num_pipelines = len(data_pipeline)
58
 
@@ -115,6 +115,13 @@ def main():
115
  min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
116
  max-width: 120px !important;
117
  }
 
 
 
 
 
 
 
118
  """
119
 
120
  with gr.Blocks(css=css) as block:
@@ -187,7 +194,7 @@ def main():
187
 
188
  if len(data_pipeline) > 0:
189
  datasets_columns_pipeline = [
190
- col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average"]
191
  ]
192
 
193
  with gr.Row():
@@ -212,7 +219,7 @@ def main():
212
  )
213
 
214
  with gr.Row():
215
- # Datatype: Rank, Model, Total retrieval time (s), Indexing latency (s/doc), Search latency (s/query), Average + datasets
216
  datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline)
217
  dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table")
218
 
@@ -223,9 +230,9 @@ def main():
223
  data = filter_models(data, search_term)
224
  if selected_columns:
225
  # Include core columns plus selected dataset columns
226
- core_cols = ["Rank", "Model", "Indexing latency (s/doc)", "Search latency (s/query)"]
227
- if "Average" in data.columns:
228
- core_cols.insert(4, "Average")
229
  data = data[core_cols + selected_columns]
230
  return data
231
 
@@ -338,7 +345,7 @@ def main():
338
  data = filter_models(data, search_term)
339
  # data = remove_duplicates(data) # Add this line
340
  if selected_columns:
341
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
342
  return data
343
 
344
  with gr.Row():
@@ -440,7 +447,7 @@ def main():
440
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
441
  data = filter_models(data, search_term)
442
  if selected_columns:
443
- data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average"] + selected_columns]
444
  return data
445
 
446
  with gr.Row():
@@ -562,7 +569,7 @@ def main():
562
  data = filter_models(data, search_term)
563
  # data = remove_duplicates(data) # Add this line
564
  if selected_columns:
565
- data = data[["Rank", "Model", "Average"] + selected_columns]
566
  return data
567
 
568
  with gr.Row():
@@ -665,7 +672,7 @@ def main():
665
  data = filter_models(data, search_term)
666
  # data = remove_duplicates(data) # Add this line
667
  if selected_columns:
668
- data = data[["Rank", "Model", "Average"] + selected_columns]
669
  return data
670
 
671
  with gr.Row():
 
52
  data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
53
  data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
54
 
55
+ num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Pipeline, Indexing latency, Search latency, Average Score
56
  num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
57
  num_pipelines = len(data_pipeline)
58
 
 
115
  min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
116
  max-width: 120px !important;
117
  }
118
+
119
+ /* 3. Make the Pipeline column (2nd column) wider for pipeline table */
120
+ #pipeline-table table th:nth-child(2),
121
+ #pipeline-table table td:nth-child(2) {
122
+ min-width: 400px !important;
123
+ max-width: 500px !important;
124
+ }
125
  """
126
 
127
  with gr.Blocks(css=css) as block:
 
194
 
195
  if len(data_pipeline) > 0:
196
  datasets_columns_pipeline = [
197
+ col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
198
  ]
199
 
200
  with gr.Row():
 
219
  )
220
 
221
  with gr.Row():
222
+ # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
223
  datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline)
224
  dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table")
225
 
 
230
  data = filter_models(data, search_term)
231
  if selected_columns:
232
  # Include core columns plus selected dataset columns
233
+ core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
234
+ if "Average Score" in data.columns:
235
+ core_cols.insert(4, "Average Score")
236
  data = data[core_cols + selected_columns]
237
  return data
238
 
 
345
  data = filter_models(data, search_term)
346
  # data = remove_duplicates(data) # Add this line
347
  if selected_columns:
348
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns]
349
  return data
350
 
351
  with gr.Row():
 
447
  data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns)
448
  data = filter_models(data, search_term)
449
  if selected_columns:
450
+ data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns]
451
  return data
452
 
453
  with gr.Row():
 
569
  data = filter_models(data, search_term)
570
  # data = remove_duplicates(data) # Add this line
571
  if selected_columns:
572
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
573
  return data
574
 
575
  with gr.Row():
 
672
  data = filter_models(data, search_term)
673
  # data = remove_duplicates(data) # Add this line
674
  if selected_columns:
675
+ data = data[["Rank", "Model", "Average Score"] + selected_columns]
676
  return data
677
 
678
  with gr.Row():
app/utils.py CHANGED
@@ -1,12 +1,17 @@
 
 
1
 
2
-
3
- def make_clickable_model(model_name, link=None, is_pipeline=False):
4
  if is_pipeline:
5
- # For pipelines: keep underscores as-is, only process __ and -thisisapoint-
6
- desanitized_model_name = model_name.replace("__", "/")
7
- desanitized_model_name = desanitized_model_name.replace("-thisisapoint-", ".")
 
 
8
  if link is None:
9
- link = f"https://github.com/illuin-tech/vidore-benchmark/blob/vidore_v3_pipeline/results/pipeline_descriptions/{desanitized_model_name}/description.json"
 
 
10
  else:
11
  # For regular models: replace __ and _ with /, and -thisisapoint- with .
12
  desanitized_model_name = model_name.replace("__", "/")
@@ -25,7 +30,15 @@ def make_clickable_model(model_name, link=None, is_pipeline=False):
25
 
26
 
27
  def add_rank(df, benchmark_version=1, selected_columns=None):
28
- df.fillna(0.0, inplace=True)
 
 
 
 
 
 
 
 
29
  if selected_columns is None:
30
  cols_to_rank = [
31
  col
@@ -33,12 +46,14 @@ def add_rank(df, benchmark_version=1, selected_columns=None):
33
  if col
34
  not in [
35
  "Model",
 
36
  "Model Size (Million Parameters)",
37
  "Memory Usage (GB, fp32)",
38
  "Embedding Dimensions",
39
  "Max Tokens",
40
  "Compute Cost ($)",
41
  "Queries per Second",
 
42
  ]
43
  ]
44
  else:
@@ -48,24 +63,34 @@ def add_rank(df, benchmark_version=1, selected_columns=None):
48
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
49
  else:
50
  # Only add Average column if it doesn't already exist
51
- if "Average" not in df.columns:
52
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
53
- df.sort_values("Average", ascending=False, inplace=True)
54
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
55
- # multiply values by 100 if they are floats and round to 1 decimal place
56
  for col in df.columns:
57
- if df[col].dtype == "float64" and col not in ["Model Size (Million Parameters)", "Compute Cost ($)", "Queries per Second", "Indexing latency (s/doc)", "Search latency (s/query)"]:
58
- df[col] = df[col].apply(lambda x: round(x * 100, 1))
59
- elif df[col].dtype == "float64" and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
60
- df[col] = df[col].apply(lambda x: round(x, 2))
61
  return df
62
 
63
 
64
  def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
65
  df = df.reset_index()
66
- df = df.rename(columns={"index": "Model"})
 
67
  df = add_rank(df, benchmark_version, selected_columns)
68
- df["Model"] = df["Model"].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
 
 
 
 
 
 
 
 
 
69
  # df = remove_duplicates(df)
70
  return df
71
 
@@ -110,5 +135,7 @@ def get_pipeline_refresh_function(pipeline_handler):
110
 
111
  def filter_models(data, search_term):
112
  if search_term:
113
- data = data[data["Model"].str.contains(search_term, case=False, na=False)]
 
 
114
  return data
 
1
+ import pandas as pd
2
+ import math
3
 
4
+ def make_clickable_model(model_name, link=None, is_pipeline=False, folder_name=None):
 
5
  if is_pipeline:
6
+ # For pipelines: use folder_name for link, model_name (alias) for display
7
+ link_folder = folder_name if folder_name else model_name
8
+ # Process folder name for link: only handle __ and -thisisapoint-
9
+ desanitized_folder = link_folder.replace("__", "/")
10
+ desanitized_folder = desanitized_folder.replace("-thisisapoint-", ".")
11
  if link is None:
12
+ link = f"https://github.com/illuin-tech/vidore-benchmark/blob/vidore_v3_pipeline/results/pipeline_descriptions/{desanitized_folder}/description.json"
13
+ # Use word-wrap styling for potentially long pipeline aliases
14
+ return f'<a target="_blank" style="text-decoration: underline; word-wrap: break-word; white-space: normal; display: inline-block; max-width: 450px;" href="{link}">{model_name}</a>'
15
  else:
16
  # For regular models: replace __ and _ with /, and -thisisapoint- with .
17
  desanitized_model_name = model_name.replace("__", "/")
 
30
 
31
 
32
  def add_rank(df, benchmark_version=1, selected_columns=None):
33
+ # Convert numeric columns to proper float type (they may be 'object' dtype due to mixed data)
34
+ for col in df.columns:
35
+ if col not in ["Model", "Pipeline", "_folder_name"]:
36
+ df[col] = pd.to_numeric(df[col], errors="coerce")
37
+
38
+ # Only fill NaN for numeric columns to avoid issues with string columns like _folder_name
39
+ numeric_cols = df.select_dtypes(include=["float64", "int64", "float32", "int32"]).columns
40
+ df[numeric_cols] = df[numeric_cols].fillna(0.0)
41
+
42
  if selected_columns is None:
43
  cols_to_rank = [
44
  col
 
46
  if col
47
  not in [
48
  "Model",
49
+ "Pipeline", # For pipeline tables
50
  "Model Size (Million Parameters)",
51
  "Memory Usage (GB, fp32)",
52
  "Embedding Dimensions",
53
  "Max Tokens",
54
  "Compute Cost ($)",
55
  "Queries per Second",
56
+ "_folder_name", # Hidden column for pipeline link generation
57
  ]
58
  ]
59
  else:
 
63
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
64
  else:
65
  # Only add Average column if it doesn't already exist
66
+ if "Average Score" not in df.columns:
67
+ df.insert(len(df.columns) - len(cols_to_rank), "Average Score", df[cols_to_rank].mean(axis=1, skipna=False))
68
+ df.sort_values("Average Score", ascending=False, inplace=True)
69
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
70
+ # multiply values by 100 if they are floats and round to 3 significant figures
71
  for col in df.columns:
72
+ if pd.api.types.is_numeric_dtype(df[col]) and col not in ["Model Size (Million Parameters)", "Compute Cost ($)", "Queries per Second", "Indexing latency (s/doc)", "Search latency (s/query)", "Rank"]:
73
+ df[col] = df[col].apply(lambda x: round(x*100, 3 - int(math.floor(math.log10(abs(x*100)))) - 1))
74
+ elif pd.api.types.is_numeric_dtype(df[col]) and col in ["Indexing latency (s/doc)", "Search latency (s/query)"]:
75
+ df[col] = df[col].apply(lambda x: round(x, 3 - int(math.floor(math.log10(abs(x)))) - 1))
76
  return df
77
 
78
 
79
  def add_rank_and_format(df, benchmark_version=1, selected_columns=None, is_pipeline=False):
80
  df = df.reset_index()
81
+ column_name = "Pipeline" if is_pipeline else "Model"
82
+ df = df.rename(columns={"index": column_name})
83
  df = add_rank(df, benchmark_version, selected_columns)
84
+
85
+ if is_pipeline and "_folder_name" in df.columns:
86
+ # For pipelines, use folder_name for link generation
87
+ df[column_name] = df.apply(
88
+ lambda row: make_clickable_model(row[column_name], is_pipeline=True, folder_name=row["_folder_name"]),
89
+ axis=1
90
+ )
91
+ df = df.drop(columns=["_folder_name"])
92
+ else:
93
+ df[column_name] = df[column_name].apply(lambda x: make_clickable_model(x, is_pipeline=is_pipeline))
94
  # df = remove_duplicates(df)
95
  return df
96
 
 
135
 
136
  def filter_models(data, search_term):
137
  if search_term:
138
+ # Use "Pipeline" column for pipeline tables, "Model" for others
139
+ col_name = "Pipeline" if "Pipeline" in data.columns else "Model"
140
+ data = data[data[col_name].str.contains(search_term, case=False, na=False)]
141
  return data
data/pipeline_handler.py CHANGED
@@ -10,10 +10,12 @@ class PipelineHandler:
10
 
11
  def __init__(self):
12
  self.pipeline_infos = {}
 
13
  self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/metrics"
 
14
  self.available_datasets = []
15
  self.available_languages = ["english"] # Default languages available
16
-
17
  # Setup GitHub authentication if token is available
18
  self.github_token = os.environ.get("GITHUB_TOKEN")
19
  self.headers = {}
@@ -62,7 +64,7 @@ class PipelineHandler:
62
  def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
63
  """Fetch a JSON file from GitHub raw content."""
64
  url = f"{self.github_base_url}/{pipeline_name}/{filename}"
65
-
66
  try:
67
  response = requests.get(url, headers=self.headers)
68
  response.raise_for_status()
@@ -71,6 +73,22 @@ class PipelineHandler:
71
  print(f"Error fetching {filename} from {pipeline_name}: {e}")
72
  return None
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def get_pipeline_data(self):
75
  """Fetch all pipeline data from GitHub."""
76
  pipeline_folders = self.get_pipeline_folders_from_github()
@@ -99,6 +117,10 @@ class PipelineHandler:
99
 
100
  if pipeline_data:
101
  self.pipeline_infos[pipeline_name] = pipeline_data
 
 
 
 
102
 
103
  self.available_datasets = sorted(list(datasets_set))
104
  self.available_languages = sorted(list(languages_set))
@@ -226,21 +248,25 @@ class PipelineHandler:
226
 
227
  # Calculate average across datasets if there are multiple
228
  if dataset_metrics:
229
- row_data["Average"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
230
-
231
- pipeline_res[pipeline_name] = row_data
 
 
 
 
232
 
233
  if pipeline_res:
234
  df = pd.DataFrame(pipeline_res).T
235
  # Reorder columns to have Average right after timing metrics
236
  cols = list(df.columns)
237
- if "Average" in cols:
238
- cols.remove("Average")
239
  # Insert Average after Search latency (s/query)
240
  insert_pos = (
241
  cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
242
  )
243
- cols.insert(insert_pos, "Average")
244
  df = df[cols]
245
  return df
246
 
 
10
 
11
  def __init__(self):
12
  self.pipeline_infos = {}
13
+ self.pipeline_aliases = {} # Maps folder_name -> pipeline_alias for display
14
  self.github_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/metrics"
15
+ self.github_descriptions_base_url = "https://raw.githubusercontent.com/illuin-tech/vidore-benchmark/vidore_v3_pipeline/results/pipeline_descriptions"
16
  self.available_datasets = []
17
  self.available_languages = ["english"] # Default languages available
18
+
19
  # Setup GitHub authentication if token is available
20
  self.github_token = os.environ.get("GITHUB_TOKEN")
21
  self.headers = {}
 
64
  def fetch_json_from_github(self, pipeline_name: str, filename: str) -> Optional[Dict]:
65
  """Fetch a JSON file from GitHub raw content."""
66
  url = f"{self.github_base_url}/{pipeline_name}/{filename}"
67
+
68
  try:
69
  response = requests.get(url, headers=self.headers)
70
  response.raise_for_status()
 
73
  print(f"Error fetching {filename} from {pipeline_name}: {e}")
74
  return None
75
 
76
+ def fetch_pipeline_alias(self, pipeline_name: str) -> Optional[str]:
77
+ """Fetch the pipeline_alias from description.json for a pipeline.
78
+
79
+ Uses raw.githubusercontent.com to avoid API rate limits.
80
+ """
81
+ url = f"{self.github_descriptions_base_url}/{pipeline_name}/description.json"
82
+
83
+ try:
84
+ response = requests.get(url, headers=self.headers)
85
+ response.raise_for_status()
86
+ description = response.json()
87
+ return description.get("pipeline_alias")
88
+ except Exception as e:
89
+ print(f"Error fetching description for {pipeline_name}: {e}")
90
+ return None
91
+
92
  def get_pipeline_data(self):
93
  """Fetch all pipeline data from GitHub."""
94
  pipeline_folders = self.get_pipeline_folders_from_github()
 
117
 
118
  if pipeline_data:
119
  self.pipeline_infos[pipeline_name] = pipeline_data
120
+ # Fetch the pipeline alias for display (uses raw URL, not API)
121
+ alias = self.fetch_pipeline_alias(pipeline_name)
122
+ if alias:
123
+ self.pipeline_aliases[pipeline_name] = alias
124
 
125
  self.available_datasets = sorted(list(datasets_set))
126
  self.available_languages = sorted(list(languages_set))
 
248
 
249
  # Calculate average across datasets if there are multiple
250
  if dataset_metrics:
251
+ row_data["Average Score"] = round(sum(dataset_metrics.values()) / len(dataset_metrics), 4)
252
+
253
+ # Use pipeline_alias for display if available, otherwise fallback to folder name
254
+ display_name = self.pipeline_aliases.get(pipeline_name, pipeline_name)
255
+ # Store folder name for link generation (will be used in utils.py)
256
+ row_data["_folder_name"] = pipeline_name
257
+ pipeline_res[display_name] = row_data
258
 
259
  if pipeline_res:
260
  df = pd.DataFrame(pipeline_res).T
261
  # Reorder columns to have Average right after timing metrics
262
  cols = list(df.columns)
263
+ if "Average Score" in cols:
264
+ cols.remove("Average Score")
265
  # Insert Average after Search latency (s/query)
266
  insert_pos = (
267
  cols.index("Search latency (s/query)") + 1 if "Search latency (s/query)" in cols else 2
268
  )
269
+ cols.insert(insert_pos, "Average Score")
270
  df = df[cols]
271
  return df
272