import re import gradio as gr import pandas as pd import plotly.express as px from app.utils import ( add_rank_and_format, deprecated_get_refresh_function, filter_models, get_pipeline_refresh_function, get_refresh_function, ) from data.deprecated_model_handler import DeprecatedModelHandler from data.model_handler import ModelHandler from data.pipeline_handler import PipelineHandler METRICS = [ "ndcg_at_1", "ndcg_at_5", "ndcg_at_10", "ndcg_at_100", "recall_at_1", "recall_at_5", "recall_at_10", "recall_at_100", ] def main(): # Get new results model_handler = ModelHandler() initial_metric = "ndcg_at_5" data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1) data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1) data_benchmark_2 = model_handler.render_df(initial_metric, benchmark_version=2) data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2) num_datasets_1 = len(data_benchmark_1.columns) - 3 num_scores_1 = len(data_benchmark_1) * num_datasets_1 num_models_1 = len(data_benchmark_1) num_datasets_2 = len(data_benchmark_2.columns) - 3 num_scores_2 = len(data_benchmark_2) * num_datasets_2 num_models_2 = len(data_benchmark_2) deprecated_model_handler = DeprecatedModelHandler() initial_metric = "ndcg_at_5" initial_metric_v3 = "ndcg_at_10" # Get pipeline evaluation results pipeline_handler = PipelineHandler() pipeline_handler.get_pipeline_data() data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english") data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True) num_datasets_pipeline = len(data_pipeline.columns) - 5 # Excluding Rank, Model, Indexing time, search time, Average num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline num_pipelines = len(data_pipeline) deprecated_model_handler.get_vidore_data(initial_metric) deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1) deprecated_data_benchmark_1 = add_rank_and_format(deprecated_data_benchmark_1, benchmark_version=1) deprecated_data_benchmark_2 = deprecated_model_handler.render_df(initial_metric, benchmark_version=2) deprecated_data_benchmark_2 = add_rank_and_format(deprecated_data_benchmark_2, benchmark_version=2) deprecated_num_datasets_1 = len(deprecated_data_benchmark_1.columns) - 3 deprecated_num_scores_1 = len(deprecated_data_benchmark_1) * deprecated_num_datasets_1 deprecated_num_models_1 = len(deprecated_data_benchmark_1) deprecated_num_datasets_2 = len(deprecated_data_benchmark_2.columns) - 3 deprecated_num_scores_2 = len(deprecated_data_benchmark_2) * deprecated_num_datasets_2 deprecated_num_models_2 = len(deprecated_data_benchmark_2) css = """ table > thead { white-space: normal } table { --cell-width-1: 250px } table > tbody > tr > td:nth-child(2) > div { overflow-x: auto } .filter-checkbox-group { max-width: max-content; } #markdown size .markdown { font-size: 1rem; } .alert-info { background-color: #e3f2fd; border-left: 4px solid #2196f3; padding: 5px 15px; } /* 1. Force text wrapping on all headers from the 3rd column onwards */ #pipeline-table table th:nth-child(n+3) * { white-space: normal !important; overflow: visible !important; text-overflow: clip !important; line-height: 1.2 !important; word-break: normal !important; /* Prevents breaking in the middle of words */ overflow-wrap: normal !important; /* Prevents breaking in the middle of words */ } /* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */ #pipeline-table table th:nth-child(n+3), #pipeline-table table td:nth-child(n+3) { min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */ max-width: 120px !important; } /* 3. Make the Model column (2nd column) wider for pipeline table */ #pipeline-table table th:nth-child(2), #pipeline-table table td:nth-child(2) { min-width: 400px !important; max-width: 500px !important; } """ with gr.Blocks(css=css) as block: with gr.Tabs() as tabs: with gr.TabItem("ViDoRe V3", id="vidore-v3"): gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷‍♂️") gr.Markdown( """ Visual Document Retrieval Benchmark 3 leaderboard. To submit results, refer to the corresponding tab. Refer to: - 🤗 The [blogpost](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3) for all the details on the datasets, - 🤗 The [dataset collection](https://huggingface.co/collections/vidore/vidore-benchmark-v3), - 📝 The [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics. """ ) gr.Markdown(""" As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)), we embed it here. """ ) gr.Markdown( """**💡 To display English-only results:** - Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*), - Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it), - The *eng-Latn* column will show English-only results (= English queries on English documents).""", elem_classes="alert-info" ) gr.HTML( """ """ ) with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"): gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️") gr.Markdown("### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines") gr.Markdown( """ This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine. To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs. ⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results. *Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).* """ ) if len(data_pipeline) > 0: datasets_columns_pipeline = [ col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"] ] with gr.Row(): metric_dropdown_pipeline = gr.Dropdown(choices=METRICS, value=initial_metric_v3, label="Select Metric") research_textbox_pipeline = gr.Textbox( placeholder="🔍 Search Pipelines... [press enter]", label="Filter Pipelines by Name", ) with gr.Row(): if datasets_columns_pipeline: column_checkboxes_pipeline = gr.CheckboxGroup( choices=datasets_columns_pipeline, value=datasets_columns_pipeline, label="Select Datasets to Display" ) else: column_checkboxes_pipeline = gr.CheckboxGroup( choices=[], value=[], label="Select Datasets to Display" ) with gr.Row(): # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline) dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table") def clean_pipeline_name(name): if not isinstance(name, str): return str(name) # Remove Markdown links [text](url) -> text name = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', name) # Remove HTML tags text -> text name = re.sub(r'<[^>]+>', '', name) return name.strip() def create_pipeline_plot(df, latency_col): if df is None or len(df) == 0: return None # Ensure expected columns exist if latency_col not in df.columns or "Average Score" not in df.columns or "Pipeline" not in df.columns: return None # Clean the dataframe for plotting plot_df = df.copy() # Strip HTML and Markdown for clean hover text plot_df["Cleaned Pipeline"] = plot_df["Pipeline"].apply(clean_pipeline_name) plot_df[latency_col] = pd.to_numeric(plot_df[latency_col], errors='coerce') plot_df["Average Score"] = pd.to_numeric(plot_df["Average Score"], errors='coerce') plot_df = plot_df.dropna(subset=[latency_col, "Average Score"]) plot_df = plot_df[plot_df[latency_col] > 0] plot_df = plot_df.sort_values(by=latency_col) if len(plot_df) == 0: return None fig = px.scatter( plot_df, x=latency_col, y="Average Score", hover_name="Cleaned Pipeline", # Use the clean text! title=f"Mean Performance vs {latency_col}", color_discrete_sequence=['orange'] ) fig.update_layout( xaxis_title=latency_col, yaxis_title="Average Score", plot_bgcolor='white', ) fig.update_xaxes(showgrid=True, gridcolor='lightgrey') fig.update_yaxes(showgrid=True, gridcolor='lightgrey') fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color='DarkSlateGrey'))) return fig with gr.Row(): latency_radio = gr.Radio( choices=["Search latency (s/query)", "Indexing latency (s/doc)"], value="Search latency (s/query)", label="Select Latency Metric for X-Axis" ) with gr.Row(): initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)") performance_plot = gr.Plot(value=initial_fig) def update_data_pipeline(metric, search_term, selected_columns): pipeline_handler.get_pipeline_data() data = pipeline_handler.render_df(metric, "english") data = add_rank_and_format(data, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True) data = filter_models(data, search_term) if selected_columns: # Include core columns plus selected dataset columns core_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"] if "Average Score" in data.columns: core_cols.insert(4, "Average Score") data = data[core_cols + selected_columns] return data with gr.Row(): refresh_button_pipeline = gr.Button("Refresh") refresh_button_pipeline.click( lambda metric: add_rank_and_format( pipeline_handler.render_df(metric, "english"), benchmark_version=3, is_pipeline=True ), inputs=[metric_dropdown_pipeline], outputs=dataframe_pipeline, concurrency_limit=20, ).then( fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot ) with gr.Row(): gr.Markdown( """ **Note**: These results represent full pipeline evaluations on english queries ONLY (since other queries were mostly directly translated from their english counterparts). We felt mutli-lingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since one user could just add a translation module to the pipeline and expect similar performance to the english results. If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository! """ ) # Automatically refresh the dataframe when the dropdown value changes def refresh_pipeline_data(metric): """Refresh pipeline data when metric changes.""" df = pipeline_handler.render_df(metric, "english") return add_rank_and_format(df, benchmark_version=3, is_pipeline=True) # Update dataframe and then update the plot metric_dropdown_pipeline.change( refresh_pipeline_data, inputs=[metric_dropdown_pipeline], outputs=dataframe_pipeline, ).then( fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot ) research_textbox_pipeline.submit( lambda metric, search_term, selected_columns: update_data_pipeline(metric, search_term, selected_columns), inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline], outputs=dataframe_pipeline, ).then( fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot ) column_checkboxes_pipeline.change( lambda metric, search_term, selected_columns: update_data_pipeline(metric, search_term, selected_columns), inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline], outputs=dataframe_pipeline, ).then( fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot ) # Update plot when the radio button changes latency_radio.change( fn=create_pipeline_plot, inputs=[dataframe_pipeline, latency_radio], outputs=performance_plot ) gr.Markdown( f""" - **Total Datasets**: {num_datasets_pipeline} - **Total Scores**: {num_scores_pipeline} - **Total Pipelines**: {num_pipelines} """ + r""" Please consider citing: ```bibtex @misc{faysse2024colpaliefficientdocumentretrieval, title={ColPali: Efficient Document Retrieval with Vision Language Models}, author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, year={2024}, eprint={2407.01449}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={[https://arxiv.org/abs/2407.01449](https://arxiv.org/abs/2407.01449)}, } @misc{loison2026vidore, title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios}, author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier}, journal={arXiv preprint arXiv:2601.08620}, year={2026} } ``` """ ) else: gr.Markdown("**No pipeline evaluation results available yet. Check back later!**") with gr.TabItem("ViDoRe V2", id="vidore-v2"): gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍") gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀") gr.Markdown( """ Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab. Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models. """ ) datasets_columns_2 = list(data_benchmark_2.columns[4:]) with gr.Row(): metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric") research_textbox_2 = gr.Textbox( placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", ) column_checkboxes_2 = gr.CheckboxGroup( choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display" ) with gr.Row(): datatype_2 = ["number", "markdown"] + ["number"] * (num_datasets_2 + 1) dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas") def update_data_2(metric, search_term, selected_columns): model_handler.get_vidore_data(metric) data = model_handler.render_df(metric, benchmark_version=2) data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns) data = filter_models(data, search_term) # data = remove_duplicates(data) # Add this line if selected_columns: data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns] return data with gr.Row(): refresh_button_2 = gr.Button("Refresh") refresh_button_2.click( get_refresh_function(model_handler, benchmark_version=2), inputs=[metric_dropdown_2], outputs=dataframe_2, concurrency_limit=20, ) with gr.Row(): gr.Markdown( """ **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side. Those numbers are not numbers obtained from the organisations that released those models. """ ) # Automatically refresh the dataframe when the dropdown value changes metric_dropdown_2.change( get_refresh_function(model_handler, benchmark_version=2), inputs=[metric_dropdown_2], outputs=dataframe_2, ) research_textbox_2.submit( lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns), inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2], outputs=dataframe_2, ) column_checkboxes_2.change( lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns), inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2], outputs=dataframe_2, ) gr.Markdown( f""" - **Total Datasets**: {num_datasets_2} - **Total Scores**: {num_scores_2} - **Total Models**: {num_models_2} """ + r""" Please consider citing: ```bibtex @misc{faysse2024colpaliefficientdocumentretrieval, title={ColPali: Efficient Document Retrieval with Vision Language Models}, author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, year={2024}, eprint={2407.01449}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2407.01449}, } @misc{macé2025vidorebenchmarkv2raising, title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval}, author={Quentin Macé and António Loison and Manuel Faysse}, year={2025}, eprint={2505.17166}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2505.17166}, } ``` """ ) with gr.TabItem("ViDoRe V1", id="vidore-v1"): gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍") gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀") gr.Markdown( """ Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab. Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models. """ ) datasets_columns_1 = list(data_benchmark_1.columns[4:]) with gr.Row(): metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric") research_textbox_1 = gr.Textbox( placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", ) column_checkboxes_1 = gr.CheckboxGroup( choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display" ) with gr.Row(): datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1) dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas") def update_data_1(metric, search_term, selected_columns): model_handler.get_vidore_data(metric) data = model_handler.render_df(metric, benchmark_version=1) data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns) data = filter_models(data, search_term) if selected_columns: data = data[["Rank", "Model", "Model Size (Million Parameters)", "Average Score"] + selected_columns] return data with gr.Row(): refresh_button_1 = gr.Button("Refresh") refresh_button_1.click( get_refresh_function(model_handler, benchmark_version=1), inputs=[metric_dropdown_1], outputs=dataframe_1, concurrency_limit=20, ) # Automatically refresh the dataframe when the dropdown value changes metric_dropdown_1.change( get_refresh_function(model_handler, benchmark_version=1), inputs=[metric_dropdown_1], outputs=dataframe_1, ) research_textbox_1.submit( lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns), inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1], outputs=dataframe_1, ) column_checkboxes_1.change( lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns), inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1], outputs=dataframe_1, ) gr.Markdown( f""" - **Total Datasets**: {num_datasets_1} - **Total Scores**: {num_scores_1} - **Total Models**: {num_models_1} """ + r""" Please consider citing: ```bibtex @misc{faysse2024colpaliefficientdocumentretrieval, title={ColPali: Efficient Document Retrieval with Vision Language Models}, author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, year={2024}, eprint={2407.01449}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2407.01449}, } @misc{macé2025vidorebenchmarkv2raising, title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval}, author={Quentin Macé and António Loison and Manuel Faysse}, year={2025}, eprint={2505.17166}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2505.17166}, } ``` """ ) with gr.TabItem("📚 Submit your model", id="submission"): gr.Markdown("# How to Submit a New Model to the Leaderboard") gr.Markdown( """ To submit a new model to the ViDoRe leaderboard, follow these steps: 1. **Evaluate your model**: - Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB. 2. **Format your submission file**: - Add the generated files to [MTEB results](https://github.com/embeddings-benchmark/results) project. Check the [Colpali results](https://github.com/embeddings-benchmark/results/tree/main/results/vidore__colpali-v1.3/1b5c8929330df1a66de441a9b5409a878f0de5b0) for an example. And you're done! Your model will appear on the leaderboard when you click refresh! Once the space gets rebooted, it will appear on startup. Note: For proper hyperlink redirection, please ensure that your model repository name is in kebab-case, e.g. `my-model-name`. """ ) with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"): gr.Markdown( "## Deprecation notice: This leaderboard contains the results computed with the " "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, " "which is no longer maintained. Results should be computed using the " "[mteb](https://github.com/embeddings-benchmark/mteb) package as described " "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md)." ) gr.Markdown("## Missing results in the new leaderboard are being added as they are re-computed.") gr.Markdown("# [Deprecated] ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍") gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀") gr.Markdown( """ Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab. Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models. """ ) deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:]) with gr.Row(): deprecated_metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric") deprecated_research_textbox_1 = gr.Textbox( placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", ) deprecated_column_checkboxes_1 = gr.CheckboxGroup( choices=deprecated_datasets_columns_1, value=deprecated_datasets_columns_1, label="Select Columns to Display" ) with gr.Row(): deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1) deprecated_dataframe_1 = gr.Dataframe(deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas") def deprecated_update_data_1(metric, search_term, selected_columns): deprecated_model_handler.get_vidore_data(metric) data = deprecated_model_handler.render_df(metric, benchmark_version=1) data = add_rank_and_format(data, benchmark_version=1, selected_columns=selected_columns) data = filter_models(data, search_term) # data = remove_duplicates(data) # Add this line if selected_columns: data = data[["Rank", "Model", "Average Score"] + selected_columns] return data with gr.Row(): deprecated_refresh_button_1 = gr.Button("Refresh") deprecated_refresh_button_1.click( deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1), inputs=[deprecated_metric_dropdown_1], outputs=deprecated_dataframe_1, concurrency_limit=20, ) # Automatically refresh the dataframe when the dropdown value changes deprecated_metric_dropdown_1.change( deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1), inputs=[deprecated_metric_dropdown_1], outputs=deprecated_dataframe_1, ) deprecated_research_textbox_1.submit( lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns), inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1], outputs=deprecated_dataframe_1, ) deprecated_column_checkboxes_1.change( lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns), inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1], outputs=deprecated_dataframe_1, ) gr.Markdown( f""" - **Total Datasets**: {deprecated_num_datasets_1} - **Total Scores**: {deprecated_num_scores_1} - **Total Models**: {deprecated_num_models_1} """ + r""" Please consider citing: ```bibtex @misc{faysse2024colpaliefficientdocumentretrieval, title={ColPali: Efficient Document Retrieval with Vision Language Models}, author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, year={2024}, eprint={2407.01449}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2407.01449}, } @misc{macé2025vidorebenchmarkv2raising, title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval}, author={Quentin Macé and António Loison and Manuel Faysse}, year={2025}, eprint={2505.17166}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2505.17166}, } ``` """ ) with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"): gr.Markdown( "## Deprecation notice: This leaderboard contains the results computed with the " "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, " "which is no longer maintained. Results should be computed using the " "[mteb](https://github.com/embeddings-benchmark/mteb) package as described " "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md)." ) gr.Markdown("## Missing results in the new leaderboard are being added as they are re-computed.") gr.Markdown("# [Deprecated] ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍") gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀") gr.Markdown( """ Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab. Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models. """ ) deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:]) with gr.Row(): deprecated_metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric") deprecated_research_textbox_2 = gr.Textbox( placeholder="🔍 Search Models... [press enter]", label="Filter Models by Name", ) deprecated_column_checkboxes_2 = gr.CheckboxGroup( choices=deprecated_datasets_columns_2, value=deprecated_datasets_columns_2, label="Select Columns to Display" ) with gr.Row(): deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1) deprecated_dataframe_2 = gr.Dataframe(deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas") def deprecated_update_data_2(metric, search_term, selected_columns): deprecated_model_handler.get_vidore_data(metric) data = deprecated_model_handler.render_df(metric, benchmark_version=2) data = add_rank_and_format(data, benchmark_version=2, selected_columns=selected_columns) data = filter_models(data, search_term) # data = remove_duplicates(data) # Add this line if selected_columns: data = data[["Rank", "Model", "Average Score"] + selected_columns] return data with gr.Row(): deprecated_refresh_button_2 = gr.Button("Refresh") deprecated_refresh_button_2.click( deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2), inputs=[deprecated_metric_dropdown_2], outputs=deprecated_dataframe_2, concurrency_limit=20, ) with gr.Row(): gr.Markdown( """ **Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side. Those numbers are not numbers obtained from the organisations that released those models. """ ) # Automatically refresh the dataframe when the dropdown value changes deprecated_metric_dropdown_2.change( deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2), inputs=[deprecated_metric_dropdown_2], outputs=deprecated_dataframe_2, ) deprecated_research_textbox_2.submit( lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns), inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2], outputs=deprecated_dataframe_2, ) deprecated_column_checkboxes_2.change( lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns), inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2], outputs=deprecated_dataframe_2, ) gr.Markdown( f""" - **Total Datasets**: {deprecated_num_datasets_2} - **Total Scores**: {deprecated_num_scores_2} - **Total Models**: {deprecated_num_models_2} """ + r""" Please consider citing: ```bibtex @misc{faysse2024colpaliefficientdocumentretrieval, title={ColPali: Efficient Document Retrieval with Vision Language Models}, author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, year={2024}, eprint={2407.01449}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2407.01449}, } @misc{macé2025vidorebenchmarkv2raising, title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval}, author={Quentin Macé and António Loison and Manuel Faysse}, year={2025}, eprint={2505.17166}, archivePrefix={arXiv}, primaryClass={cs.IR}, url={https://arxiv.org/abs/2505.17166}, } ``` """ ) def select_tab_from_url(request: gr.Request): # Grab query parameters from the URL query_params = dict(request.query_params) # Look for ?tab=..., default to the first tab's ID if not found target_tab = query_params.get("tab", "vidore-v3") # Update the tabs component to select the target ID return gr.update(selected=target_tab) block.load(select_tab_from_url, inputs=None, outputs=tabs) block.queue(max_size=10).launch(debug=True) if __name__ == "__main__": main()