import re
import gradio as gr
import pandas as pd
import plotly.express as px
from app.utils import (
add_rank_and_format,
deprecated_get_refresh_function,
filter_models,
get_pipeline_refresh_function,
get_refresh_function,
)
from data.deprecated_model_handler import DeprecatedModelHandler
from data.model_handler import ModelHandler
from data.pipeline_handler import PipelineHandler
# Retrieval metrics selectable in every leaderboard tab's metric dropdown.
METRICS = [
    "ndcg_at_1",
    "ndcg_at_5",
    "ndcg_at_10",
    "ndcg_at_100",
    "recall_at_1",
    "recall_at_5",
    "recall_at_10",
    "recall_at_100",
]
def main():
    """Build the ViDoRe leaderboard UI (all tabs) and launch the Gradio app."""
    # Get new results (current, MTEB-based leaderboards)
    model_handler = ModelHandler()
    initial_metric = "ndcg_at_5"
    data_benchmark_1 = model_handler.render_df(initial_metric, benchmark_version=1)
    data_benchmark_1 = add_rank_and_format(data_benchmark_1, benchmark_version=1)
    data_benchmark_2 = model_handler.render_df(initial_metric, benchmark_version=2)
    data_benchmark_2 = add_rank_and_format(data_benchmark_2, benchmark_version=2)
    # Summary stats displayed in each tab's footer.
    # NOTE(review): the dataset checkboxes later use columns[4:] (four leading
    # meta columns) while the count here subtracts 3 -- this may overcount
    # "Total Datasets" by one; verify against the rendered schema.
    num_datasets_1 = len(data_benchmark_1.columns) - 3
    num_scores_1 = len(data_benchmark_1) * num_datasets_1
    num_models_1 = len(data_benchmark_1)
    num_datasets_2 = len(data_benchmark_2.columns) - 3
    num_scores_2 = len(data_benchmark_2) * num_datasets_2
    num_models_2 = len(data_benchmark_2)
    # Legacy results computed with the unmaintained vidore-benchmark package.
    deprecated_model_handler = DeprecatedModelHandler()
    initial_metric = "ndcg_at_5"  # re-assigned to the same value as above
    initial_metric_v3 = "ndcg_at_10"  # the V3 pipeline tab defaults to ndcg@10
    # Get pipeline evaluation results (ViDoRe V3, English-only queries)
    pipeline_handler = PipelineHandler()
    pipeline_handler.get_pipeline_data()
    data_pipeline = pipeline_handler.render_df(initial_metric_v3, "english")
    data_pipeline = add_rank_and_format(data_pipeline, benchmark_version=3, is_pipeline=True)
    num_datasets_pipeline = len(data_pipeline.columns) - 5  # Excluding Rank, Model, Indexing time, search time, Average
    num_scores_pipeline = len(data_pipeline) * num_datasets_pipeline
    num_pipelines = len(data_pipeline)
    deprecated_model_handler.get_vidore_data(initial_metric)
    deprecated_data_benchmark_1 = deprecated_model_handler.render_df(initial_metric, benchmark_version=1)
    deprecated_data_benchmark_1 = add_rank_and_format(deprecated_data_benchmark_1, benchmark_version=1)
    deprecated_data_benchmark_2 = deprecated_model_handler.render_df(initial_metric, benchmark_version=2)
    deprecated_data_benchmark_2 = add_rank_and_format(deprecated_data_benchmark_2, benchmark_version=2)
    # Same footer stats for the deprecated tables (their core columns are
    # Rank / Model / Average Score -- see deprecated_update_data_*).
    deprecated_num_datasets_1 = len(deprecated_data_benchmark_1.columns) - 3
    deprecated_num_scores_1 = len(deprecated_data_benchmark_1) * deprecated_num_datasets_1
    deprecated_num_models_1 = len(deprecated_data_benchmark_1)
    deprecated_num_datasets_2 = len(deprecated_data_benchmark_2.columns) - 3
    deprecated_num_scores_2 = len(deprecated_data_benchmark_2) * deprecated_num_datasets_2
    deprecated_num_models_2 = len(deprecated_data_benchmark_2)
# Custom CSS: header wrapping, column sizing, the info-callout style, and
# width/wrapping tweaks scoped to the pipeline table (#pipeline-table).
# Fix: "#markdown size" was not a valid CSS comment -- CSS comments are
# /* ... */ -- so it fused with the following ".markdown" rule into the
# selector "#markdown size .markdown", and the font-size rule never applied.
css = """
table > thead {
white-space: normal
}
table {
--cell-width-1: 250px
}
table > tbody > tr > td:nth-child(2) > div {
overflow-x: auto
}
.filter-checkbox-group {
max-width: max-content;
}
/* markdown size */
.markdown {
font-size: 1rem;
}
.alert-info {
background-color: #e3f2fd;
border-left: 4px solid #2196f3;
padding: 5px 15px;
}
/* 1. Force text wrapping on all headers from the 3rd column onwards */
#pipeline-table table th:nth-child(n+3) * {
white-space: normal !important;
overflow: visible !important;
text-overflow: clip !important;
line-height: 1.2 !important;
word-break: normal !important; /* Prevents breaking in the middle of words */
overflow-wrap: normal !important; /* Prevents breaking in the middle of words */
}
/* 2. Set a fixed width and center alignment for headers and data cells from the 3rd column onwards */
#pipeline-table table th:nth-child(n+3),
#pipeline-table table td:nth-child(n+3) {
min-width: 100px !important; /* Just wide enough to fit long words like "Pharmaceuticals" */
max-width: 120px !important;
}
/* 3. Make the Model column (2nd column) wider for pipeline table */
#pipeline-table table th:nth-child(2),
#pipeline-table table td:nth-child(2) {
min-width: 400px !important;
max-width: 500px !important;
}
"""
with gr.Blocks(css=css) as block:
    with gr.Tabs() as tabs:
        # Tab: ViDoRe V3 -- reference results live on the MTEB leaderboard,
        # so this tab only hosts explanatory markdown (and an HTML slot).
        with gr.TabItem("ViDoRe V3", id="vidore-v3"):
            gr.Markdown("# ViDoRe V3: a comprehensive evaluation of retrieval for enterprise use-case 👷♂️")
            gr.Markdown(
                """
Visual Document Retrieval Benchmark 3 leaderboard. To submit results, refer to the corresponding tab.
Refer to:
- 🤗 The [blogpost](https://huggingface.co/blog/QuentinJG/introducing-vidore-v3) for all the details on the datasets,
- 🤗 The [dataset collection](https://huggingface.co/collections/vidore/vidore-benchmark-v3),
- 📝 The [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics.
"""
            )
            gr.Markdown("""
As the reference results are now hosted on the [MTEB Leaderboard](https://mteb-leaderboard.hf.space/?benchmark_name=ViDoRe(v3)),
we embed it here.
""" )
            gr.Markdown(
                """**💡 To display English-only results:**
- Under *Customize this Benchmark*, unselect the French datasets (*Vidore3EnergyRetrieval*, *Vidore3FinanceFrRetrieval*, *Vidore3PhysicsRetrieval*),
- Go to the *Performance per language* tab (you might have to click on the three dots on the right of the tab bar to see it),
- The *eng-Latn* column will show English-only results (= English queries on English documents).""",
                elem_classes="alert-info"
            )
            # NOTE(review): this HTML block is empty -- presumably a placeholder
            # for the embedded MTEB iframe; confirm whether content is intended.
            gr.HTML(
                """
"""
            )
        # Tab: ViDoRe V3 pipeline evaluation (English-only queries).
        with gr.TabItem("ViDoRe V3 (Pipeline)", id="vidore-v3-pipeline"):
            gr.Markdown("# ViDoRe V3 (Pipeline Evaluation): Retrieval Performance for Complex Pipelines ⚙️")
            gr.Markdown("### Assessing retrieval performance, latency, and compute costs of complex retrieval pipelines")
            gr.Markdown(
                """
This leaderboard ranks full retrieval pipelines on **English-only queries** for **ViDoRe V3**. Instead of just testing standalone models, we evaluate real-world, multi-step retrieval systems. This includes everything from basic retrievers to advanced setups using AI agents, query reformulation, hybrid search, and any other creative retrieval pipeline one can imagine.
To show efficiency alongside accuracy, we include **Indexing latency** (seconds/doc) and **Search latency** (seconds/query). These numbers are self-reported and depend heavily on hardware, so treat them as rough estimates. Still, they give a helpful idea of how much computing power each pipeline needs.
⚠️ **Note:** Because this only uses English queries, these scores **cannot be directly compared** to the standard ViDoRe V3 results.
*Results are sourced from the [vidore-benchmark repository](https://github.com/illuin-tech/vidore-benchmark/tree/main/results).*
"""
            )
            # Only build the interactive widgets when pipeline results exist.
            if len(data_pipeline) > 0:
                # Dataset columns: everything past the 4 leading columns,
                # minus the latency and average columns mixed in with them.
                datasets_columns_pipeline = [
                    col for col in data_pipeline.columns[4:] if col not in ["Indexing latency (s/doc)", "Search latency (s/query)", "Average Score"]
                ]
                with gr.Row():
                    metric_dropdown_pipeline = gr.Dropdown(choices=METRICS, value=initial_metric_v3, label="Select Metric")
                    research_textbox_pipeline = gr.Textbox(
                        placeholder="🔍 Search Pipelines... [press enter]",
                        label="Filter Pipelines by Name",
                    )
                with gr.Row():
                    if datasets_columns_pipeline:
                        column_checkboxes_pipeline = gr.CheckboxGroup(
                            choices=datasets_columns_pipeline,
                            value=datasets_columns_pipeline,
                            label="Select Datasets to Display"
                        )
                    else:
                        # Degenerate case: no dataset columns detected.
                        column_checkboxes_pipeline = gr.CheckboxGroup(
                            choices=[],
                            value=[],
                            label="Select Datasets to Display"
                        )
                with gr.Row():
                    # Datatype: Rank, Pipeline, Indexing latency (s/doc), Search latency (s/query), Average + datasets
                    datatype_pipeline = ["number", "markdown", "number", "number", "number"] + ["number"] * len(datasets_columns_pipeline)
                    dataframe_pipeline = gr.Dataframe(data_pipeline, datatype=datatype_pipeline, type="pandas", elem_id="pipeline-table")
def clean_pipeline_name(name):
    """Return *name* stripped of Markdown link syntax and HTML tags.

    Non-string inputs are coerced with ``str``; surrounding whitespace
    is removed from the result.
    """
    if not isinstance(name, str):
        return str(name)
    # Turn Markdown links [text](url) into their bare text.
    without_links = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', name)
    # Drop HTML tags, keeping only the enclosed text.
    plain = re.sub(r'<[^>]+>', '', without_links)
    return plain.strip()
def create_pipeline_plot(df, latency_col):
    """Scatter-plot "Average Score" against the chosen latency column.

    Returns a plotly figure, or None when *df* is empty, lacks the
    required columns, or has no rows with a positive numeric latency.
    """
    if df is None or len(df) == 0:
        return None
    # All three columns must be present to build the plot.
    required = (latency_col, "Average Score", "Pipeline")
    if any(col not in df.columns for col in required):
        return None
    # Work on a copy so the displayed dataframe stays untouched.
    points = df.copy()
    # Hover labels must not carry Markdown/HTML markup.
    points["Cleaned Pipeline"] = points["Pipeline"].apply(clean_pipeline_name)
    for col in (latency_col, "Average Score"):
        points[col] = pd.to_numeric(points[col], errors='coerce')
    points = points.dropna(subset=[latency_col, "Average Score"])
    # Keep only positive latencies, ordered along the x-axis.
    points = points[points[latency_col] > 0].sort_values(by=latency_col)
    if len(points) == 0:
        return None
    fig = px.scatter(
        points,
        x=latency_col,
        y="Average Score",
        hover_name="Cleaned Pipeline",  # plain-text pipeline names
        title=f"Mean Performance vs {latency_col}",
        color_discrete_sequence=['orange'],
    )
    fig.update_layout(
        xaxis_title=latency_col,
        yaxis_title="Average Score",
        plot_bgcolor='white',
    )
    fig.update_xaxes(showgrid=True, gridcolor='lightgrey')
    fig.update_yaxes(showgrid=True, gridcolor='lightgrey')
    fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=1, color='DarkSlateGrey')))
    return fig
# Let the user pick which latency metric drives the scatter plot x-axis.
with gr.Row():
    latency_radio = gr.Radio(
        choices=["Search latency (s/query)", "Indexing latency (s/doc)"],
        value="Search latency (s/query)",
        label="Select Latency Metric for X-Axis"
    )
with gr.Row():
    # Initial score-vs-latency plot, matching the radio default above.
    initial_fig = create_pipeline_plot(data_pipeline, "Search latency (s/query)")
    performance_plot = gr.Plot(value=initial_fig)
def update_data_pipeline(metric, search_term, selected_columns):
    """Re-fetch, rank, name-filter and column-select the pipeline table."""
    pipeline_handler.get_pipeline_data()
    table = pipeline_handler.render_df(metric, "english")
    table = add_rank_and_format(table, benchmark_version=3, selected_columns=selected_columns, is_pipeline=True)
    table = filter_models(table, search_term)
    if selected_columns:
        # Always keep the identity and latency columns in front of the
        # user-selected dataset columns.
        base_cols = ["Rank", "Pipeline", "Indexing latency (s/doc)", "Search latency (s/query)"]
        if "Average Score" in table.columns:
            base_cols.insert(4, "Average Score")
        table = table[base_cols + selected_columns]
    return table
with gr.Row():
    refresh_button_pipeline = gr.Button("Refresh")
    # Re-render the table for the current metric, then rebuild the plot
    # from the refreshed dataframe component.
    refresh_button_pipeline.click(
        lambda metric: add_rank_and_format(
            pipeline_handler.render_df(metric, "english"),
            benchmark_version=3,
            is_pipeline=True
        ),
        inputs=[metric_dropdown_pipeline],
        outputs=dataframe_pipeline,
        concurrency_limit=20,
    ).then(
        fn=create_pipeline_plot,
        inputs=[dataframe_pipeline, latency_radio],
        outputs=performance_plot
    )
# User-facing note on the English-only scope of pipeline results.
# Fix: typo "mutli-lingual" -> "multi-lingual" in the displayed text.
with gr.Row():
    gr.Markdown(
        """
**Note**: These results represent full pipeline evaluations on english queries ONLY (since other queries were mostly directly translated from their english counterparts).
We felt multi-lingual results were less critical (and much more costly to evaluate on the full set) for pipelines, since one user could just add a translation module to the pipeline and expect similar performance to the english results.
If you feel this is a mistake and multi-lingual results are critical for pipelines, please let us know by opening an issue in the GitHub repository!
"""
    )
# Refresh the pipeline table whenever the metric dropdown changes.
def refresh_pipeline_data(metric):
    """Re-render the pipeline leaderboard for the newly selected metric."""
    rendered = pipeline_handler.render_df(metric, "english")
    return add_rank_and_format(rendered, benchmark_version=3, is_pipeline=True)
# Every table update is chained with a plot rebuild via .then(), so the
# scatter plot always reflects the currently displayed dataframe.
metric_dropdown_pipeline.change(
    refresh_pipeline_data,
    inputs=[metric_dropdown_pipeline],
    outputs=dataframe_pipeline,
).then(
    fn=create_pipeline_plot,
    inputs=[dataframe_pipeline, latency_radio],
    outputs=performance_plot
)
# Name search: filter pipelines, then refresh the plot.
research_textbox_pipeline.submit(
    lambda metric, search_term, selected_columns: update_data_pipeline(metric, search_term, selected_columns),
    inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
    outputs=dataframe_pipeline,
).then(
    fn=create_pipeline_plot,
    inputs=[dataframe_pipeline, latency_radio],
    outputs=performance_plot
)
# Dataset checkboxes: re-select columns, then refresh the plot.
column_checkboxes_pipeline.change(
    lambda metric, search_term, selected_columns: update_data_pipeline(metric, search_term, selected_columns),
    inputs=[metric_dropdown_pipeline, research_textbox_pipeline, column_checkboxes_pipeline],
    outputs=dataframe_pipeline,
).then(
    fn=create_pipeline_plot,
    inputs=[dataframe_pipeline, latency_radio],
    outputs=performance_plot
)
# Update plot when the radio button changes
latency_radio.change(
    fn=create_pipeline_plot,
    inputs=[dataframe_pipeline, latency_radio],
    outputs=performance_plot
)
# Tab footer: dataset/score/pipeline counts plus the citation block.
# Fix: the ColPali entry's url field wrapped the URL in a Markdown link
# ("url={[...](...)}"), which renders literally inside the ```bibtex```
# code fence and breaks the copied BibTeX; use a plain URL like the
# identical citation blocks in the V1/V2 tabs.
gr.Markdown(
    f"""
- **Total Datasets**: {num_datasets_pipeline}
- **Total Scores**: {num_scores_pipeline}
- **Total Pipelines**: {num_pipelines}
"""
    + r"""
Please consider citing:
```bibtex
@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
@misc{loison2026vidore,
title={ViDoRe V3: A Comprehensive Evaluation of Retrieval Augmented Generation in Complex Real-World Scenarios},
author={Loison, Ant{\'o}nio and Mac{\'e}, Quentin and Edy, Antoine and Xing, Victor and Balough, Tom and Moreira, Gabriel and Liu, Bo and Faysse, Manuel and Hudelot, C{\'e}line and Viaud, Gautier},
journal={arXiv preprint arXiv:2601.08620},
year={2026}
}
```
"""
)
else:
    # No pipeline results fetched -- show a placeholder instead of widgets.
    gr.Markdown("**No pipeline evaluation results available yet. Check back later!**")
# Tab: ViDoRe V2 (current, MTEB-based results).
with gr.TabItem("ViDoRe V2", id="vidore-v2"):
    gr.Markdown("# ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
    gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
    gr.Markdown(
        """
Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
"""
    )
    # Dataset columns follow the 4 leading meta columns.
    datasets_columns_2 = list(data_benchmark_2.columns[4:])
    with gr.Row():
        metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
        research_textbox_2 = gr.Textbox(
            placeholder="🔍 Search Models... [press enter]",
            label="Filter Models by Name",
        )
        column_checkboxes_2 = gr.CheckboxGroup(
            choices=datasets_columns_2, value=datasets_columns_2, label="Select Columns to Display"
        )
    with gr.Row():
        # Rank is numeric, Model renders as markdown (links); rest are scores.
        datatype_2 = ["number", "markdown"] + ["number"] * (num_datasets_2 + 1)
        dataframe_2 = gr.Dataframe(data_benchmark_2, datatype=datatype_2, type="pandas")
def update_data_2(metric, search_term, selected_columns):
    """Refresh the ViDoRe V2 table for the current metric/search/columns."""
    model_handler.get_vidore_data(metric)
    table = model_handler.render_df(metric, benchmark_version=2)
    table = add_rank_and_format(table, benchmark_version=2, selected_columns=selected_columns)
    table = filter_models(table, search_term)
    if selected_columns:
        # Keep the fixed meta columns ahead of the chosen dataset columns.
        fixed_cols = ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"]
        table = table[fixed_cols + selected_columns]
    return table
with gr.Row():
    refresh_button_2 = gr.Button("Refresh")
    # Manual refresh: re-render the V2 table for the current metric.
    refresh_button_2.click(
        get_refresh_function(model_handler, benchmark_version=2),
        inputs=[metric_dropdown_2],
        outputs=dataframe_2,
        concurrency_limit=20,
    )
with gr.Row():
    gr.Markdown(
        """
**Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
Those numbers are not numbers obtained from the organisations that released those models.
"""
    )
# Automatically refresh the dataframe when the dropdown value changes
metric_dropdown_2.change(
    get_refresh_function(model_handler, benchmark_version=2),
    inputs=[metric_dropdown_2],
    outputs=dataframe_2,
)
# Name search / column selection both route through update_data_2.
research_textbox_2.submit(
    lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
    inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
    outputs=dataframe_2,
)
column_checkboxes_2.change(
    lambda metric, search_term, selected_columns: update_data_2(metric, search_term, selected_columns),
    inputs=[metric_dropdown_2, research_textbox_2, column_checkboxes_2],
    outputs=dataframe_2,
)
# Tab footer: dataset/score/model counts plus the citation block.
gr.Markdown(
    f"""
- **Total Datasets**: {num_datasets_2}
- **Total Scores**: {num_scores_2}
- **Total Models**: {num_models_2}
"""
    + r"""
Please consider citing:
```bibtex
@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
@misc{macé2025vidorebenchmarkv2raising,
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
author={Quentin Macé and António Loison and Manuel Faysse},
year={2025},
eprint={2505.17166},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.17166},
}
```
"""
)
# Tab: ViDoRe V1 (current, MTEB-based results).
with gr.TabItem("ViDoRe V1", id="vidore-v1"):
    gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
    gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
    gr.Markdown(
        """
Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
"""
    )
    # Dataset columns follow the 4 leading meta columns.
    datasets_columns_1 = list(data_benchmark_1.columns[4:])
    with gr.Row():
        metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
        research_textbox_1 = gr.Textbox(
            placeholder="🔍 Search Models... [press enter]",
            label="Filter Models by Name",
        )
        column_checkboxes_1 = gr.CheckboxGroup(
            choices=datasets_columns_1, value=datasets_columns_1, label="Select Columns to Display"
        )
    with gr.Row():
        # Rank is numeric, Model renders as markdown (links); rest are scores.
        datatype_1 = ["number", "markdown"] + ["number"] * (num_datasets_1 + 1)
        dataframe_1 = gr.Dataframe(data_benchmark_1, datatype=datatype_1, type="pandas")
def update_data_1(metric, search_term, selected_columns):
    """Refresh the ViDoRe V1 table for the current metric/search/columns."""
    model_handler.get_vidore_data(metric)
    table = model_handler.render_df(metric, benchmark_version=1)
    table = add_rank_and_format(table, benchmark_version=1, selected_columns=selected_columns)
    table = filter_models(table, search_term)
    if selected_columns:
        # Keep the fixed meta columns ahead of the chosen dataset columns.
        fixed_cols = ["Rank", "Model", "Model Size (Million Parameters)", "Average Score"]
        table = table[fixed_cols + selected_columns]
    return table
with gr.Row():
    refresh_button_1 = gr.Button("Refresh")
    # Manual refresh: re-render the V1 table for the current metric.
    refresh_button_1.click(
        get_refresh_function(model_handler, benchmark_version=1),
        inputs=[metric_dropdown_1],
        outputs=dataframe_1,
        concurrency_limit=20,
    )
# Automatically refresh the dataframe when the dropdown value changes
metric_dropdown_1.change(
    get_refresh_function(model_handler, benchmark_version=1),
    inputs=[metric_dropdown_1],
    outputs=dataframe_1,
)
# Name search / column selection both route through update_data_1.
research_textbox_1.submit(
    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
    outputs=dataframe_1,
)
column_checkboxes_1.change(
    lambda metric, search_term, selected_columns: update_data_1(metric, search_term, selected_columns),
    inputs=[metric_dropdown_1, research_textbox_1, column_checkboxes_1],
    outputs=dataframe_1,
)
# Tab footer: dataset/score/model counts plus the citation block.
gr.Markdown(
    f"""
- **Total Datasets**: {num_datasets_1}
- **Total Scores**: {num_scores_1}
- **Total Models**: {num_models_1}
"""
    + r"""
Please consider citing:
```bibtex
@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
@misc{macé2025vidorebenchmarkv2raising,
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
author={Quentin Macé and António Loison and Manuel Faysse},
year={2025},
eprint={2505.17166},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.17166},
}
```
"""
)
# Tab: submission instructions (results flow through the MTEB results repo).
with gr.TabItem("📚 Submit your model", id="submission"):
    gr.Markdown("# How to Submit a New Model to the Leaderboard")
    gr.Markdown(
        """
To submit a new model to the ViDoRe leaderboard, follow these steps:
1. **Evaluate your model**:
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
2. **Format your submission file**:
- Add the generated files to [MTEB results](https://github.com/embeddings-benchmark/results) project. Check the [Colpali results](https://github.com/embeddings-benchmark/results/tree/main/results/vidore__colpali-v1.3/1b5c8929330df1a66de441a9b5409a878f0de5b0) for an example.
And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
gets rebooted, it will appear on startup.
Note: For proper hyperlink redirection, please ensure that your model repository name is in
kebab-case, e.g. `my-model-name`.
"""
    )
# Tab: deprecated ViDoRe V1 (legacy vidore-benchmark results).
with gr.TabItem("Deprecated ViDoRe V1", id="vidore-v1-deprecated"):
    gr.Markdown(
        "## Deprecation notice: This leaderboard contains the results computed with the "
        "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
        "which is no longer maintained. Results should be computed using the "
        "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
        "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md)."
    )
    gr.Markdown("## Missing results in the new leaderboard are being added as they are re-computed.")
    gr.Markdown("# [Deprecated] ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍")
    gr.Markdown("### From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
    gr.Markdown(
        """
Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
"""
    )
    # Deprecated tables use 3 leading meta columns (no Model Size), so
    # dataset columns start at index 3 here.
    deprecated_datasets_columns_1 = list(deprecated_data_benchmark_1.columns[3:])
    with gr.Row():
        deprecated_metric_dropdown_1 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
        deprecated_research_textbox_1 = gr.Textbox(
            placeholder="🔍 Search Models... [press enter]",
            label="Filter Models by Name",
        )
        deprecated_column_checkboxes_1 = gr.CheckboxGroup(
            choices=deprecated_datasets_columns_1, value=deprecated_datasets_columns_1, label="Select Columns to Display"
        )
    with gr.Row():
        deprecated_datatype_1 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_1 + 1)
        deprecated_dataframe_1 = gr.Dataframe(deprecated_data_benchmark_1, datatype=deprecated_datatype_1, type="pandas")
def deprecated_update_data_1(metric, search_term, selected_columns):
    """Refresh the deprecated ViDoRe V1 table (vidore-benchmark results)."""
    deprecated_model_handler.get_vidore_data(metric)
    table = deprecated_model_handler.render_df(metric, benchmark_version=1)
    table = add_rank_and_format(table, benchmark_version=1, selected_columns=selected_columns)
    table = filter_models(table, search_term)
    if selected_columns:
        # Deprecated tables expose only Rank/Model/Average as meta columns.
        table = table[["Rank", "Model", "Average Score"] + selected_columns]
    return table
with gr.Row():
    deprecated_refresh_button_1 = gr.Button("Refresh")
    # Manual refresh of the deprecated V1 table.
    deprecated_refresh_button_1.click(
        deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
        inputs=[deprecated_metric_dropdown_1],
        outputs=deprecated_dataframe_1,
        concurrency_limit=20,
    )
# Automatically refresh the dataframe when the dropdown value changes
deprecated_metric_dropdown_1.change(
    deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=1),
    inputs=[deprecated_metric_dropdown_1],
    outputs=deprecated_dataframe_1,
)
# Name search / column selection both route through deprecated_update_data_1.
deprecated_research_textbox_1.submit(
    lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
    inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
    outputs=deprecated_dataframe_1,
)
deprecated_column_checkboxes_1.change(
    lambda metric, search_term, selected_columns: deprecated_update_data_1(metric, search_term, selected_columns),
    inputs=[deprecated_metric_dropdown_1, deprecated_research_textbox_1, deprecated_column_checkboxes_1],
    outputs=deprecated_dataframe_1,
)
# Tab footer: counts plus the citation block.
gr.Markdown(
    f"""
- **Total Datasets**: {deprecated_num_datasets_1}
- **Total Scores**: {deprecated_num_scores_1}
- **Total Models**: {deprecated_num_models_1}
"""
    + r"""
Please consider citing:
```bibtex
@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
@misc{macé2025vidorebenchmarkv2raising,
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
author={Quentin Macé and António Loison and Manuel Faysse},
year={2025},
eprint={2505.17166},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.17166},
}
```
"""
)
# Tab: deprecated ViDoRe V2 (legacy vidore-benchmark results).
with gr.TabItem("Deprecated ViDoRe V2", id="vidore-v2-deprecated"):
    gr.Markdown(
        "## Deprecation notice: This leaderboard contains the results computed with the "
        "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
        "which is no longer maintained. Results should be computed using the "
        "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
        "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md)."
    )
    gr.Markdown("## Missing results in the new leaderboard are being added as they are re-computed.")
    gr.Markdown("# [Deprecated] ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍")
    gr.Markdown("### A harder dataset benchmark for visual document retrieval 👀")
    gr.Markdown(
        """
Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
"""
    )
    # Deprecated tables use 3 leading meta columns (no Model Size).
    deprecated_datasets_columns_2 = list(deprecated_data_benchmark_2.columns[3:])
    with gr.Row():
        deprecated_metric_dropdown_2 = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
        deprecated_research_textbox_2 = gr.Textbox(
            placeholder="🔍 Search Models... [press enter]",
            label="Filter Models by Name",
        )
        deprecated_column_checkboxes_2 = gr.CheckboxGroup(
            choices=deprecated_datasets_columns_2, value=deprecated_datasets_columns_2, label="Select Columns to Display"
        )
    with gr.Row():
        deprecated_datatype_2 = ["number", "markdown"] + ["number"] * (deprecated_num_datasets_2 + 1)
        deprecated_dataframe_2 = gr.Dataframe(deprecated_data_benchmark_2, datatype=deprecated_datatype_2, type="pandas")
def deprecated_update_data_2(metric, search_term, selected_columns):
    """Refresh the deprecated ViDoRe V2 table (vidore-benchmark results)."""
    deprecated_model_handler.get_vidore_data(metric)
    table = deprecated_model_handler.render_df(metric, benchmark_version=2)
    table = add_rank_and_format(table, benchmark_version=2, selected_columns=selected_columns)
    table = filter_models(table, search_term)
    if selected_columns:
        # Deprecated tables expose only Rank/Model/Average as meta columns.
        table = table[["Rank", "Model", "Average Score"] + selected_columns]
    return table
with gr.Row():
    deprecated_refresh_button_2 = gr.Button("Refresh")
    # Manual refresh of the deprecated V2 table.
    deprecated_refresh_button_2.click(
        deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
        inputs=[deprecated_metric_dropdown_2],
        outputs=deprecated_dataframe_2,
        concurrency_limit=20,
    )
with gr.Row():
    gr.Markdown(
        """
**Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
Those numbers are not numbers obtained from the organisations that released those models.
"""
    )
# Automatically refresh the dataframe when the dropdown value changes
deprecated_metric_dropdown_2.change(
    deprecated_get_refresh_function(deprecated_model_handler, benchmark_version=2),
    inputs=[deprecated_metric_dropdown_2],
    outputs=deprecated_dataframe_2,
)
# Name search / column selection both route through deprecated_update_data_2.
deprecated_research_textbox_2.submit(
    lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
    inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
    outputs=deprecated_dataframe_2,
)
deprecated_column_checkboxes_2.change(
    lambda metric, search_term, selected_columns: deprecated_update_data_2(metric, search_term, selected_columns),
    inputs=[deprecated_metric_dropdown_2, deprecated_research_textbox_2, deprecated_column_checkboxes_2],
    outputs=deprecated_dataframe_2,
)
# Tab footer: counts plus the citation block.
gr.Markdown(
    f"""
- **Total Datasets**: {deprecated_num_datasets_2}
- **Total Scores**: {deprecated_num_scores_2}
- **Total Models**: {deprecated_num_models_2}
"""
    + r"""
Please consider citing:
```bibtex
@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
@misc{macé2025vidorebenchmarkv2raising,
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
author={Quentin Macé and António Loison and Manuel Faysse},
year={2025},
eprint={2505.17166},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.17166},
}
```
"""
)
def select_tab_from_url(request: gr.Request):
    """Select the tab named by the ?tab=... query parameter.

    Falls back to the first tab ("vidore-v3") when no parameter is given.
    """
    params = dict(request.query_params)
    target_tab = params.get("tab", "vidore-v3")
    return gr.update(selected=target_tab)
# Deep-linking: /?tab=<tab-id> selects the matching tab on page load.
block.load(select_tab_from_url, inputs=None, outputs=tabs)
block.queue(max_size=10).launch(debug=True)


if __name__ == "__main__":
    main()