{"Judge": "Rule-based", "Overall": 83.8, "Recall": 55.9, "F1": 67.1, "AB": 25.0, "VWA": 85.2, "WA": 79.0, "Work": 100.0, "Work++": 83.3, "Author": "Benchmark authors", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "World-State-Model-7B", "Overall": 71.2, "Recall": 72.2, "F1": 71.7, "AB": 53.8, "VWA": 64.4, "WA": 70.1, "Work": 93.3, "Work++": 86.4, "Author": "Sun et al.", "Project URL": "https://arxiv.org/abs/2508.04700", "Logs URL": "https://huggingface.co/datasets/Zery/WSM-7B-AgentRewardBench/tree/main"} {"Judge": "AER-C (GPT-4o)", "Overall": 67.7, "Recall": 71.9, "F1": 69.7, "AB": 83.3, "VWA": 56.0, "WA": 68.8, "Work": 100.0, "Work++": 66.7, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "AER-V (GPT-4o)", "Overall": 67.6, "Recall": 71.5, "F1": 69.5, "AB": 83.3, "VWA": 61.2, "WA": 67.6, "Work": 96.4, "Work++": 59.3, "Author": "Pan et al.", "Project URL": "https://arxiv.org/abs/2404.06474", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "NNetNav (Llama-3.3 70B)", "Overall": 52.5, "Recall": 82.4, "F1": 64.1, "AB": 20.8, "VWA": 54.5, "WA": 54.3, "Work": 77.3, "Work++": 43.2, "Author": "Murty et al.", "Project URL": "https://arxiv.org/abs/2410.02907", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "Claude 3.7 S. (A)", "Overall": 68.8, "Recall": 81.6, "F1": 74.7, "AB": 87.5, "VWA": 61.0, "WA": 69.3, "Work": 85.0, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "GPT-4o (A)", "Overall": 69.8, "Recall": 83.1, "F1": 75.9, "AB": 77.8, "VWA": 63.0, "WA": 70.2, "Work": 94.6, "Work++": 63.0, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "GPT-4o Mini (A)", "Overall": 61.5, "Recall": 86.1, "F1": 71.7, "AB": 80.0, "VWA": 57.9, "WA": 63.5, "Work": 84.2, "Work++": 49.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "Llama 3.3 (A)", "Overall": 67.7, "Recall": 79.0, "F1": 72.9, "AB": 75.0, "VWA": 59.6, "WA": 68.2, "Work": 94.3, "Work++": 62.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "Qwen2.5-VL (A)", "Overall": 64.3, "Recall": 89.8, "F1": 75.0, "AB": 72.7, "VWA": 59.3, "WA": 63.6, "Work": 87.2, "Work++": 60.3, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "Claude 3.7 S. (S)", "Overall": 69.4, "Recall": 76.3, "F1": 72.7, "AB": 71.4, "VWA": 64.8, "WA": 69.3, "Work": 85.3, "Work++": 66.7, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "GPT-4o (S)", "Overall": 68.1, "Recall": 80.3, "F1": 73.7, "AB": 77.8, "VWA": 60.7, "WA": 69.9, "Work": 93.8, "Work++": 59.6, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "GPT-4o Mini (S)", "Overall": 64.5, "Recall": 78.3, "F1": 70.8, "AB": 80.0, "VWA": 57.4, "WA": 66.9, "Work": 90.3, "Work++": 54.8, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "Qwen2.5-VL (S)", "Overall": 64.5, "Recall": 86.1, "F1": 73.7, "AB": 70.0, "VWA": 58.5, "WA": 62.9, "Work": 93.8, "Work++": 64.4, "Author": "L\u00f9 et al.", "Project URL": "https://arxiv.org/abs/2504.08942", "Logs URL": "https://huggingface.co/datasets/McGill-NLP/agent-reward-bench/tree/main/judgments"} {"Judge": "WebJudge-7B", "Overall": 75.7, "Recall": 58.0, "F1": 65.6, "AB": 80.0, "VWA": 66.7, "WA": 77.5, "Work": 100.0, "Work++": 70.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/pdf/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"} {"Judge": "WebJudge (o4-mini)", "Overall": 82.0, "Recall": 47.8, "F1": 60.4, "AB": 100.0, "VWA": 74.5, "WA": 81.2, "Work": 100.0, "Work++": 90.0, "Author": "Xue et al.", "Project URL": "https://arxiv.org/abs/2504.01382", "Logs URL": "https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/agent_reward_bench_evaluation_results"}