Judge Model,full_name,Realization,Ranking Agreement,Decisiveness,Bias,ci_low,ci_high
URM-LLaMa-3.1-8B,URM-LLaMa-3.1-8B_BT,Reward,0.818819404,1.836856612,0.084857794,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#bad-good_textual-score_BT,Likert,0.817304917,4.755366194,0.079246328,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#0-100_verbalized-score_BT,Numeric,0.814143776,4.087812685,0.079252047,,
mistral-large-instruct-2407,mistral-large-instruct-2407#bad-good_textual-score_BT,Likert,0.810637054,5.471086171,0.085717614,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#comparative-anchor-gpt-4-0314_BT,Anchor,0.809468147,3.073700094,0.084683633,,
mistral-large-instruct-2407,mistral-large-instruct-2407#0-100_verbalized-score_BT,Numeric,0.809468147,3.010016115,0.082093515,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#0-100_verbalized-score_BT,Numeric,0.804792519,4.330580225,0.087135759,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#0-100_verbalized-score_BT,Numeric,0.803623612,2.911340337,0.076904561,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#bad-good_textual-score_BT,Likert,0.797779077,4.610807214,0.087159333,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#0-100_verbalized-score_BT,Numeric,0.797779077,2.693966881,0.086833957,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.794272355,2.929554168,0.089619978,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#bad-good_textual-score_BT,Likert,0.787258913,5.218423477,0.097263668,,
Skywork-Reward-Llama-3.1-8B-v0.2,Skywork-Reward-Llama-3.1-8B-v0.2_BT,Reward,0.777907656,2.461196439,0.099684483,,
Qwen2.5-72B-Instruct,Qwen2.5-72B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.776738749,2.689252148,0.081655614,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.775569842,2.123702381,0.088781499,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#0-100_verbalized-score_BT,Numeric,0.774400935,2.147368211,0.07704892,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#bad-good_textual-score_BT,Likert,0.772873462,5.485635896,0.089067918,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.765049679,1.259994089,0.069748002,,
Llama-3-OffsetBias-RM-8B,Llama-3-OffsetBias-RM-8B_BT,Reward,0.765049679,1.386859931,0.075669848,,
ArmoRM-Llama3-8B-v0.1,ArmoRM-Llama3-8B-v0.1_BT,Reward,0.762711864,1.839870032,0.092372835,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#good-yes-no_logprob-score_BT,TokenProbs,0.752191701,2.102594937,0.08401741,,
llama-3-70b-instruct,llama-3-70b-instruct#0-100_verbalized-score_BT,Numeric,0.74868498,1.273829005,0.08433286,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#good-yes-no_logprob-score_BT,TokenProbs,0.74050263,0.598380841,0.0612823,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.738164816,2.534301905,0.107585602,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#comparative-anchor-gpt-4-0314_BT,Anchor,0.729982466,3.577096075,0.112359855,,
mistral-large-instruct-2407,mistral-large-instruct-2407#comparative-anchor-gpt-4-0314_BT,Anchor,0.725306838,2.129762371,0.111101469,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#bad-good_textual-score_BT,Likert,0.722969024,0.934857221,0.090203472,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#bad-good_textual-score_BT,Likert,0.721800117,3.901943148,0.120093327,,
internlm2-20b-reward,internlm2-20b-reward_BT,Reward,0.717124489,1.900369161,0.098389178,,
internlm2-7b-reward,internlm2-7b-reward_BT,Reward,0.71244886,2.3536645,0.113364304,,
GRM-Llama3.2-3B-rewardmodel-ft,GRM-Llama3.2-3B-rewardmodel-ft_BT,Reward,0.711279953,2.302320479,0.113801314,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.701928697,1.84957128,0.088445538,,
gpt-4o-2024-11-20,gpt-4o-2024-11-20#good-yes-no_logprob-score_BT,TokenProbs,0.700380036,2.224158523,0.093196512,,
llama-3-70b-instruct,llama-3-70b-instruct#bad-good_textual-score_BT,Likert,0.698421975,2.400124125,0.122002475,,
llama-3-1-70b-instruct,llama-3-1-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.687901812,2.711477731,0.12621163,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.677381648,0.868408018,0.085109852,,
llama-3-1-405b-instruct-fp8,llama-3-1-405b-instruct-fp8#good-yes-no_logprob-score_BT,TokenProbs,0.671537113,1.549707158,0.09227088,,
Llama-3.1-8B-Instruct,Llama-3.1-8B-Instruct#0-100_verbalized-score_BT,Numeric,0.668030392,1.204521517,0.104312797,,
llama-3-70b-instruct,llama-3-70b-instruct#good-yes-no_logprob-score_BT,TokenProbs,0.663354763,0.774852442,0.07120683,,
gpt-4o-mini-2024-07-18,gpt-4o-mini-2024-07-18#comparative-anchor-gpt-4-0314_BT,Anchor,0.658679135,1.412320144,0.110822533,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#0-100_verbalized-score_BT,Numeric,0.656341321,1.270349906,0.102331076,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.655172414,1.167971623,0.101686017,,
mixtral-8x22B-instruct-v0.1,mixtral-8x22B-instruct-v0.1#comparative-anchor-gpt-4-0314_BT,Anchor,0.641145529,1.497179025,0.13983279,,
llama-3-70b-instruct,llama-3-70b-instruct#comparative-anchor-gpt-4-0314_BT,Anchor,0.632963179,1.820241227,0.131968221,,
Eurus-RM-7b,Eurus-RM-7b_BT,Reward,0.628287551,2.492726583,0.138112675,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#bad-good_textual-score_BT,Likert,0.589713618,0.838122343,0.110173865,,
mixtral-8x7B-instruct-v0.1,mixtral-8x7B-instruct-v0.1#good-yes-no_logprob-score_BT,TokenProbs,0.427235535,0.738981983,0.107075052,,
mistral-large-instruct-2407,mistral-large-instruct-2407#good-yes-no_logprob-score_BT,TokenProbs,0.368790181,1.165216882,0.122587285,,