import gradio as gr
import pandas as pd
# ============== DATA ==============
LEADERBOARD_DATA = [
    # Open-Source Agent Framework
    {"model": "Deep Researcher", "category": "Open-Source", "overall": 2.28, "well_written": 1.90, "neutral": 1.90, "broad": 3.75, "cov_wiki": 5.62, "ref_acc": None},
    {"model": "Tongyi Deep Research", "category": "Open-Source", "overall": 15.05, "well_written": 10.90, "neutral": 11.60, "broad": 30.25, "cov_wiki": 22.73, "ref_acc": None},
    {"model": "Langchain (GPT-4.1)", "category": "Open-Source", "overall": 20.67, "well_written": 18.76, "neutral": 19.40, "broad": 27.25, "cov_wiki": 7.08, "ref_acc": 7.34},
    {"model": "Langchain (GPT-5)", "category": "Open-Source", "overall": 53.62, "well_written": 50.95, "neutral": 54.20, "broad": 59.88, "cov_wiki": 20.96, "ref_acc": 67.60},
    # Proprietary Agent Framework
    {"model": "Doubao Deep Research", "category": "Proprietary", "overall": 19.13, "well_written": 16.00, "neutral": 16.10, "broad": 31.13, "cov_wiki": 22.97, "ref_acc": 37.05},
    {"model": "Qwen-3-max Deep Research", "category": "Proprietary", "overall": 25.15, "well_written": 18.29, "neutral": 27.70, "broad": 40.00, "cov_wiki": 22.22, "ref_acc": 61.44},
    {"model": "Perplexity Deep Research", "category": "Proprietary", "overall": 27.38, "well_written": 26.79, "neutral": 20.40, "broad": 37.63, "cov_wiki": 29.21, "ref_acc": 28.27},
    {"model": "Grok Deep Search", "category": "Proprietary", "overall": 28.38, "well_written": 27.52, "neutral": 24.30, "broad": 35.75, "cov_wiki": 20.73, "ref_acc": 60.63},
    {"model": "OpenAI o3 Deep Research", "category": "Proprietary", "overall": 31.08, "well_written": 28.43, "neutral": 24.90, "broad": 45.75, "cov_wiki": 25.12, "ref_acc": 57.44},
    {"model": "Gemini-2.5-pro Deep Research", "category": "Proprietary", "overall": 35.18, "well_written": 30.10, "neutral": 26.10, "broad": 59.88, "cov_wiki": 30.76, "ref_acc": 41.68},
    {"model": "Gemini-3-pro Deep Research", "category": "Proprietary", "overall": 58.33, "well_written": 60.81, "neutral": 46.10, "broad": 67.12, "cov_wiki": 28.83, "ref_acc": 66.98},
]

# Medal icons for the podium positions; other ranks render as plain numbers.
_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}


def format_rank(rank):
    """Return a medal emoji for ranks 1-3, otherwise the rank as a string."""
    return _MEDALS.get(rank, str(rank))


# Build the leaderboard table: best Overall score first, 1-based rank column
# prepended, then the top-3 ranks decorated with medal emojis.
df = pd.DataFrame(LEADERBOARD_DATA).sort_values("overall", ascending=False).reset_index(drop=True)
df.insert(0, "rank", range(1, len(df) + 1))
df["rank"] = df["rank"].apply(format_rank)
# ============== CONSTANTS ==============
# Page header. Rendered via gr.HTML, so real markup is required — bare text
# here would lose the <h1> styling that CUSTOM_CSS targets.
TITLE = """
<h1 align="center">Wiki Live Challenge Leaderboard</h1>
"""
# Sub-header: tagline, resource links (URLs mirror the About tab), and stats.
# num_models is derived from the data so the count never drifts out of sync.
SUBTITLE = """
<p align="center">A Live Benchmark for Evaluating Deep Research Agents on Wikipedia-Quality Article Generation</p>
<p align="center">
<a href="https://github.com/WangShao2000/Wiki_Live_Challenge" target="_blank">Code</a> |
<a href="http://agentresearchlab.org/benchmarks/wiki-live-challenge/index.html#home" target="_blank">Website</a> |
<a href="https://huggingface.co/datasets/muset-ai/Wiki_Live_Challenge" target="_blank">Dataset</a> |
Total Models: {num_models} | Last Update: February 2026
</p>
""".format(num_models=len(LEADERBOARD_DATA))
# Markdown body for the "About" tab. Kept as a single literal so it can be
# passed straight to gr.Markdown.
ABOUT_TEXT = """
## 📖 About Wiki Live Challenge
**Wiki Live Challenge (WLC)** is the first **live benchmark** for evaluating Deep Research Agents (DRAs) on their ability to generate Wikipedia-quality articles.
### 🔄 Live Benchmark Design
- **Six-Month Rolling Window**: Continuously collects newly promoted Wikipedia Good Articles
- **Periodic Updates**: New benchmark versions released as `<year>_<start month>_<end month>`
- **Current Version**: `2025_Mar_Nov` with 100 high-quality articles
### 📐 Evaluation Dimensions
#### Wiki Writing (Criteria-based Quality Evaluation)
Compares article quality against Wikipedia ground truth using **39 criteria** from Wikipedia's Manual of Style:
| Category | Criteria | Description |
|----------|----------|-------------|
| **Well-written** | 21 | Encyclopedic style, lead section quality, words to watch |
| **Broad in coverage** | 8 | Topic coverage, focus, structure completeness |
| **Neutral** | 10 | Fair viewpoints, avoid opinions as facts |
**Metric**: Win rate (percentage of criteria where generated article wins)
#### Wiki Fact (Factual Accuracy Evaluation)
- **Cov. Wiki (Coverage)**: Factual coverage against extracted Wikipedia fact list
- **Ref. Acc. (Reference Accuracy)**: Proportion of cited statements supported by their referenced webpages
### 📊 Column Descriptions
| Column | Description |
|--------|-------------|
| **Rank** | Model ranking based on Wiki Writing Overall score |
| **Model** | Name of the Deep Research Agent |
| **Category** | Open-Source or Proprietary framework |
| **Overall** | Overall Wiki Writing win rate |
| **Well-writ.** | Well-written criteria win rate |
| **Neutral** | Neutrality criteria win rate |
| **Broad** | Broad coverage criteria win rate |
| **Cov. Wiki** | Factual coverage against Wikipedia |
| **Ref. Acc.** | Reference accuracy (-- if citation extraction not possible) |
### 🔗 Resources
- 🌐 **Website**: [Wiki Live Challenge](http://agentresearchlab.org/benchmarks/wiki-live-challenge/index.html#home)
- 💻 **Code**: [github.com/WangShao2000/Wiki_Live_Challenge](https://github.com/WangShao2000/Wiki_Live_Challenge)
- 📊 **Dataset**: [huggingface.co/datasets/muset-ai/Wiki_Live_Challenge](https://huggingface.co/datasets/muset-ai/Wiki_Live_Challenge)
"""
CITATION_TEXT = """@misc{wang2026wikilivechallengechallenging,
title={Wiki Live Challenge: Challenging Deep Research Agents with Expert-Level Wikipedia Articles},
author={Shaohan Wang and Benfeng Xu and Licheng Zhang and Mingxuan Du and Chiwei Zhu and Xiaorui Wang and Zhendong Mao and Yongdong Zhang},
year={2026},
eprint={2602.01590},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2602.01590},
}"""
CUSTOM_CSS = """
.markdown-text {
font-size: 16px !important;
}
#leaderboard-table {
margin-top: 15px;
overflow-x: auto !important;
}
/* Gradio 5.x Dataframe styling */
#leaderboard-table table {
table-layout: auto !important;
min-width: 100% !important;
}
#leaderboard-table th,
#leaderboard-table td,
#leaderboard-table .cell-wrap,
#leaderboard-table span,
#leaderboard-table div {
white-space: nowrap !important;
overflow: visible !important;
text-overflow: clip !important;
}
#leaderboard-table th {
min-width: 80px !important;
}
.tab-buttons button {
font-size: 18px;
}
#citation-button textarea {
font-size: 14px !important;
font-family: monospace;
}
h1 {
font-weight: bold;
}
.gradio-container {
max-width: 1600px !important;
}
"""
# ============== FILTER FUNCTIONS ==============
def filter_leaderboard(search_query, categories, data=None):
    """Return leaderboard rows matching a search string and category set.

    Args:
        search_query: Substring matched case-insensitively against the model
            name. Treated as a literal (``regex=False``), so queries containing
            regex metacharacters such as "(" cannot crash the filter.
        categories: Iterable of category names to keep. A falsy value (e.g.
            all checkboxes cleared) applies no category filter.
        data: Optional DataFrame to filter; defaults to the module-level
            leaderboard ``df``.

    Returns:
        A filtered copy with a fresh 0-based index.
    """
    filtered_df = (df if data is None else data).copy()
    # Filter by search query
    if search_query:
        # regex=False is the bug fix: str.contains defaults to regex=True, so
        # an unbalanced "(" in the query used to raise re.error in the UI.
        mask = filtered_df["model"].str.contains(search_query, case=False, regex=False)
        filtered_df = filtered_df[mask]
    # Filter by category
    if categories:
        filtered_df = filtered_df[filtered_df["category"].isin(categories)]
    # Re-rank after filtering
    return filtered_df.reset_index(drop=True)
def get_display_df(search_query, categories):
    """Filter the leaderboard and return a display-ready DataFrame.

    Internal column names are mapped to human-readable headers and every
    score column is rendered as a fixed 2-decimal string; a missing
    Ref. Acc. value (None) is shown as "--".
    """
    column_labels = {
        "rank": "Rank",
        "model": "Model",
        "category": "Category",
        "overall": "Overall",
        "well_written": "Well-writ.",
        "neutral": "Neutral",
        "broad": "Broad",
        "cov_wiki": "Cov. Wiki",
        "ref_acc": "Ref. Acc.",
    }
    display_df = filter_leaderboard(search_query, categories).rename(columns=column_labels)
    # Ref. Acc. can be NaN when citation extraction was not possible.
    display_df["Ref. Acc."] = [
        "--" if pd.isna(value) else f"{value:.2f}" for value in display_df["Ref. Acc."]
    ]
    # Remaining score columns are always numeric.
    for column in ("Overall", "Well-writ.", "Neutral", "Broad", "Cov. Wiki"):
        display_df[column] = [f"{value:.2f}" for value in display_df[column]]
    return display_df
# ============== GRADIO APP ==============
# Assemble the Blocks UI: a leaderboard tab (search box + category filter
# driving a Dataframe) and an About tab, plus a collapsible citation box.
demo = gr.Blocks(css=CUSTOM_CSS, title="Wiki Live Challenge Leaderboard")
with demo:
    gr.HTML(TITLE)
    gr.HTML(SUBTITLE)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard", elem_id="leaderboard-tab", id=0):
            with gr.Row():
                # Left column: free-text model-name search.
                with gr.Column(scale=1):
                    search_box = gr.Textbox(
                        label="Model Search",
                        placeholder="Enter model name to search...",
                        show_label=True
                    )
                # Right column: category checkboxes (both checked by default).
                with gr.Column(scale=1):
                    category_filter = gr.CheckboxGroup(
                        choices=["Open-Source", "Proprietary"],
                        value=["Open-Source", "Proprietary"],
                        label="Model Categories",
                        interactive=True
                    )
            # Initial display
            # Seed the table with the unfiltered leaderboard; all cells are
            # pre-formatted strings, hence datatype "str" for every column.
            initial_df = get_display_df("", ["Open-Source", "Proprietary"])
            leaderboard_table = gr.Dataframe(
                value=initial_df,
                headers=["Rank", "Model", "Category", "Overall", "Well-writ.", "Neutral", "Broad", "Cov. Wiki", "Ref. Acc."],
                datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str"],
                elem_id="leaderboard-table",
                interactive=False,
                column_widths=["60px", "250px", "100px", "80px", "80px", "80px", "80px", "80px", "80px"],
            )
            # Update table on filter change
            # Both inputs re-run get_display_df with the current search text
            # and selected categories, replacing the table contents.
            search_box.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table
            )
            category_filter.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table
            )
            # Column descriptions
            gr.Markdown("""
            ### 📊 Column Descriptions
            - **Rank**: Model ranking based on Overall score
            - **Model**: Deep Research Agent name
            - **Overall/Well-writ./Neutral/Broad**: Wiki Writing win rates (%)
            - **Cov. Wiki**: Factual coverage against Wikipedia (%)
            - **Ref. Acc.**: Reference accuracy (-- indicates citation extraction not possible)
            """)
        # About Tab
        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
    # Citation
    # Read-only textbox with a copy button so the BibTeX can be grabbed easily.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                label="Copy the following snippet to cite these results",
                lines=8,
                elem_id="citation-button",
                show_copy_button=True,
            )
# NOTE(review): launch() runs on import; consider guarding with
# `if __name__ == "__main__":` so the module can be imported without serving.
demo.launch()