"""Gradio app for the Wiki Live Challenge (WLC) leaderboard.

Flat script layout: static leaderboard data, a pre-ranked DataFrame built at
import time, display constants (title/about/citation/CSS), two filter helpers,
and the Blocks UI, which is launched at module level.
"""

import gradio as gr
import pandas as pd

# ============== DATA ==============
LEADERBOARD_DATA = [
    # Open-Source Agent Framework
    {"model": "Deep Researcher", "category": "Open-Source", "overall": 2.28, "well_written": 1.90, "neutral": 1.90, "broad": 3.75, "cov_wiki": 5.62, "ref_acc": None},
    {"model": "Tongyi Deep Research", "category": "Open-Source", "overall": 15.05, "well_written": 10.90, "neutral": 11.60, "broad": 30.25, "cov_wiki": 22.73, "ref_acc": None},
    {"model": "Langchain (GPT-4.1)", "category": "Open-Source", "overall": 20.67, "well_written": 18.76, "neutral": 19.40, "broad": 27.25, "cov_wiki": 7.08, "ref_acc": 7.34},
    {"model": "Langchain (GPT-5)", "category": "Open-Source", "overall": 53.62, "well_written": 50.95, "neutral": 54.20, "broad": 59.88, "cov_wiki": 20.96, "ref_acc": 67.60},
    # Proprietary Agent Framework
    {"model": "Doubao Deep Research", "category": "Proprietary", "overall": 19.13, "well_written": 16.00, "neutral": 16.10, "broad": 31.13, "cov_wiki": 22.97, "ref_acc": 37.05},
    {"model": "Qwen-3-max Deep Research", "category": "Proprietary", "overall": 25.15, "well_written": 18.29, "neutral": 27.70, "broad": 40.00, "cov_wiki": 22.22, "ref_acc": 61.44},
    {"model": "Perplexity Deep Research", "category": "Proprietary", "overall": 27.38, "well_written": 26.79, "neutral": 20.40, "broad": 37.63, "cov_wiki": 29.21, "ref_acc": 28.27},
    {"model": "Grok Deep Search", "category": "Proprietary", "overall": 28.38, "well_written": 27.52, "neutral": 24.30, "broad": 35.75, "cov_wiki": 20.73, "ref_acc": 60.63},
    {"model": "OpenAI o3 Deep Research", "category": "Proprietary", "overall": 31.08, "well_written": 28.43, "neutral": 24.90, "broad": 45.75, "cov_wiki": 25.12, "ref_acc": 57.44},
    {"model": "Gemini-2.5-pro Deep Research", "category": "Proprietary", "overall": 35.18, "well_written": 30.10, "neutral": 26.10, "broad": 59.88, "cov_wiki": 30.76, "ref_acc": 41.68},
    {"model": "Gemini-3-pro Deep Research", "category": "Proprietary", "overall": 58.33, "well_written": 60.81, "neutral": 46.10, "broad": 67.12, "cov_wiki": 28.83, "ref_acc": 66.98},
]

# Build the ranked DataFrame once at import time; the filter helpers below
# always work on copies of this module-level frame.
df = pd.DataFrame(LEADERBOARD_DATA)
df = df.sort_values("overall", ascending=False).reset_index(drop=True)
df.insert(0, "rank", range(1, len(df) + 1))

# Medal emoji for the podium; every other rank renders as its number.
_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}


def format_rank(rank):
    """Return a medal emoji for ranks 1-3, otherwise the rank as a string."""
    return _MEDALS.get(rank, str(rank))


df["rank"] = df["rank"].apply(format_rank)

# ============== CONSTANTS ==============
TITLE = """

Wiki Live Challenge Leaderboard

"""

SUBTITLE = """

A Live Benchmark for Evaluating Deep Research Agents on Wikipedia-Quality Article Generation

Code | Website | Dataset | Total Models: {num_models} | Last Update: February 2026

""".format(num_models=len(LEADERBOARD_DATA))

ABOUT_TEXT = """
## 📖 About Wiki Live Challenge

**Wiki Live Challenge (WLC)** is the first **live benchmark** for evaluating Deep Research Agents (DRAs) on their ability to generate Wikipedia-quality articles.

### 🔄 Live Benchmark Design
- **Six-Month Rolling Window**: Continuously collects newly promoted Wikipedia Good Articles
- **Periodic Updates**: New benchmark versions released as `__`
- **Current Version**: `2025_Mar_Nov` with 100 high-quality articles

### 📐 Evaluation Dimensions

#### Wiki Writing (Criteria-based Quality Evaluation)
Compares article quality against Wikipedia ground truth using **39 criteria** from Wikipedia's Manual of Style:

| Category | Criteria | Description |
|----------|----------|-------------|
| **Well-written** | 21 | Encyclopedic style, lead section quality, words to watch |
| **Broad in coverage** | 8 | Topic coverage, focus, structure completeness |
| **Neutral** | 10 | Fair viewpoints, avoid opinions as facts |

**Metric**: Win rate (percentage of criteria where generated article wins)

#### Wiki Fact (Factual Accuracy Evaluation)
- **Cov. Wiki (Coverage)**: Factual coverage against extracted Wikipedia fact list
- **Ref. Acc. (Reference Accuracy)**: Proportion of cited statements supported by their referenced webpages

### 📊 Column Descriptions

| Column | Description |
|--------|-------------|
| **Rank** | Model ranking based on Wiki Writing Overall score |
| **Model** | Name of the Deep Research Agent |
| **Category** | Open-Source or Proprietary framework |
| **Overall** | Overall Wiki Writing win rate |
| **Well-writ.** | Well-written criteria win rate |
| **Neutral** | Neutrality criteria win rate |
| **Broad** | Broad coverage criteria win rate |
| **Cov. Wiki** | Factual coverage against Wikipedia |
| **Ref. Acc.** | Reference accuracy (-- if citation extraction not possible) |

### 🔗 Resources
- 🌐 **Website**: [Wiki Live Challenge](http://agentresearchlab.org/benchmarks/wiki-live-challenge/index.html#home)
- 💻 **Code**: [github.com/WangShao2000/Wiki_Live_Challenge](https://github.com/WangShao2000/Wiki_Live_Challenge)
- 📊 **Dataset**: [huggingface.co/datasets/muset-ai/Wiki_Live_Challenge](https://huggingface.co/datasets/muset-ai/Wiki_Live_Challenge)
"""

CITATION_TEXT = """@misc{wang2026wikilivechallengechallenging,
      title={Wiki Live Challenge: Challenging Deep Research Agents with Expert-Level Wikipedia Articles},
      author={Shaohan Wang and Benfeng Xu and Licheng Zhang and Mingxuan Du and Chiwei Zhu and Xiaorui Wang and Zhendong Mao and Yongdong Zhang},
      year={2026},
      eprint={2602.01590},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2602.01590},
}"""

CUSTOM_CSS = """
.markdown-text {
    font-size: 16px !important;
}
#leaderboard-table {
    margin-top: 15px;
    overflow-x: auto !important;
}
/* Gradio 5.x Dataframe styling */
#leaderboard-table table {
    table-layout: auto !important;
    min-width: 100% !important;
}
#leaderboard-table th,
#leaderboard-table td,
#leaderboard-table .cell-wrap,
#leaderboard-table span,
#leaderboard-table div {
    white-space: nowrap !important;
    overflow: visible !important;
    text-overflow: clip !important;
}
#leaderboard-table th {
    min-width: 80px !important;
}
.tab-buttons button {
    font-size: 18px;
}
#citation-button textarea {
    font-size: 14px !important;
    font-family: monospace;
}
h1 {
    font-weight: bold;
}
.gradio-container {
    max-width: 1600px !important;
}
"""


# ============== FILTER FUNCTIONS ==============
def filter_leaderboard(search_query, categories):
    """Return a filtered copy of the leaderboard DataFrame.

    Args:
        search_query: Case-insensitive substring matched against model names.
            Matched literally (``regex=False``) — previously the pandas default
            ``regex=True`` made input like ``(`` raise ``re.error`` and break
            the UI callbacks.
        categories: Category names to keep; an empty/falsy selection keeps all
            rows (original behavior, preserved).

    Returns:
        A new DataFrame with a fresh 0-based index.  The ``rank`` column keeps
        the ranks computed over the full leaderboard (rows are not re-ranked).
    """
    filtered_df = df.copy()
    if search_query:
        # regex=False: treat the query as a literal substring so regex
        # metacharacters typed by the user cannot crash the app.
        mask = filtered_df["model"].str.lower().str.contains(search_query.lower(), regex=False)
        filtered_df = filtered_df[mask]
    if categories:
        filtered_df = filtered_df[filtered_df["category"].isin(categories)]
    # Reset the index after filtering (ranks themselves are left untouched).
    return filtered_df.reset_index(drop=True)


def get_display_df(search_query, categories):
    """Filter the leaderboard and format it for on-screen display.

    Renames internal column names to their display headers and formats every
    score column to two decimals; a missing reference accuracy (``None``)
    renders as ``--``.
    """
    display_df = filter_leaderboard(search_query, categories).rename(columns={
        "rank": "Rank",
        "model": "Model",
        "category": "Category",
        "overall": "Overall",
        "well_written": "Well-writ.",
        "neutral": "Neutral",
        "broad": "Broad",
        "cov_wiki": "Cov. Wiki",
        "ref_acc": "Ref. Acc.",
    })
    # None/NaN means citation extraction was not possible for this model.
    display_df["Ref. Acc."] = display_df["Ref. Acc."].apply(
        lambda x: "--" if pd.isna(x) else f"{x:.2f}"
    )
    for col in ["Overall", "Well-writ.", "Neutral", "Broad", "Cov. Wiki"]:
        display_df[col] = display_df[col].apply(lambda x: f"{x:.2f}")
    return display_df


# ============== GRADIO APP ==============
demo = gr.Blocks(css=CUSTOM_CSS, title="Wiki Live Challenge Leaderboard")

with demo:
    gr.HTML(TITLE)
    gr.HTML(SUBTITLE)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard", elem_id="leaderboard-tab", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    search_box = gr.Textbox(
                        label="Model Search",
                        placeholder="Enter model name to search...",
                        show_label=True,
                    )
                with gr.Column(scale=1):
                    category_filter = gr.CheckboxGroup(
                        choices=["Open-Source", "Proprietary"],
                        value=["Open-Source", "Proprietary"],
                        label="Model Categories",
                        interactive=True,
                    )

            # Initial display: no search query, both categories selected.
            initial_df = get_display_df("", ["Open-Source", "Proprietary"])
            leaderboard_table = gr.Dataframe(
                value=initial_df,
                headers=["Rank", "Model", "Category", "Overall", "Well-writ.",
                         "Neutral", "Broad", "Cov. Wiki", "Ref. Acc."],
                datatype=["str"] * 9,
                elem_id="leaderboard-table",
                interactive=False,
                column_widths=["60px", "250px", "100px", "80px", "80px",
                               "80px", "80px", "80px", "80px"],
            )

            # Re-render the table whenever either filter input changes.
            search_box.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table,
            )
            category_filter.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table,
            )

            # Column descriptions
            gr.Markdown("""
### 📊 Column Descriptions
- **Rank**: Model ranking based on Overall score
- **Model**: Deep Research Agent name
- **Overall/Well-writ./Neutral/Broad**: Wiki Writing win rates (%)
- **Cov. Wiki**: Factual coverage against Wikipedia (%)
- **Ref. Acc.**: Reference accuracy (-- indicates citation extraction not possible)
""")

        # About Tab
        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    # Citation
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                label="Copy the following snippet to cite these results",
                lines=8,
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch()