"""Gradio app for the Wiki Live Challenge (WLC) leaderboard.

Flat script layout: static leaderboard data, a pre-ranked DataFrame built at
import time, display constants (title/about/citation/CSS), two filter helpers,
and the Blocks UI, which is launched at module level.
"""

import gradio as gr
import pandas as pd

# ============== DATA ==============
LEADERBOARD_DATA = [
    # Open-Source Agent Framework
    {"model": "Deep Researcher", "category": "Open-Source", "overall": 2.28, "well_written": 1.90, "neutral": 1.90, "broad": 3.75, "cov_wiki": 5.62, "ref_acc": None},
    {"model": "Tongyi Deep Research", "category": "Open-Source", "overall": 15.05, "well_written": 10.90, "neutral": 11.60, "broad": 30.25, "cov_wiki": 22.73, "ref_acc": None},
    {"model": "Langchain (GPT-4.1)", "category": "Open-Source", "overall": 20.67, "well_written": 18.76, "neutral": 19.40, "broad": 27.25, "cov_wiki": 7.08, "ref_acc": 7.34},
    {"model": "Langchain (GPT-5)", "category": "Open-Source", "overall": 53.62, "well_written": 50.95, "neutral": 54.20, "broad": 59.88, "cov_wiki": 20.96, "ref_acc": 67.60},
    # Proprietary Agent Framework
    {"model": "Doubao Deep Research", "category": "Proprietary", "overall": 19.13, "well_written": 16.00, "neutral": 16.10, "broad": 31.13, "cov_wiki": 22.97, "ref_acc": 37.05},
    {"model": "Qwen-3-max Deep Research", "category": "Proprietary", "overall": 25.15, "well_written": 18.29, "neutral": 27.70, "broad": 40.00, "cov_wiki": 22.22, "ref_acc": 61.44},
    {"model": "Perplexity Deep Research", "category": "Proprietary", "overall": 27.38, "well_written": 26.79, "neutral": 20.40, "broad": 37.63, "cov_wiki": 29.21, "ref_acc": 28.27},
    {"model": "Grok Deep Search", "category": "Proprietary", "overall": 28.38, "well_written": 27.52, "neutral": 24.30, "broad": 35.75, "cov_wiki": 20.73, "ref_acc": 60.63},
    {"model": "OpenAI o3 Deep Research", "category": "Proprietary", "overall": 31.08, "well_written": 28.43, "neutral": 24.90, "broad": 45.75, "cov_wiki": 25.12, "ref_acc": 57.44},
    {"model": "Gemini-2.5-pro Deep Research", "category": "Proprietary", "overall": 35.18, "well_written": 30.10, "neutral": 26.10, "broad": 59.88, "cov_wiki": 30.76, "ref_acc": 41.68},
    {"model": "Gemini-3-pro Deep Research", "category": "Proprietary", "overall": 58.33, "well_written": 60.81, "neutral": 46.10, "broad": 67.12, "cov_wiki": 28.83, "ref_acc": 66.98},
]

# Build the ranked DataFrame once at import time; the filter helpers below
# always work on copies of this module-level frame.
df = pd.DataFrame(LEADERBOARD_DATA)
df = df.sort_values("overall", ascending=False).reset_index(drop=True)
df.insert(0, "rank", range(1, len(df) + 1))

# Medal emoji for the podium; every other rank renders as its number.
_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}


def format_rank(rank):
    """Return a medal emoji for ranks 1-3, otherwise the rank as a string."""
    return _MEDALS.get(rank, str(rank))


df["rank"] = df["rank"].apply(format_rank)

# ============== CONSTANTS ==============
TITLE = """

Wiki Live Challenge Leaderboard

"""

SUBTITLE = """

A Live Benchmark for Evaluating Deep Research Agents on Wikipedia-Quality Article Generation

Code | Website | Dataset | Total Models: {num_models} | Last Update: February 2026

""".format(num_models=len(LEADERBOARD_DATA))

ABOUT_TEXT = """
## 📖 About Wiki Live Challenge

**Wiki Live Challenge (WLC)** is the first **live benchmark** for evaluating Deep Research Agents (DRAs) on their ability to generate Wikipedia-quality articles.

### 🔄 Live Benchmark Design
- **Six-Month Rolling Window**: Continuously collects newly promoted Wikipedia Good Articles
- **Periodic Updates**: New benchmark versions released as `__`
- **Current Version**: `2025_Mar_Nov` with 100 high-quality articles

### 📐 Evaluation Dimensions

#### Wiki Writing (Criteria-based Quality Evaluation)
Compares article quality against Wikipedia ground truth using **39 criteria** from Wikipedia's Manual of Style:

| Category | Criteria | Description |
|----------|----------|-------------|
| **Well-written** | 21 | Encyclopedic style, lead section quality, words to watch |
| **Broad in coverage** | 8 | Topic coverage, focus, structure completeness |
| **Neutral** | 10 | Fair viewpoints, avoid opinions as facts |

**Metric**: Win rate (percentage of criteria where generated article wins)

#### Wiki Fact (Factual Accuracy Evaluation)
- **Cov. Wiki (Coverage)**: Factual coverage against extracted Wikipedia fact list
- **Ref. Acc. (Reference Accuracy)**: Proportion of cited statements supported by their referenced webpages

### 📊 Column Descriptions

| Column | Description |
|--------|-------------|
| **Rank** | Model ranking based on Wiki Writing Overall score |
| **Model** | Name of the Deep Research Agent |
| **Category** | Open-Source or Proprietary framework |
| **Overall** | Overall Wiki Writing win rate |
| **Well-writ.** | Well-written criteria win rate |
| **Neutral** | Neutrality criteria win rate |
| **Broad** | Broad coverage criteria win rate |
| **Cov. Wiki** | Factual coverage against Wikipedia |
| **Ref. Acc.** | Reference accuracy (-- if citation extraction not possible) |

### 🔗 Resources
- 🌐 **Website**: [Wiki Live Challenge](http://agentresearchlab.org/benchmarks/wiki-live-challenge/index.html#home)
- 💻 **Code**: [github.com/WangShao2000/Wiki_Live_Challenge](https://github.com/WangShao2000/Wiki_Live_Challenge)
- 📊 **Dataset**: [huggingface.co/datasets/muset-ai/Wiki_Live_Challenge](https://huggingface.co/datasets/muset-ai/Wiki_Live_Challenge)
"""

CITATION_TEXT = """@misc{wang2026wikilivechallengechallenging,
      title={Wiki Live Challenge: Challenging Deep Research Agents with Expert-Level Wikipedia Articles},
      author={Shaohan Wang and Benfeng Xu and Licheng Zhang and Mingxuan Du and Chiwei Zhu and Xiaorui Wang and Zhendong Mao and Yongdong Zhang},
      year={2026},
      eprint={2602.01590},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2602.01590},
}"""

CUSTOM_CSS = """
.markdown-text {
    font-size: 16px !important;
}
#leaderboard-table {
    margin-top: 15px;
    overflow-x: auto !important;
}
/* Gradio 5.x Dataframe styling */
#leaderboard-table table {
    table-layout: auto !important;
    min-width: 100% !important;
}
#leaderboard-table th,
#leaderboard-table td,
#leaderboard-table .cell-wrap,
#leaderboard-table span,
#leaderboard-table div {
    white-space: nowrap !important;
    overflow: visible !important;
    text-overflow: clip !important;
}
#leaderboard-table th {
    min-width: 80px !important;
}
.tab-buttons button {
    font-size: 18px;
}
#citation-button textarea {
    font-size: 14px !important;
    font-family: monospace;
}
h1 {
    font-weight: bold;
}
.gradio-container {
    max-width: 1600px !important;
}
"""


# ============== FILTER FUNCTIONS ==============
def filter_leaderboard(search_query, categories):
    """Return a filtered copy of the leaderboard DataFrame.

    Args:
        search_query: Case-insensitive substring matched against model names.
            Matched literally (``regex=False``) — previously the pandas default
            ``regex=True`` made input like ``(`` raise ``re.error`` and break
            the UI callbacks.
        categories: Category names to keep; an empty/falsy selection keeps all
            rows (original behavior, preserved).

    Returns:
        A new DataFrame with a fresh 0-based index.  The ``rank`` column keeps
        the ranks computed over the full leaderboard (rows are not re-ranked).
    """
    filtered_df = df.copy()
    if search_query:
        # regex=False: treat the query as a literal substring so regex
        # metacharacters typed by the user cannot crash the app.
        mask = filtered_df["model"].str.lower().str.contains(search_query.lower(), regex=False)
        filtered_df = filtered_df[mask]
    if categories:
        filtered_df = filtered_df[filtered_df["category"].isin(categories)]
    # Reset the index after filtering (ranks themselves are left untouched).
    return filtered_df.reset_index(drop=True)


def get_display_df(search_query, categories):
    """Filter the leaderboard and format it for on-screen display.

    Renames internal column names to their display headers and formats every
    score column to two decimals; a missing reference accuracy (``None``)
    renders as ``--``.
    """
    display_df = filter_leaderboard(search_query, categories).rename(columns={
        "rank": "Rank",
        "model": "Model",
        "category": "Category",
        "overall": "Overall",
        "well_written": "Well-writ.",
        "neutral": "Neutral",
        "broad": "Broad",
        "cov_wiki": "Cov. Wiki",
        "ref_acc": "Ref. Acc.",
    })
    # None/NaN means citation extraction was not possible for this model.
    display_df["Ref. Acc."] = display_df["Ref. Acc."].apply(
        lambda x: "--" if pd.isna(x) else f"{x:.2f}"
    )
    for col in ["Overall", "Well-writ.", "Neutral", "Broad", "Cov. Wiki"]:
        display_df[col] = display_df[col].apply(lambda x: f"{x:.2f}")
    return display_df


# ============== GRADIO APP ==============
demo = gr.Blocks(css=CUSTOM_CSS, title="Wiki Live Challenge Leaderboard")

with demo:
    gr.HTML(TITLE)
    gr.HTML(SUBTITLE)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard", elem_id="leaderboard-tab", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    search_box = gr.Textbox(
                        label="Model Search",
                        placeholder="Enter model name to search...",
                        show_label=True,
                    )
                with gr.Column(scale=1):
                    category_filter = gr.CheckboxGroup(
                        choices=["Open-Source", "Proprietary"],
                        value=["Open-Source", "Proprietary"],
                        label="Model Categories",
                        interactive=True,
                    )

            # Initial display: no search query, both categories selected.
            initial_df = get_display_df("", ["Open-Source", "Proprietary"])
            leaderboard_table = gr.Dataframe(
                value=initial_df,
                headers=["Rank", "Model", "Category", "Overall", "Well-writ.",
                         "Neutral", "Broad", "Cov. Wiki", "Ref. Acc."],
                datatype=["str"] * 9,
                elem_id="leaderboard-table",
                interactive=False,
                column_widths=["60px", "250px", "100px", "80px", "80px",
                               "80px", "80px", "80px", "80px"],
            )

            # Re-render the table whenever either filter input changes.
            search_box.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table,
            )
            category_filter.change(
                fn=get_display_df,
                inputs=[search_box, category_filter],
                outputs=leaderboard_table,
            )

            # Column descriptions
            gr.Markdown("""
### 📊 Column Descriptions
- **Rank**: Model ranking based on Overall score
- **Model**: Deep Research Agent name
- **Overall/Well-writ./Neutral/Broad**: Wiki Writing win rates (%)
- **Cov. Wiki**: Factual coverage against Wikipedia (%)
- **Ref. Acc.**: Reference accuracy (-- indicates citation extraction not possible)
""")

        # About Tab
        with gr.TabItem("📝 About", elem_id="about-tab", id=1):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    # Citation
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                label="Copy the following snippet to cite these results",
                lines=8,
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch()