
Commit bfc26a0

adityamittal13 and co-authors authored
Add edit leaderboard + citation (#3661)
Co-authored-by: eFrick <[email protected]>
Co-authored-by: Kelly Tang <[email protected]>
Co-authored-by: Sophie Xie <[email protected]>
1 parent 03d376c commit bfc26a0

File tree (3 files changed: +124, -43 lines):

fastchat/serve/gradio_web_server.py
fastchat/serve/monitor/copilot_arena.py
fastchat/serve/monitor/monitor.py

fastchat/serve/gradio_web_server.py

Lines changed: 13 additions & 0 deletions
@@ -66,6 +66,19 @@
 enable_moderation = False
 use_remote_storage = False
 
+default_citation_md = """
+### Citation
+Please cite the following paper if you find our leaderboard or dataset helpful.
+```
+@misc{chiang2024chatbot,
+    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
+    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
+    year={2024},
+    eprint={2403.04132},
+    archivePrefix={arXiv},
+    primaryClass={cs.AI}
+}
+"""
 acknowledgment_md = """
 ### Terms of Service
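For reference, a module-level markdown string like `default_citation_md` is consumed by `gr.Markdown` when the leaderboard UI is built (see the `monitor.py` hunk below). A minimal self-contained sketch of that usage; the Blocks layout and the shortened citation text are illustrative, not part of the commit:

```python
# Minimal sketch: rendering a module-level citation string with gr.Markdown.
# Only the constant's name mirrors the commit; the layout is illustrative.
import gradio as gr

default_citation_md = """
### Citation
Please cite the following paper if you find our leaderboard or dataset helpful.
"""

with gr.Blocks() as demo:
    with gr.Accordion("Citation", open=True):
        # elem_id lets the page's CSS target this block, as in the real app.
        gr.Markdown(default_citation_md, elem_id="leaderboard_markdown")

if __name__ == "__main__":
    demo.launch()
```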

fastchat/serve/monitor/copilot_arena.py

Lines changed: 90 additions & 27 deletions
@@ -6,6 +6,16 @@
 from fastchat.serve.monitor.monitor import recompute_final_ranking
 
 copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL")
+copilot_arena_citation_md = """
+### Citation
+Please cite the following paper if you find our leaderboard helpful.
+```
+@misc{chi2025copilot,
+    title={Copilot Arena: A Platform for Code LLM Evaluation in the Wild},
+    author={Wayne Chi and Valerie Chen and Wei-Lin Chiang and Anastasios N. Angelopoulos and Aditya Mittal and Naman Jain and Tianjun Zhang and Ion Stoica and Chris Donahue and Ameet Talwalkar},
+    year={2025},
+}
+"""
 
 
 def process_copilot_arena_leaderboard(leaderboard):
@@ -31,41 +41,88 @@ def process_copilot_arena_leaderboard(leaderboard):
         by=["Rank* (UB)", "score"], ascending=[True, False]
     )
 
+    leaderboard = leaderboard.rename(
+        columns={
+            "name": "Model",
+            "confidence_interval": "Confidence Interval",
+            "score": "Arena Score",
+            "organization": "Organization",
+            "votes": "Votes",
+        }
+    )
+
+    column_order = [
+        "Rank* (UB)",
+        "Model",
+        "Arena Score",
+        "Confidence Interval",
+        "Votes",
+        "Organization",
+    ]
+    leaderboard = leaderboard[column_order]
+
     return leaderboard
 
 
+def make_copilot_arena_leaderboard_md(leaderboard, interaction_mode):
+    num_models = len(leaderboard)
+    total_battles = int(leaderboard["Votes"].sum()) // 2
+    space = "&nbsp;&nbsp;&nbsp;"
+    leaderboard_md = f"""### {interaction_mode}
+#### {space} #models: **{num_models}** {space} #votes: **{"{:,}".format(total_battles)}** {space}
+"""
+    return leaderboard_md
+
+
 def build_copilot_arena_tab():
     response = requests.get(copilot_arena_leaderboard_url)
     if response.status_code == 200:
-        leaderboard = pd.DataFrame(response.json()["elo_data"])
-        leaderboard = process_copilot_arena_leaderboard(leaderboard)
-        leaderboard = leaderboard.rename(
-            columns={
-                "name": "Model",
-                "confidence_interval": "Confidence Interval",
-                "score": "Arena Score",
-                "organization": "Organization",
-                "votes": "Votes",
-            }
-        )
-
-        column_order = [
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "Confidence Interval",
-            "Votes",
-            "Organization",
-        ]
-        leaderboard = leaderboard[column_order]
-        num_models = len(leaderboard)
-        total_battles = int(leaderboard["Votes"].sum()) // 2
-        md = f"""
-[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
-"""
-
-        gr.Markdown(md, elem_id="leaderboard_markdown")
-        gr.DataFrame(
+        response_json = response.json()
+
+        def update_copilot_arena_leaderboard(interaction_mode):
+            if interaction_mode == "Code Completion":
+                leaderboard = pd.DataFrame(response_json["elo_data"])
+            else:
+                leaderboard = pd.DataFrame(response_json["edit_elo_data"])
+            leaderboard = process_copilot_arena_leaderboard(leaderboard)
+            leaderboard_df = gr.DataFrame(
+                leaderboard,
+                datatype=["str" for _ in leaderboard.columns],
+                elem_id="arena_hard_leaderboard",
+                height=600,
+                wrap=True,
+                interactive=False,
+                column_widths=[70, 130, 60, 80, 50, 80],
+            )
+
+            md = make_copilot_arena_leaderboard_md(leaderboard, interaction_mode)
+            leaderboard_md = gr.Markdown(md, elem_id="leaderboard_markdown")
+
+            return leaderboard_df, leaderboard_md
+
+        gr.Markdown(
+            "[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs.",
+            elem_id="copilot_arena_introduction",
+        )
+
+        leaderboard = pd.DataFrame(response_json["elo_data"])
+        leaderboard = process_copilot_arena_leaderboard(leaderboard)
+        with gr.Row():
+            with gr.Column(scale=2):
+                interaction_mode_dropdown = gr.Radio(
+                    choices=["Code Completion", "Code Edit"],
+                    label="Interaction Mode",
+                    value="Code Completion",
+                )
+                vote_data = make_copilot_arena_leaderboard_md(
+                    leaderboard, "Code Completion"
+                )
+            with gr.Column(scale=3, variant="panel"):
+                interaction_mode_details = gr.Markdown(
+                    vote_data, elem_id="interaction_mode_details"
+                )
+
+        leaderboard_df = gr.DataFrame(
             leaderboard,
             datatype=["str" for _ in leaderboard.columns],
             elem_id="arena_hard_leaderboard",
@@ -83,5 +140,11 @@ def build_copilot_arena_tab():
 """,
             elem_id="leaderboard_markdown",
         )
+
+        interaction_mode_dropdown.change(
+            update_copilot_arena_leaderboard,
+            inputs=[interaction_mode_dropdown],
+            outputs=[leaderboard_df, interaction_mode_details],
+        )
     else:
         gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")

fastchat/serve/monitor/monitor.py

Lines changed: 21 additions & 16 deletions
@@ -1038,10 +1038,13 @@ def build_leaderboard_tab(
         from fastchat.serve.monitor.copilot_arena import (
             build_copilot_arena_tab,
             copilot_arena_leaderboard_url,
+            copilot_arena_citation_md,
         )
 
         if copilot_arena_leaderboard_url:
-            with gr.Tab("Copilot Arena Leaderboard", id=5):
+            with gr.Tab(
+                "Copilot Arena Leaderboard", id=5
+            ) as copilot_arena_leaderboard_tab:
                 build_copilot_arena_tab()
         else:
             print(
@@ -1060,27 +1063,29 @@ def build_leaderboard_tab(
     else:
         pass
 
-    from fastchat.serve.gradio_web_server import acknowledgment_md
+    from fastchat.serve.gradio_web_server import default_citation_md, acknowledgment_md
 
     with gr.Accordion(
         "Citation",
         open=True,
     ):
-        citation_md = """
-### Citation
-Please cite the following paper if you find our leaderboard or dataset helpful.
-```
-@misc{chiang2024chatbot,
-    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
-    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
-    year={2024},
-    eprint={2403.04132},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI}
-}
-"""
-        gr.Markdown(citation_md, elem_id="leaderboard_markdown")
+        leaderboard_citation_md = gr.Markdown(
+            default_citation_md, elem_id="leaderboard_markdown"
+        )
     gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
+    if copilot_arena_leaderboard_tab:
+        copilot_arena_leaderboard_tab.select(
+            fn=lambda: gr.Markdown(copilot_arena_citation_md),
+            inputs=[],
+            outputs=[leaderboard_citation_md],
+        )
+    for tab in tabs.children:
+        if (not copilot_arena_leaderboard_tab) or tab != copilot_arena_leaderboard_tab:
+            tab.select(
+                fn=lambda: gr.Markdown(default_citation_md),
+                inputs=[],
+                outputs=[leaderboard_citation_md],
+            )
 
     return [md_1] + gr_plots
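The citation swap works through `Tab.select` events: selecting the Copilot Arena tab rewrites the shared markdown block with `copilot_arena_citation_md`, and selecting any other tab restores `default_citation_md`. A minimal sketch of the same pattern; the tab names and citation strings are placeholders:

```python
# Sketch of per-tab citation swapping via Tab.select, as in the hunk above.
# Tab names and citation strings are placeholders.
import gradio as gr

default_citation_md = "### Citation\nDefault citation goes here."
copilot_citation_md = "### Citation\nCopilot-specific citation goes here."

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Main Leaderboard") as main_tab:
            gr.Markdown("main leaderboard content")
        with gr.Tab("Copilot Arena Leaderboard") as copilot_tab:
            gr.Markdown("copilot arena content")
    citation = gr.Markdown(default_citation_md)

    # Selecting a tab replaces the shared citation markdown beneath the tabs.
    copilot_tab.select(
        fn=lambda: gr.Markdown(copilot_citation_md), inputs=[], outputs=[citation]
    )
    main_tab.select(
        fn=lambda: gr.Markdown(default_citation_md), inputs=[], outputs=[citation]
    )

if __name__ == "__main__":
    demo.launch()
```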
