Skip to content

Commit

Permalink
push benchmark analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
YannDubs committed Mar 17, 2024
1 parent 15c5984 commit e162b90
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 0 deletions.
213 changes: 213 additions & 0 deletions notebooks/analyzing_benchmarks.ipynb

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions notebooks/benchmarks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
Model,"Arena Elo
[Feb 2, 2024]",LC AlpacaEval 2.0,AlpacaEval 2.0,AlpacaEval 1.0,"MT-bench
(multi-turn)",WildBench,"Open LLM
(average)","ARC-C
(25-shot)","HellaSwag
(10-shot)","MMLU
(5-shot)","TruthfulQA
(0-shot)","WinoGrande
(5-shot)","GSM-8K
(5-shot)","GPT4All
(average)","AGI Eval
(en)","HELM
Lite","BBH, cot
(3-shot)","HumanEval
(pass @1)","LLMonitor
(01-10)","OpenComp.
(en, avg)","MBPP
(pass @1)",Output Length
gpt4_1106_preview,1251.0,50.0,50.0,97.69900497512438,9.32,,,,,80.5,,,,,,0.834,83.9,85.4,,,83.0,2049.0
gpt4_0125_preview,1249.0,,,,,940.6,,,,,,,,,,,,,,,,
claude-3-opus-20240229,1247.0,40.39177606350116,29.04176413403727,,,852.6,,,,86.8,,,,,,,,,,,,1388.0
claude-3-sonnet-20240229,1190.0,34.87247436243302,25.556325292273296,,,835.8,,,,79.0,,,,,,,,,,,,1420.0
gpt4_0314,1185.0,35.30706121640206,22.073258928708075,94.78260869565216,8.96,,,96.3,95.3,86.4,59.0,,,,57.0,,86.7,88.4,93.0,73.3,,1371.0
gpt4_0613,1160.0,30.18332231673423,15.75503808763975,93.78109452736318,9.18,,,,,,,,,,57.0,0.962,86.7,88.4,89.0,73.3,,1140.0
mistral-large-2402,1155.0,32.65207998531868,21.43877598137888,,,824.2,,,,81.2,,,,,,,,,,,,1362.0
Qwen1.5-72B-Chat,1146.0,36.571754111987296,26.49828339562733,,8.61,,,,,77.5,,,,,,,,,,,,1549.0
claude,1145.0,27.289504443727107,16.98534361236025,91.5527950310559,7.9,,,,,77.0,,,,,49.7,0.724,67.3,56.0,66.0,46.3,,1082.0
mistral-medium,1145.0,28.614337401726104,21.855772543652176,96.83229813664596,8.61,,,89.9,88.0,75.3,,88.0,66.7,,,,,,,,62.3,1500.0
claude-2,1126.0,28.155196141629148,17.188240356708075,91.35572139303484,8.06,,,,,78.5,,,,,,0.679,,71.2,68.0,,,1069.0
Mistral-Next,1123.0,,,,,,,,,,,,,,,,,,,,,
Gemini Pro (Dev API),1118.0,,,,,,,,,71.8,,,,,,,,,,,,
claude-2.1,1115.0,25.251943886133027,15.733506736409938,87.0807453416149,8.18,,,,,,,,,,,0.593,,,,,,1096.0
Mixtral-8x7B-Instruct-v0.1,1114.0,23.68848260134481,18.25531762637268,94.78260869565216,8.3,765.7,72.62,70.22,87.63,70.6,64.58,81.37,60.73,76.41,45.3,0.728,67.0,54.9,,56.8,60.7,1465.0
gpt-3.5-turbo-0613,1113.0,22.35251298054288,14.09579857390062,,8.39,,,,,,,,,,,0.507,71.0,72.6,81.0,,,1331.0
gemini-pro,1110.0,24.38177610802152,18.177644540571432,79.66417910447761,,788.0,,,,71.8,,,,,,,65.6,63.4,,,72.9,1456.0
GPT-3.5-Turbo-0314,1104.0,,,,7.94,,,85.5,70.6,70.0,,85.2,57.1,,43.2,,,73.2,79.0,63.5,81.6,
claude-instant-1.2,1104.0,25.61225902543337,16.12739962159006,,7.85,,,,,73.4,,,,,,,,52.8,60.0,,,1112.0
wizardlm-70b,1102.0,17.575060737493747,14.383896086782608,,7.71,,61.25,65.44,84.41,63.7,54.81,80.82,17.97,,,,,,,,,1545.0
Yi-34B-Chat,1099.0,27.19054787762733,29.65994671879504,94.08468244084682,,743.9,65.32,65.44,84.16,73.5,55.37,80.11,31.92,72.13,50.8,0.772,71.7,,,63.3,,2123.0
tulu-2-dpo-70b,1097.0,21.238610038371124,15.982854374136648,95.03105590062113,7.89,685.9,73.77,72.1,88.99,69.84,65.78,83.27,62.62,,,,66.0,,,,,1418.0
GPT-3.5-Turbo-0125,1096.0,,,,,736.4,,,,,,,,,,,,,,,,
vicuna-33b-v1.3,1089.0,17.574575310874923,12.705947921540371,88.99253731,7.12,,58.5,,,59.2,,,,,37.3,,52.0,,,53.0,,1479.0
Starling-LM-7B-alpha,1084.0,14.690471079424972,14.24592352162733,,8.09,,67.13,63.82,84.9,63.9,46.39,80.58,62.4,72.72,40.1,,,,,,,1895.0
llama-2-70b-chat-hf,1082.0,14.689648588392544,13.88825834374378,92.66169154228857,6.86,697.4,62.4,64.59,85.88,63.0,52.8,80.51,26.69,,45.0,,60.8,,60.0,58.6,,1790.0
OpenHermes-2.5-Mistral-7B,1079.0,16.248577696674843,10.340415705751552,,,,61.52,64.93,84.18,63.8,52.24,78.06,26.08,73.12,43.0,,,48.2,,,,1107.0
NV-Llama2-70B-SteerLM-Chat,1076.0,,,,7.54,,,,,68.5,,,,,,,,,,,,
Mistral-7B-Instruct-v0.2,1073.0,17.111251846021165,14.722772657714286,92.77708592777088,7.6,,,,,,,,,,,,,,,,,1676.0
deepseek-llm-67b-chat,1073.0,17.843384089909343,12.093422264919258,,,,71.79,67.75,86.82,72.42,55.85,84.21,63.68,,,,,,,,,1151.0
OpenChat-3.5,1071.0,,,,7.81,,61.24,63.91,84.79,64.3,46.38,80.58,26.84,72.92,42.7,,,55.5,,,,
pplx-70b-online,1068.0,,,,,,,,,,,,,,,,,,,,,
SOLAR-10.7B-Instruct-v1.0,1065.0,,,,7.58,,74.2,71.08,88.16,66.2,71.43,83.58,64.75,75.11,47.6,,,,,,42.9,
dolphin-2.2.1-mistral-7b,1058.0,13.121477650433736,9.039799728223604,,,,64.93,63.31,83.76,63.2,53.11,78.14,48.07,72.24,39.2,,59.8,,,58.0,,1130.0
wizardlm-13b-v1.2,1054.0,14.462590694316631,12.027480342770186,89.16562889,7.2,,54.76,59.04,82.21,52.7,47.27,71.9,13.5,,,,,,,,,1635.0
zephyr-7b-beta,1046.0,13.203198493136666,10.992885755354038,90.5977584059776,7.34,662.3,61.95,62.03,84.36,61.4,57.45,77.74,29.04,71.83,40.6,,,30.0,,,41.1,1444.0
llama-2-13b-chat-hf,1043.0,8.436014548885215,7.702309957875775,81.09452736318407,6.65,678.2,54.91,59.04,81.94,53.6,44.12,74.51,15.24,,33.6,0.348,58.2,,50.0,50.3,,1513.0
MPT-30B-chat,1042.0,,,,6.39,,55.38,58.7,82.54,50.4,52.42,75.3,12.13,,,,,,40.0,,,
CodeLlama-34B-instruct,1040.0,,,,,,57.29,54.27,76.92,53.7,44.44,74.59,37.98,,,,,51.8,34.0,,,
vicuna-13b-v1.5,1037.0,10.484438298504218,6.722122014857143,,6.57,593.2,55.4,57.08,81.24,55.8,51.51,74.66,11.3,63.1,36.8,,51.5,17.1,50.0,52.1,,1061.0
pplx-7b-online,1035.0,,,,,,,,,,,,,,,,,,,,,
zephyr-7b-alpha,1033.0,10.289760888704258,8.352663968198758,85.7587064676617,6.88,,59.5,61.01,84.04,61.4,57.9,78.61,14.03,72.24,38.0,,,,,,,1302.0
Qwen-14B-Chat,1032.0,12.378741790737235,7.502333484720497,,6.96,,,,,66.5,,,59.7,,39.6,,53.7,43.9,,,,1013.0
guanaco-33b,1031.0,5.690019090866207,5.002493724956522,65.96273292,6.53,,,,,57.6,,,,,,,,,43.0,,,1311.0
gemma-7b-it,1029.0,10.425760403690134,6.937294379677018,,,676.5,,,,64.3,,,,,,,,,,,,1115.0
llama-2-7b-chat-hf,1027.0,5.354821279508294,4.961339547167702,71.36645962732919,6.27,651.9,50.74,52.9,78.55,45.8,45.57,71.74,7.35,,29.6,0.217,35.6,,50.0,,,1479.0
falcon-180b-chat,1026.0,,,,,,67.85,69.45,88.86,68.0,45.47,86.9,45.94,,,,,,67.0,,,
Mistral-7B-Instruct-v0.1,1002.0,,,,6.84,545.9,54.96,54.52,75.63,55.4,56.28,73.72,14.25,67.95,33.5,0.438,56.7,28.7,57.0,53.6,,
vicuna-7b-v1.5,1001.0,7.616892731870527,4.797493939167703,,6.17,,50.1,53.24,77.39,49.8,50.34,72.14,8.19,61.0,31.4,,43.4,11.6,41.0,,,1083.0
gemma-2b-it,985.0,5.437453620377121,3.4019714381366457,,,,,,,42.3,,,,,,,,,,,,1041.0
chatglm2-6b,925.0,4.35928292679035,2.7621847964596284,47.12858926,4.96,,,,,45.5,,,,,,,,,,,,1027.0
oasst-sft-pythia-12b,893.0,3.270102114456748,1.790114083180124,25.96273292,4.32,,40.77,46.42,70.0,26.19,39.19,62.19,0.61,,,,,,,,,726.0
Yi-34Bx2-MoE-60B,,,,,,,76.72,71.08,85.23,77.5,66.19,84.85,75.51,,,,,,,,,
Binary file added notebooks/chat_correlations.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 26 additions & 0 deletions notebooks/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from typing import Any, Union

import matplotlib.pyplot as plt
import seaborn as sns


def save_fig(fig: Any, filename: Union[str, bytes, os.PathLike], dpi: int = 300, is_tight: bool = True) -> None:
    """Save a figure-like object (FacetGrid, Artist/Axes, or Figure) to ``filename``.

    Args:
        fig: A seaborn ``FacetGrid``, any matplotlib ``Artist`` (e.g. an
            ``Axes``), or a ``plt.Figure``.
        filename: Destination path, forwarded to ``Figure.savefig``.
        dpi: Output resolution in dots per inch.
        is_tight: If True, trim surrounding whitespace via ``bbox_inches="tight"``.

    Raises:
        ValueError: If ``fig`` cannot be reduced to a ``plt.Figure``.
    """
    # Order matters and the checks deliberately cascade (no elif): a FacetGrid
    # unwraps to its Figure, an Artist unwraps to its parent Figure, and the
    # final branch saves the resulting Figure.
    if isinstance(fig, sns.FacetGrid):
        fig = fig.fig

    # A Figure is itself an Artist, so exclude it here: calling
    # Figure.get_figure() on a top-level Figure can return None (older
    # matplotlib), which would wrongly trigger the ValueError below.
    if isinstance(fig, plt.Artist) and not isinstance(fig, plt.Figure):
        fig = fig.get_figure()

    if isinstance(fig, plt.Figure):
        plt_kwargs: dict = {}
        if is_tight:
            plt_kwargs["bbox_inches"] = "tight"

        fig.savefig(filename, dpi=dpi, **plt_kwargs)
        plt.close(fig)  # release the figure's memory once it is on disk
    else:
        raise ValueError(f"Unknown figure type {type(fig)}")
File renamed without changes.

0 comments on commit e162b90

Please sign in to comment.