Auto-download MMLU data, add small datasets (#4)

* Remove unnecessary files * Added lost datasets * Remove GSM8K * Automated download script * Autodownload in eval script --------- Co-authored-by: Dmitrii Khizbullin <[email protected]>
metauto-ai · Feb 27, 2024 · 931cb1e · 931cb1e
1 parent 796b891
commit 931cb1e
Show file tree

Hide file tree

Showing 24 changed files with 7,487 additions and 642 deletions.
diff --git a/.gitmodules b/.gitmodules
diff --git a/datasets/MMLU/.gitignore b/datasets/MMLU/.gitignore
@@ -0,0 +1,5 @@
+data.tar
+data/
+data/README.txt
+data/auxiliary_train/
+data/possibly_contaminated_urls.txt
diff --git a/datasets/MMLU/download.py b/datasets/MMLU/download.py
@@ -0,0 +1,27 @@
+import os
+import requests
+import tarfile
+
+
+def download():
+
+    this_file_path = os.path.split(__file__)[0]
+    tar_path = os.path.join(this_file_path, "data.tar")
+    if not os.path.exists(tar_path):
+        url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
+        print(f"Downloading {url}")
+        r = requests.get(url, allow_redirects=True)
+        with open(tar_path, 'wb') as f:
+            f.write(r.content)
+        print(f"Saved to {tar_path}")
+
+    data_path = os.path.join(this_file_path, "data")
+    if not os.path.exists(data_path):
+        tar = tarfile.open(tar_path)
+        tar.extractall(this_file_path)
+        tar.close()
+        print(f"Saved to {data_path}")
+
+
+# Run the download when this file is executed directly as a script.
+if __name__ == "__main__":
+    download()
diff --git a/datasets/README.md b/datasets/README.md
@@ -0,0 +1,11 @@
+**We include the following datasets, each linked to what is, to the best of our understanding, its original source.**
+
+[GSM8K](https://github.com/openai/grade-school-math/tree/3101c7d5072418e28b9008a6636bde82a006892c)
+
+[MMLU](https://github.com/hendrycks/test)
+
+[Mini Crosswords](https://www.goobix.com/crosswords/0505/)
+
+[GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA)
+
+[HumanEval](https://github.com/openai/human-eval)