-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_data.py
31 lines (23 loc) · 893 Bytes
/
fetch_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""Script to download the 20 newsgroups text classification set"""
import os
import tarfile
from contextlib import closing
try:
from urllib import urlopen
except ImportError:
from urllib.request import urlopen
# Remote location of the "bydate" 20 newsgroups tarball.
URL = "http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz"

# Local file name used for the download: the last path component of URL.
ARCHIVE_NAME = URL.rpartition('/')[-1]

# Folders whose presence is checked before anything is downloaded.
TEST_FOLDER = "20news-bydate-test"
TRAIN_FOLDER = "20news-bydate-train"
# Fetch and unpack the dataset unless both extracted folders already exist.
if not (os.path.exists(TRAIN_FOLDER) and os.path.exists(TEST_FOLDER)):

    # Reuse an archive that is already on disk; otherwise download it.
    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (14 MB)" % URL)
        # Download under a temporary name and rename only once the transfer
        # has finished, so an interrupted run never leaves a truncated
        # ARCHIVE_NAME that the next run would mistake for a complete one.
        partial_name = ARCHIVE_NAME + ".part"
        # closing() guarantees the response is released even on error; it is
        # used because the `urllib.urlopen` fallback import may return an
        # object that is not a context manager by itself.
        with closing(urlopen(URL)) as response:
            with open(partial_name, 'wb') as partial:
                # Copy in fixed-size chunks instead of holding the whole
                # archive in memory at once.
                for chunk in iter(lambda: response.read(64 * 1024), b''):
                    partial.write(chunk)
        os.rename(partial_name, ARCHIVE_NAME)

    print("Decompressing %s" % ARCHIVE_NAME)
    # NOTE(review): the archive is fetched over plain HTTP and extractall()
    # writes whatever member paths the tarball contains. Newer Python
    # versions accept an extraction `filter` argument; consider using it if
    # Python 2 compatibility is no longer required.
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        archive.extractall(path='.')
    # The extracted folders are all that is needed; drop the tarball.
    os.remove(ARCHIVE_NAME)