-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataloader.py
142 lines (133 loc) · 5.71 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import csv
import urllib.request
def download_data():
    """Download the Johns Hopkins COVID-19 time-series CSV files.

    Each file in the list below is fetched from the CSSEGISandData GitHub
    repository and stored under ``./johns-hopkins-data/`` (relative to the
    current working directory) with its original file name. Existing local
    copies are overwritten.

    Raises:
        urllib.error.URLError: If a download fails (propagated from
            ``urllib.request.urlretrieve``).
        OSError: If the target folder or a target file cannot be written.
    """
    # Local import so this function does not depend on the module's import block.
    import os

    base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
    file_list = [
        'time_series_covid19_confirmed_global.csv',
        'time_series_covid19_recovered_global.csv',
        'time_series_covid19_deaths_global.csv',
        'time_series_covid19_confirmed_US.csv',
        'time_series_covid19_deaths_US.csv',
    ]
    localfolder = './johns-hopkins-data/'

    # urlretrieve does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError on the first file.
    os.makedirs(localfolder, exist_ok=True)

    for file in file_list:
        url = f"{base_url}{file}"
        filename = f"{localfolder}{file}"
        urllib.request.urlretrieve(url, filename)
def _read_data_rows(path):
    """Return the non-empty data rows of the CSV file at *path*, without its header line."""
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, newline='') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines; drop them so later column
        # indexing does not fail on a trailing empty line.
        return [row for row in reader if row]


def _sum_days_by_name(rows, wanted_names, name_col, first_day_col, number_of_days):
    """Sum the per-day columns of all rows sharing a name, for names in *wanted_names*.

    Returns a dict mapping each name that occurs in *rows* to a list of
    *number_of_days* integer totals. Rows whose name is not wanted are skipped
    without parsing their numbers.

    Raises:
        IndexError: If a wanted row has fewer than *number_of_days* day columns.
        ValueError: If a day column of a wanted row is not an integer.
    """
    totals_by_name = {}
    last_day_col = first_day_col + number_of_days
    for row in rows:
        name = row[name_col]
        if name not in wanted_names:
            continue
        # Slice once per row instead of once per day.
        day_values = row[first_day_col:last_day_col]
        if len(day_values) != number_of_days:
            raise IndexError(
                f"Row for {name!r} has {len(day_values)} day columns, expected {number_of_days}"
            )
        totals = totals_by_name.setdefault(name, [0] * number_of_days)
        for i, value in enumerate(day_values):
            totals[i] += int(value)
    return totals_by_name


def _daily_increase(cumulative):
    """Return day-over-day differences of *cumulative*; the first day is compared against 0."""
    return [today - before for before, today in zip([0] + cumulative[:-1], cumulative)]


def load_data(region="world"):
    """Read the Johns Hopkins files and aggregate them per country (or per US state).

    Args:
        region: ``"USA"`` loads the US files and groups rows by state; any
            other value loads the global files and groups rows by country.

    Returns:
        A dict with two keys:

        * ``'countries'``: the rows of the ISO list file whose name (second
          column) also occurs in the Johns Hopkins confirmed-cases file,
          ordered by name.
        * ``'countries_data'``: one dict per entry of ``'countries'``, in the
          same order, with keys ``'name'``, ``'confirmed'``, ``'deaths'``
          (cumulative totals per day, summed over all rows with that name),
          ``'d_confirmed'`` and ``'d_deaths'`` (day-over-day differences).

    Side effects:
        Prints ``Not found: <name>`` for every Johns Hopkins name missing from
        the ISO list, and the number of days in the data set.

    Raises:
        FileNotFoundError: If one of the input files is missing.
        IndexError: If a matched row has fewer day columns than the first row
            of the confirmed-cases file.
        ValueError: If a day column of a matched row is not an integer.
    """
    if region == "USA":
        file_jh_confirmed = './johns-hopkins-data/time_series_covid19_confirmed_US.csv'
        file_jh_deaths = './johns-hopkins-data/time_series_covid19_deaths_US.csv'
        # TODO: Get this directly from johns hopkins file for USA
        file_countries = './ISO-USA.csv'
        # USA. State name in 7th column. Data per day begins on 12th column
        # for confirmed, 13th column for deaths.
        name_col = 6
        confirmed_first_day_col = 11
        deaths_first_day_col = 12
    else:
        file_jh_confirmed = './johns-hopkins-data/time_series_covid19_confirmed_global.csv'
        file_jh_deaths = './johns-hopkins-data/time_series_covid19_deaths_global.csv'
        file_countries = './ISO-countries.csv'
        # World. Country name in 2nd column. Data per day begins on 5th column.
        name_col = 1
        confirmed_first_day_col = 4
        deaths_first_day_col = 4

    hopkins_confirmed = _read_data_rows(file_jh_confirmed)
    hopkins_deaths = _read_data_rows(file_jh_deaths)
    countries_ISO = _read_data_rows(file_countries)

    # Names in the ISO list, each mapped to every ISO row carrying that name
    # (kept in file order so duplicates behave like a linear scan would).
    iso_rows_by_name = {}
    for iso_row in countries_ISO:
        if len(iso_row) > 1:
            iso_rows_by_name.setdefault(iso_row[1], []).append(iso_row)

    # List of ISO countries that can be found in Johns Hopkins files. This is
    # the one we will be using: no cruise ships or names spelled differently.
    countries = []
    for country_ori in sorted({row[name_col] for row in hopkins_confirmed}):
        matches = iso_rows_by_name.get(country_ori)
        if matches:
            countries.extend(matches)
        else:
            print(f"Not found: {country_ori}")

    # Number of days for which we have data in the file, taken from the first
    # data row (an empty file simply has zero days).
    if hopkins_confirmed:
        number_of_days = max(0, len(hopkins_confirmed[0]) - confirmed_first_day_col)
    else:
        number_of_days = 0
    print(f"{number_of_days} days in data set")

    # Combine rows that belong to the same country in one pass per file.
    wanted_names = {country[1] for country in countries}
    confirmed_by_name = _sum_days_by_name(
        hopkins_confirmed, wanted_names, name_col, confirmed_first_day_col, number_of_days
    )
    deaths_by_name = _sum_days_by_name(
        hopkins_deaths, wanted_names, name_col, deaths_first_day_col, number_of_days
    )

    countries_data = []
    for country in countries:
        name = country[1]
        # Copy so every entry owns its lists, even if a name appears twice.
        temp_confirmed = list(confirmed_by_name.get(name, [0] * number_of_days))
        temp_deaths = list(deaths_by_name.get(name, [0] * number_of_days))
        countries_data.append({
            'name': name,
            'confirmed': temp_confirmed,
            'deaths': temp_deaths,
            'd_confirmed': _daily_increase(temp_confirmed),
            'd_deaths': _daily_increase(temp_deaths)
        })

    return {
        'countries': countries,
        'countries_data': countries_data,
    }