diff --git a/bridge/bridge.py b/bridge/bridge.py
new file mode 100644
index 0000000..a1cfd47
--- /dev/null
+++ b/bridge/bridge.py
@@ -0,0 +1,328 @@
+import os
+import re
+import sys
+import git
+import pandas as pd
+from abc import ABC, abstractmethod
+from dateutil.parser import parse
+from difflib import get_close_matches
+from tqdm import tqdm
+
+REPO = 'https://github.com/CSSEGISandData/COVID-19.git'
+TMP_FOLDER = '/tmp/corona/'
+TMP_GIT = os.path.join(TMP_FOLDER, 'COVID-19')
+DATA = os.path.join(TMP_GIT, 'csse_covid_19_data/csse_covid_19_daily_reports/')
+out = './'
+
+# Column subsets used when importing the daily sheets (values assumed from the covidify sources)
+keep_cols = ['Country/Region', 'Province/State', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
+numeric_cols = ['Confirmed', 'Deaths', 'Recovered']
+
+
+class DataAccessor:
+    def __init__(self, cleaner):
+        self.cleaner = cleaner
+
+    def get_date(self, last_update):
+        pass
+
+    def get_csv_date(self, file):
+        pass
+
+    def get_data(self, cleaned_sheets):
+        pass
+
+
+class AdvancedDataAccessor(DataAccessor):
+    def __init__(self, cleaner):
+        self.cleaner = cleaner
+
+    def get_date(self, last_update):
+        return parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+
+    def get_csv_date(self, file):
+        return self.get_date(file.split('.')[0] + ' ')
+
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        # Import all CSVs
+        for file in tqdm(sorted(cleaned_sheets), desc='... importing data: '):
+            if 'csv' in file:
+                tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None, header=0, parse_dates=['Last Update'])
+                tmp_df = tmp_df[keep_cols]
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0)
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].astype(int)
+                tmp_df['Province/State'].fillna(tmp_df['Country/Region'], inplace=True)  # If no region given, fill it with the country
+
+                tmp_df['Last Update'] = tmp_df['Last Update'].apply(self.cleaner.clean_last_updated)
+                tmp_df['date'] = tmp_df['Last Update'].apply(self.get_date)
+                tmp_df['file_date'] = self.get_csv_date(file)
+                all_csv.append(tmp_df)
+
+        # Concatenate all CSVs into one dataframe
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
+        df_raw = df_raw.sort_values(by=['Last Update'])
+
+        frames = self.cleaner.drop_duplicates(df_raw)
+        tmp = pd.concat(frames, axis=0, ignore_index=True, sort=True)
+
+        return tmp
+
+    def get_clean_sheets(self):
+        # Create tmp folder
+        if not os.path.isdir(TMP_FOLDER):
+            print('Creating folder...')
+            print('...', TMP_FOLDER)
+            os.mkdir(TMP_FOLDER)
+
+        # Check if the repo exists; git pull if it does
+        if not os.path.isdir(TMP_GIT):
+            self.cleaner.clone_repo(TMP_FOLDER, REPO)
+        else:
+            try:
+                print('git pull from', REPO)
+                rep = git.Repo(TMP_GIT)
+                rep.remotes.origin.pull()
+            except Exception:
+                print('Could not pull from', REPO)
+                sys.exit()
+
+        sheets = os.listdir(DATA)
+
+        # Clean the result to the sheet tabs we want
+        print('Getting sheets...')
+        cleaned_sheets = self.cleaner.clean_sheet_names(sheets)
+
+        return cleaned_sheets
+
+    def get_similar_countries(self, c, country_list):
+        pos_countries = get_close_matches(c, country_list)
+
+        if len(pos_countries) > 0:
+            print(c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?')
+        else:
+            print(c, 'was not listed.')
+        sys.exit()
+
+    def get_new_cases(self, tmp, col):
+        diff_list = []
+        tmp_df_list = []
+        df = tmp.copy()
+
+        for i, day in enumerate(df.sort_values('date').date.unique()):
+            tmp_df = df[df.date == day]
+            tmp_df_list.append(tmp_df[col].sum())
+
+            if i == 0:
+                diff_list.append(tmp_df[col].sum())
+            else:
+                diff_list.append(tmp_df[col].sum() - tmp_df_list[i - 1])
+
+        return diff_list
+
+    def get_moving_average(self, tmp, col):
+        df = tmp.copy()
+        return df[col].rolling(window=2).mean()
+
+    def get_exp_moving_average(self, tmp, col):
+        df = tmp.copy()
+        return df[col].ewm(span=2, adjust=True).mean()
+
+
+class Cleaner(ABC):
+    @abstractmethod
+    def clean_sheet_names(self, new_ranges):
+        pass
+
+    @abstractmethod
+    def clean_last_updated(self, last_update):
+        pass
+
+    @abstractmethod
+    def drop_duplicates(self, df_raw):
+        pass
+
+    @abstractmethod
+    def clone_repo(self, tmp_folder, repo):
+        pass
+
+    @abstractmethod
+    def clean_data(self, tmp_df):
+        pass
+
+    @abstractmethod
+    def clean_sheets(self):
+        pass
+
+
+class DataCleaner3(Cleaner):
+
+    def clean_data(self, tmp_df):
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'Deaths'}, inplace=True)
+
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+
+        if 'Suspected' in tmp_df.columns:
+            tmp_df = tmp_df.drop(columns='Suspected')
+
+        for col in tmp_df.columns:
+            tmp_df[col] = tmp_df[col].fillna(0)
+
+        # Lower-case all column names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        return tmp_df
+
+    def clean_last_updated(self, last_update):
+        '''
+        Convert date and time to 'YYYY-MM-DD HH:MM:SS' format
+        '''
+        date = parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+        time = parse(str(last_update).split(' ')[1]).strftime('%H:%M:%S')
+        parsed_date = str(date) + ' ' + str(time)
+
+        return parsed_date
+
+    def clean_sheet_names(self, new_ranges):
+        # Remove all sheets that don't have a numeric header
+        numeric_sheets = [x for x in new_ranges if re.search(r'\d', x)]
+
+        return numeric_sheets
+
+    def drop_duplicates(self, df_raw):
+        '''
+        Take the latest value for each province for a given date
+        '''
+        days_list = []
+
+        for day in df_raw.date.unique():
+            tmp_df = df_raw[df_raw.date == day]
+            tmp_df = tmp_df.sort_values(['Last Update']).drop_duplicates('Province/State', keep='last')
+            days_list.append(tmp_df)
+
+        return days_list
+
+    def clone_repo(self, tmp_folder, repo):
+        print('Cloning Data Repo...')
+        git.Git(tmp_folder).clone(repo)
+
+    def clean_sheets(self):
+        # Create tmp folder
+        if not os.path.isdir(TMP_FOLDER):
+            print('Creating folder...')
+            print('...', TMP_FOLDER)
+            os.mkdir(TMP_FOLDER)
+
+        # Check if the repo exists; git pull if it does
+        if not os.path.isdir(TMP_GIT):
+            self.clone_repo(TMP_FOLDER, REPO)
+        else:
+            try:
+                print('git pull from', REPO)
+                rep = git.Repo(TMP_GIT)
+                rep.remotes.origin.pull()
+            except Exception:
+                print('Could not pull from', REPO)
+                sys.exit()
+
+        sheets = os.listdir(DATA)
+
+        # Clean the result to the sheet tabs we want
+        print('Cleaning sheets...')
+        cleaned_sheets = self.clean_sheet_names(sheets)
+        return cleaned_sheets
+
+
+class DataCleaner4(Cleaner):
+
+    def clean_data(self, tmp_df):
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'Deaths'}, inplace=True)
+
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+
+        if 'Suspected' in tmp_df.columns:
+            tmp_df = tmp_df.drop(columns='Suspected')
+
+        for col in tmp_df.columns:
+            tmp_df[col] = tmp_df[col].fillna(0)
+
+        # Lower-case all column names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        return tmp_df
+
+    def clean_last_updated(self, last_update):
+        '''
+        Convert date and time to 'YYYY-MM-DD HH:MM:SS' format
+        '''
+        date = parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+        time = parse(str(last_update).split(' ')[1]).strftime('%H:%M:%S')
+        parsed_date = str(date) + ' ' + str(time)
+
+        return parsed_date
+
+    def clean_sheet_names(self, new_ranges):
+        # Remove all sheets that don't have a numeric header
+        numeric_sheets = [x for x in new_ranges if re.search(r'\d', x)]
+
+        return numeric_sheets
+
+    def drop_duplicates(self, df_raw):
+        '''
+        Take the latest value for each province for a given date
+        '''
+        days_list = []
+
+        for day in df_raw.date.unique():
+            tmp_df = df_raw[df_raw.date == day]
+            tmp_df = tmp_df.sort_values(['Last Update']).drop_duplicates('Province/State', keep='last')
+            days_list.append(tmp_df)
+
+        return days_list
+
+    def clone_repo(self, tmp_folder, repo):
+        print('Cloning Data Repo...')
+        git.Git(tmp_folder).clone(repo)
+
+    def clean_sheets(self):
+        # Create tmp folder
+        if not os.path.isdir(TMP_FOLDER):
+            print('Creating folder...')
+            print('...', TMP_FOLDER)
+            os.mkdir(TMP_FOLDER)
+
+        # Check if the repo exists; git pull if it does
+        if not os.path.isdir(TMP_GIT):
+            self.clone_repo(TMP_FOLDER, REPO)
+        else:
+            try:
+                print('git pull from', REPO)
+                rep = git.Repo(TMP_GIT)
+                rep.remotes.origin.pull()
+            except Exception:
+                print('Could not pull from', REPO)
+                sys.exit()
+
+        sheets = os.listdir(DATA)
+
+        # Clean the result to the sheet tabs we want
+        print('Cleaning sheets...')
+        cleaned_sheets = self.clean_sheet_names(sheets)
+        return cleaned_sheets
\ No newline at end of file
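For orientation, a minimal sketch of how this bridge is meant to be wired up (assuming the classes above and a cloneable JHU checkout; this usage is not part of the patch itself):

    cleaner = DataCleaner3()                  # concrete implementor
    accessor = AdvancedDataAccessor(cleaner)  # abstraction holding a reference to it
    sheets = accessor.get_clean_sheets()      # clones/pulls the repo, lists the daily CSVs
    df = accessor.get_data(sheets)            # one cleaned, de-duplicated dataframe

Swapping in DataCleaner4 changes the cleaning implementation without touching the accessor side of the bridge.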
diff --git a/builder/builder.py b/builder/builder.py
new file mode 100644
index 0000000..eb205da
--- /dev/null
+++ b/builder/builder.py
@@ -0,0 +1,119 @@
+from abc import ABC, abstractmethod
+
+
+class Plotter(ABC):
+
+    @abstractmethod
+    def setTitle(self):
+        pass
+
+    @abstractmethod
+    def setXlabel(self):
+        pass
+
+    @abstractmethod
+    def setYlabel(self):
+        pass
+
+    @abstractmethod
+    def getResult(self):
+        pass
+
+
+class Graph:
+    def __init__(self):
+        self.Xcol = None
+        self.Ycol = None
+        self.style = None
+        self.title = None
+        self.Xlabel = None
+        self.Ylabel = None
+        self.grid = None
+
+
+class ScatterPlotterv1(Plotter):
+    def __init__(self):
+        self.graph = Graph()
+        self.x = 10    # default figure dimensions (not yet wired into Graph)
+        self.y = 6
+        self.style = 'o'
+
+    def setXlabel(self, xl):
+        self.graph.Xlabel = xl
+        return self.graph
+
+    def setYlabel(self, yl):
+        self.graph.Ylabel = yl
+        return self.graph
+
+    def setTitle(self, t):
+        self.graph.title = t
+        return self.graph
+
+    def getResult(self):
+        return self.graph
+
+
+class ScatterPlotterv2(Plotter):
+    def __init__(self):
+        self.graph = Graph()
+        self.style = 'o'
+
+    def setXcol(self, x):
+        self.graph.Xcol = x
+        return self.graph
+
+    def setYcol(self, y):
+        self.graph.Ycol = y
+        return self.graph
+
+    def setStyle(self, s):
+        self.graph.style = s
+        return self.graph
+
+    def setTitle(self, t):
+        self.graph.title = t
+        return self.graph
+
+    def setXlabel(self, xl):
+        self.graph.Xlabel = xl
+        return self.graph
+
+    def setYlabel(self, yl):
+        self.graph.Ylabel = yl
+        return self.graph
+
+    def getResult(self):
+        return self.graph
+
+
+class SeriesPlotter(Plotter):
+    def __init__(self):
+        self.graph = Graph()
+        self.start = 0
+        self.end = None
+        self.format = '-'
+
+    def setTime(self, start, end):
+        # Record the window of the series to plot
+        self.start = start
+        self.end = end
+        return self.graph
+
+    def setTitle(self, t):
+        self.graph.title = t
+        return self.graph
+
+    def setXlabel(self, xl):
+        self.graph.Xlabel = xl
+        return self.graph
+
+    def setYlabel(self, yl):
+        self.graph.Ylabel = yl
+        return self.graph
+
+    def setGrid(self, g):
+        self.graph.grid = g
+        return self.graph
+
+    def getResult(self):
+        return self.graph
+
+
+class Director:
+    def __init__(self) -> None:
+        self.graph = None
+
+    def setGraph(self, graph):
+        self.graph = graph
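A sketch of driving one of the builders (method names as defined above; nothing here renders anything, since the builders only assemble a Graph description):

    builder = ScatterPlotterv2()
    builder.setXcol('date')
    builder.setYcol('confirmed')
    builder.setStyle('o')
    builder.setTitle('Confirmed cases over time')
    builder.setXlabel('date')
    builder.setYlabel('cases')
    graph = builder.getResult()  # a Graph value object describing the plot

Note that the setters return the Graph rather than self, so calls cannot be fluently chained; returning self would allow builder.setXcol('date').setYcol('confirmed').getResult().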
diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py
index 38c7a71..1ad1502 100644
--- a/src/covidify/sources/github.py
+++ b/src/covidify/sources/github.py
@@ -166,4 +166,44 @@ def get():
     df = get_data(cleaned_sheets)
 
     #Clean the column names
-    return df
\ No newline at end of file
+    return df
+
+    # ------------------------------------------------------------------------------------------------------
+    # import unittest
+    # class TestGraphTypes(unittest.TestCase):
+
+    #     image_dir = './images/'
+    #     if not os.path.exists(image_dir):
+    #         os.mkdir(image_dir)
+    #     COUNTRY = 'Uk'
+
+    #     data_dir = './data/' + str(datetime.date(datetime.now()))
+    #     trend_file = 'trend_{}.csv'.format(datetime.date(datetime.now()))
+    #     agg_file = 'agg_data_{}.parquet.gzip'.format(datetime.date(datetime.now()))
+    #     daily_df = pd.read_csv(os.path.join(data_dir, trend_file))
+    #     agg_df = pd.read_parquet(os.path.join(data_dir, agg_file))
+    #     new_df = pd.DataFrame([])
+    #     new_df['date'] = daily_df['date']
+    #     new_df['confirmed_cases'] = agg_df.groupby(['date']).confirmed.sum().values - daily_df.new_confirmed_cases
+    #     new_df['new_confirmed_cases'] = daily_df.new_confirmed_cases
+    #     create_stacked_bar(new_df, 'new_confirmed_cases', 'confirmed_cases', 'Stacked bar of confirmed and new cases by day', COUNTRY)
+    #     create_trend_line(agg_df, 'confirmed', 'deaths', 'recovered', 'Accumulative trend', COUNTRY)
+    #     daily_figures_cols = ['new_confirmed_cases', 'new_deaths', 'new_recoveries', 'currently_infected']
+    #     for col, rgb in zip(daily_figures_cols, ['tomato', 'lightblue', 'mediumpurple', 'green']):
+    #         create_bar(daily_df, col, rgb, COUNTRY)
+
+    #     def get_image_types(path):
+    #         # Get all the possible types of images in the passed directory path
+    #         types = []
+    #         for fn in glob.glob(os.path.join(path, '*.png')):
+    #             types.append(fn.split('_')[-1].split('.')[0])
+    #         return types
+
+    #     image_types = get_image_types(image_dir)
+
+    #     def test_types(self):
+    #         expected = ['bar', 'trendline', 'bar']
+    #         self.assertEqual(expected, image_types)
\ No newline at end of file
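If the commented-out test above were revived, a self-contained version might look like the sketch below. The plotting helpers (create_bar and friends) and the image naming scheme are assumptions carried over from the comments; the test only checks which image types ended up on disk:

    import glob
    import os
    import unittest

    def get_image_types(path):
        # e.g. 'Uk_confirmed_bar.png' -> 'bar'
        return [fn.split('_')[-1].split('.')[0]
                for fn in sorted(glob.glob(os.path.join(path, '*.png')))]

    class TestGraphTypes(unittest.TestCase):
        def test_types(self):
            # Assumes the plotting calls have already written the PNGs to ./images/
            expected = sorted(['bar', 'trendline', 'bar'])
            self.assertEqual(expected, sorted(get_image_types('./images/')))

    if __name__ == '__main__':
        unittest.main()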
diff --git a/strategy/strategy.py b/strategy/strategy.py
new file mode 100644
index 0000000..fc9ee59
--- /dev/null
+++ b/strategy/strategy.py
@@ -0,0 +1,167 @@
+import os
+import re
+import git
+import pandas as pd
+from dateutil.parser import parse
+from tqdm import tqdm
+
+# Shared configuration (mirrors bridge/bridge.py)
+REPO = 'https://github.com/CSSEGISandData/COVID-19.git'
+TMP_FOLDER = '/tmp/corona/'
+DATA = os.path.join(TMP_FOLDER, 'COVID-19', 'csse_covid_19_data/csse_covid_19_daily_reports/')
+keep_cols = ['Country/Region', 'Province/State', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
+numeric_cols = ['Confirmed', 'Deaths', 'Recovered']
+
+
+class Strategy:
+    def get_date(self, last_update):
+        pass
+
+    def get_csv_date(self, file):
+        pass
+
+    def clone_repo(self, tmp_folder, repo):
+        pass
+
+    def clean_sheet_names(self, new_ranges):
+        pass
+
+    def clean_data(self, tmp_df):
+        pass
+
+    def get_data(self, cleaned_sheets):
+        pass
+
+    def clean_last_updates(self, last_update):
+        pass
+
+    def drop_duplicates(self, df_raw):
+        pass
+
+
+class Data_prep3(Strategy):
+    def get_date(self, last_update):
+        return parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+
+    def get_csv_date(self, file):
+        return self.get_date(file.split('.')[0] + ' ')
+
+    def clone_repo(self, tmp_folder, repo):
+        print('Cloning Data Repo...')
+        git.Git(tmp_folder).clone(repo)
+
+    def clean_sheet_names(self, new_ranges):
+        # Keep only sheets with a numeric header
+        return [x for x in new_ranges if re.search(r'\d', x)]
+
+    def clean_data(self, tmp_df):
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'Deaths'}, inplace=True)
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+        if 'Suspected' in tmp_df.columns:
+            tmp_df = tmp_df.drop(columns='Suspected')
+        for col in tmp_df.columns:
+            tmp_df[col] = tmp_df[col].fillna(0)
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        return tmp_df
+
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        for file in sorted(cleaned_sheets):
+            if 'csv' in file:
+                print('...', file)
+                tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None, header=0, parse_dates=['Last Update'])
+                tmp_df = tmp_df[keep_cols]
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0)
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].astype(int)
+                tmp_df['Province/State'].fillna(tmp_df['Country/Region'], inplace=True)
+
+                tmp_df['Last Update'] = tmp_df['Last Update'].apply(self.clean_last_updates)
+                tmp_df['date'] = tmp_df['Last Update'].apply(self.get_date)
+                tmp_df['file_date'] = self.get_csv_date(file)
+
+                all_csv.append(tmp_df)
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
+        df_raw = df_raw.sort_values(by=['Last Update'])
+        return df_raw
+
+    def clean_last_updates(self, last_update):
+        date = parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+        time = parse(str(last_update).split(' ')[1]).strftime('%H:%M:%S')
+        parsed_date = str(date) + ' ' + str(time)
+        return parsed_date
+
+    def drop_duplicates(self, df_raw):
+        days_list = []
+        for day in df_raw.date.unique():
+            tmp_df = df_raw[df_raw.date == day]
+            tmp_df = tmp_df[tmp_df.file_date != day].sort_values(['file_date']).drop_duplicates('Province/State', keep='last')
+            days_list.append(tmp_df)
+        return days_list
+
+
+class Data_prep4(Strategy):
+    def get_date(self, last_update):
+        return parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+
+    def get_csv_date(self, file):
+        return self.get_date(file.split('.')[0] + ' ')
+
+    def clone_repo(self, tmp_folder, repo):
+        print('Cloning Data Repo...')
+        git.Git(tmp_folder).clone(repo)
+
+    def clean_sheet_names(self, new_ranges):
+        return [x for x in new_ranges if re.search(r'\d', x)]
+
+    def clean_data(self, tmp_df):
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'Deaths'}, inplace=True)
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+        if 'Suspected' in tmp_df.columns:
+            tmp_df = tmp_df.drop(columns='Suspected')
+        for col in tmp_df.columns:
+            tmp_df[col] = tmp_df[col].fillna(0)
+        # Lower-case all column names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        return tmp_df
+
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        for file in tqdm(sorted(cleaned_sheets), desc='... importing data: '):
+            if 'csv' in file:
+                tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None, header=0, parse_dates=['Last Update'])
+                tmp_df = tmp_df[keep_cols]
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0)
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].astype(int)
+                tmp_df['Province/State'].fillna(tmp_df['Country/Region'], inplace=True)
+                tmp_df['Last Update'] = tmp_df['Last Update'].apply(self.clean_last_updates)
+                tmp_df['date'] = tmp_df['Last Update'].apply(self.get_date)
+                tmp_df['file_date'] = self.get_csv_date(file)
+                all_csv.append(tmp_df)
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
+        df_raw = df_raw.sort_values(by=['Last Update'])
+        frames = self.drop_duplicates(df_raw)
+        tmp = pd.concat(frames, axis=0, ignore_index=True, sort=True)
+        return tmp
+
+    def clean_last_updates(self, last_update):
+        date = parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+        time = parse(str(last_update).split(' ')[1]).strftime('%H:%M:%S')
+        parsed_date = str(date) + ' ' + str(time)
+        return parsed_date
+
+    def drop_duplicates(self, df_raw):
+        days_list = []
+        for day in df_raw.date.unique():
+            tmp_df = df_raw[df_raw.date == day]
+            tmp_df = tmp_df.sort_values(['Last Update']).drop_duplicates('Province/State', keep='last')
+            days_list.append(tmp_df)
+        return days_list
+
+
+class Context:
+    def __init__(self):
+        self._strategy = None
+
+    def setStrategy(self, strategy):
+        self._strategy = strategy
+
+    def execute_get_date(self, last_update):
+        return self._strategy.get_date(last_update)
+
+    def execute_get_csv_date(self, file):
+        return self._strategy.get_csv_date(file)
+
+    def execute_clone_repo(self, tmp_folder, repo):
+        return self._strategy.clone_repo(tmp_folder, repo)
+
+    def execute_clean_sheet_names(self, new_ranges):
+        return self._strategy.clean_sheet_names(new_ranges)
+
+    def execute_clean_data(self, tmp_df):
+        return self._strategy.clean_data(tmp_df)
+
+    def execute_get_data(self, cleaned_sheets):
+        return self._strategy.get_data(cleaned_sheets)
+
+    def execute_clean_last_updates(self, last_update):
+        return self._strategy.clean_last_updates(last_update)
+
+    def execute_drop_duplicates(self, df_raw):
+        return self._strategy.drop_duplicates(df_raw)
+
+
+class Application:
+    def run(self):
+        context = Context()
+        context.setStrategy(Data_prep3())
+        sheets = context.execute_clean_sheet_names(os.listdir(DATA))
+        return context.execute_get_data(sheets)
\ No newline at end of file
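The payoff is that the preparation policy can be swapped at runtime without changing the call site; a hedged sketch using the names above:

    context = Context()
    context.setStrategy(Data_prep3())
    raw = context.execute_get_data(sheets)      # keeps every daily record

    context.setStrategy(Data_prep4())
    deduped = context.execute_get_data(sheets)  # same call, now de-duplicated per province and day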
diff --git a/template_method/template_method.py b/template_method/template_method.py
new file mode 100644
index 0000000..6c9ab3e
--- /dev/null
+++ b/template_method/template_method.py
@@ -0,0 +1,147 @@
+import os
+import re
+import git
+import pandas as pd
+from abc import abstractmethod
+from dateutil.parser import parse
+from tqdm import tqdm
+
+# Shared configuration; the column lists are assumed from the covidify sources
+REPO = 'https://github.com/CSSEGISandData/COVID-19.git'
+TMP_FOLDER = '/tmp/corona/'
+DATA = os.path.join(TMP_FOLDER, 'COVID-19', 'csse_covid_19_data/csse_covid_19_daily_reports/')
+KEEP_COLS = ['country', 'province', 'datetime', 'date', 'file_date', 'confirmed', 'deaths', 'recovered']
+NUMERIC_COLS = ['confirmed', 'deaths', 'recovered']
+keep_cols = ['Country/Region', 'Province/State', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
+numeric_cols = ['Confirmed', 'Deaths', 'Recovered']
+
+
+class DataProcessor:
+    # Invariant steps shared by every processor
+    def get_date(self, last_update):
+        return parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+
+    def get_csv_date(self, f):
+        return self.get_date(f.split('.')[0] + ' ')
+
+    def clone_repo(self, tmp_folder, repo):
+        print('Cloning Data Repo...')
+        git.Git(tmp_folder).clone(repo)
+
+    # Varying steps supplied by subclasses
+    @abstractmethod
+    def clean_sheet_names(self, new_ranges):
+        pass
+
+    @abstractmethod
+    def clean_data(self, tmp_df):
+        pass
+
+    @abstractmethod
+    def get_data(self, cleaned_sheets):
+        pass
+
+
+class Github(DataProcessor):
+    def clean_sheet_names(self, new_ranges):
+        return [x for x in new_ranges if re.search(r'\d', x)]
+
+    def clean_data(self, df):
+        tmp_df = df.copy()
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'deaths'}, inplace=True)
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+        if 'Country_Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country_Region': 'country'}, inplace=True)
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+        if 'Province_State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province_State': 'province'}, inplace=True)
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+        if 'Last_Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last_Update': 'datetime'}, inplace=True)
+        # Lower-case all column names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        for col in NUMERIC_COLS:
+            tmp_df[col] = tmp_df[col].fillna(0)
+            tmp_df[col] = tmp_df[col].astype(int)
+        return tmp_df
+
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        # Import all CSVs
+        for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '):
+            if 'csv' in f:
+                try:
+                    tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None, header=0, parse_dates=['Last Update'])
+                except ValueError:
+                    # Temporary fix for JHU's inconsistent column naming
+                    tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None, header=0, parse_dates=['Last_Update'])
+                tmp_df = self.clean_data(tmp_df)
+                tmp_df['date'] = tmp_df['datetime'].apply(self.get_date)  # Remove the time to get the date
+                tmp_df['file_date'] = self.get_csv_date(f)  # Get the date of the CSV from the file name
+                tmp_df = tmp_df[KEEP_COLS]
+                tmp_df['province'].fillna(tmp_df['country'], inplace=True)  # If no region given, fill it with the country
+                all_csv.append(tmp_df)
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)  # Concatenate all CSVs into one dataframe
+        df_raw = fix_country_names(df_raw)  # Fix misspelled country names (helper defined in covidify.sources.github)
+        df_raw = df_raw.sort_values(by=['datetime'])
+        return df_raw
+
+
+class DataPrep(DataProcessor):
+    def clean_sheet_names(self, new_ranges):
+        # Remove all sheets that don't have a numeric header
+        numeric_sheets = [x for x in new_ranges if re.search(r'\d', x)]
+        return numeric_sheets
+
+    def clean_data(self, tmp_df):
+        if 'Demised' in tmp_df.columns:
+            tmp_df.rename(columns={'Demised': 'Deaths'}, inplace=True)
+        if 'Country/Region' in tmp_df.columns:
+            tmp_df.rename(columns={'Country/Region': 'country'}, inplace=True)
+        if 'Province/State' in tmp_df.columns:
+            tmp_df.rename(columns={'Province/State': 'province'}, inplace=True)
+        if 'Last Update' in tmp_df.columns:
+            tmp_df.rename(columns={'Last Update': 'datetime'}, inplace=True)
+        if 'Suspected' in tmp_df.columns:
+            tmp_df = tmp_df.drop(columns='Suspected')
+        for col in tmp_df.columns:
+            tmp_df[col] = tmp_df[col].fillna(0)
+        # Lower-case all column names
+        tmp_df.columns = map(str.lower, tmp_df.columns)
+        return tmp_df
+
+    def clean_last_updates(self, last_update):
+        # Shared step, carried over from strategy.py: normalise to 'YYYY-MM-DD HH:MM:SS'
+        date = parse(str(last_update).split(' ')[0]).strftime('%Y-%m-%d')
+        time = parse(str(last_update).split(' ')[1]).strftime('%H:%M:%S')
+        return str(date) + ' ' + str(time)
+
+    def drop_duplicates(self, df_raw):
+        # Shared step, carried over from strategy.py: latest record per province per day
+        days_list = []
+        for day in df_raw.date.unique():
+            tmp_df = df_raw[df_raw.date == day]
+            tmp_df = tmp_df.sort_values(['Last Update']).drop_duplicates('Province/State', keep='last')
+            days_list.append(tmp_df)
+        return days_list
+
+    @abstractmethod
+    def get_data(self, cleaned_sheets):
+        pass
+
+
+class Data_prep3(DataPrep):
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        # Import all CSVs
+        for file in sorted(cleaned_sheets):
+            if 'csv' in file:
+                print('...', file)
+                tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None,
+                                     header=0, parse_dates=['Last Update'])
+                tmp_df = tmp_df[keep_cols]
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0)
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].astype(int)
+                tmp_df['Province/State'].fillna(tmp_df['Country/Region'], inplace=True)
+                tmp_df['Last Update'] = tmp_df['Last Update'].apply(self.clean_last_updates)
+                tmp_df['date'] = tmp_df['Last Update'].apply(self.get_date)
+                tmp_df['file_date'] = self.get_csv_date(file)
+                all_csv.append(tmp_df)
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
+        df_raw = df_raw.sort_values(by=['Last Update'])
+        return df_raw
+
+
+class Data_prep4(DataPrep):
+    def get_data(self, cleaned_sheets):
+        all_csv = []
+        for file in tqdm(sorted(cleaned_sheets), desc='... importing data: '):
+            if 'csv' in file:
+                tmp_df = pd.read_csv(os.path.join(DATA, file), index_col=None,
+                                     header=0, parse_dates=['Last Update'])
+                tmp_df = tmp_df[keep_cols]
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].fillna(0)
+                tmp_df[numeric_cols] = tmp_df[numeric_cols].astype(int)
+                tmp_df['Province/State'].fillna(tmp_df['Country/Region'], inplace=True)
+                tmp_df['Last Update'] = tmp_df['Last Update'].apply(self.clean_last_updates)
+                tmp_df['date'] = tmp_df['Last Update'].apply(self.get_date)
+                tmp_df['file_date'] = self.get_csv_date(file)
+                all_csv.append(tmp_df)
+        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
+        df_raw = df_raw.sort_values(by=['Last Update'])
+        frames = self.drop_duplicates(df_raw)
+        tmp = pd.concat(frames, axis=0, ignore_index=True, sort=True)
+        return tmp
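To close the loop, a short sketch of the template in action (assuming template_method.py as above and a local checkout of the JHU repo under DATA):

    processor = Data_prep4()
    sheets = processor.clean_sheet_names(os.listdir(DATA))  # subclass hook
    df = processor.get_data(sheets)                         # subclass-specific step

The invariant steps (get_date, get_csv_date, clone_repo) live once in DataProcessor, while each subclass overrides only the varying clean/get steps — the defining trait of the template method pattern.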