Add tests and testdata

cchuong · cchuong · commit b2e50134147b · 2025-03-07T10:52:50.000-08:00
diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -19,3 +19,6 @@ selenium==4.7.2
 sqlalchemy-stubs>=0.3
 tenacity==7.0.0
 xlrd==2.0.1
+bs4
+mock
+requests_file
diff --git a/src/acquisition/rvdss/utils.py b/src/acquisition/rvdss/utils.py
@@ -14,16 +14,19 @@
     )
 
 def abbreviate_virus(full_name):
+    """Abbreviate virus names and make the result lowercase."""
+    
     lowercase=full_name.lower()
     keys = (re.escape(k) for k in VIRUSES.keys())
     pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b')
     result = pattern.sub(lambda x: VIRUSES[x.group()], lowercase)
     return(result)
 
 def abbreviate_geo(full_name):
+    """Abbreviate provincial geo_values and make spelling consistent (e.g. lowercasing, removing punctuation and extra spaces)."""
     lowercase=full_name.lower()
     lowercase = re.sub("province of ","",lowercase)
-    lowercase=re.sub("\.|\*","",lowercase)
+    lowercase=re.sub(r"\.|\*","",lowercase)
     lowercase=re.sub("/territoires","",lowercase)
     lowercase=re.sub("^cana$","can",lowercase)
     lowercase =lowercase.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation),'.'+"'"))
@@ -43,7 +46,8 @@ def abbreviate_geo(full_name):
     return(result)
 
 def create_geo_types(geo,default_geo):
-    if geo in NATION:
+    lowercase_geo = geo.lower()
+    if lowercase_geo in NATION:
         geo_type="nation"
     elif geo in REGIONS:
         geo_type="region"
@@ -88,15 +92,15 @@ def preprocess_table_columns(table):
     Change some naming of locations in columns (i.e at instead of atl)
     """
     table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space
-    table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
-    table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods
+    table.columns = [re.sub(r"(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
+    table.columns =[re.sub(r"\.", "", s)for s in table.columns] #remove periods
     table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all)
-    table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns]
+    table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] # remove ( )
     table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space
-    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _
+    table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns]
     table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _
 
-    table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]
+    table.columns = [re.sub(r"^at\b","atl",t) for t in table.columns]
     table.columns = [re.sub("canada","can",t) for t in table.columns]
     table.columns = [re.sub(r"\bcb\b","bc",t) for t in table.columns]
 
@@ -146,7 +150,8 @@ def make_signal_type_spelling_consistent(signal):
     pat4 = 'tested'
     combined_pat2 = '|'.join((pat3, pat4))
 
-    new_signal = re.sub(combined_pat, "positive_tests",signal)
+    new_signal = re.sub("positive tests", "positive_tests",signal)
+    new_signal = re.sub(combined_pat, "positive_tests",new_signal)
     new_signal = re.sub(combined_pat2, "tests",new_signal)
     new_signal =re.sub(" *%", "_pct_positive",new_signal)
     new_signal = re.sub("total ", "",new_signal)
@@ -198,7 +203,7 @@ def get_detections_data(base_url,headers,update_date):
     week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
     week_string = week_df.iloc[0]['Text'].lower()
     current_week = int(re.search("week (.+?) ", week_string).group(1))
-    current_year= int(re.search("20\d{2}", week_string).group(0))
+    current_year= int(re.search(r"20\d{2}", week_string).group(0))
 
     current_epiweek= Week(current_year,current_week)
 
diff --git a/testdata/acquisition/rvdss/RVD_CurrentWeekTable.csv b/testdata/acquisition/rvdss/RVD_CurrentWeekTable.csv
diff --git a/testdata/acquisition/rvdss/RVD_SummaryText.csv b/testdata/acquisition/rvdss/RVD_SummaryText.csv
@@ -0,0 +1,25 @@
+Language,Section,Type,Text
+English,summary,title,"Summary of laboratory data for Week 7 (week ending February 15, 2025)"
+English,summary,text,"In week 7 (week ending February 15, 2025) in Canada, percent positivity is currently highest for influenza (26.9% positive) among respiratory viruses under surveillance. The following results were reported from RVDSS laboratories:"
+English,category1,title,Influenza (includes influenza A and B)
+English,category1,listitem1,"Influenza percent positivity continues to increase (11,790 detections; 26.9% positive)."
+English,category2,title,SARS-CoV-2 (the virus which causes COVID-19)
+English,category2,listitem1,"National SARS-CoV-2 percent positivity continues to decrease (1,750 detections; 4.0% positive)."
+English,category3,title,RSV (respiratory syncytial virus)
+English,category3,listitem1,"National RSV percent positivity continues to decrease (1,938 detections; 4.9% positive)."
+English,category4,title,Other respiratory viruses
+English,category4,listitem1,Percent positivity of all other respiratory viruses is following historically observed trends.
+English,category5,title,Number of reporting laboratories
+English,category5,listitem1,34 out of 35 laboratories reported surveillance data.
+French,summary,title,Résumé des données de laboratoire pour la semaine 7 (semaine se terminant le 15 février 2025)
+French,summary,text,"Au cours de la semaine 7 (se terminant le 15 février 2025) au Canada, le pourcentage de positivité est actuellement le plus élevé pour la grippe (26,9 % positifs) parmi les virus respiratoires sous surveillance. Les résultats suivants ont été rapportés par les laboratoires du SSDVR :"
+French,category1,title,La grippe (incluant la grippe A et B)
+French,category1,listitem1,"Le pourcentage de positivité pour la grippe continue d’augmenter (11 790 détections; 26,9 % positifs)."
+French,category2,title,SRAS-CoV-2 (le virus à l’origine de la COVID-19)
+French,category2,listitem1,"Le pourcentage de positivité national pour le SRAS-CoV-2 continue de diminuer (1 750 détections; 4,0 % positifs)."
+French,category3,title,Le VRS (virus respiratoire syncytial)
+French,category3,listitem1,"Le pourcentage de positivité national du VRS continue de diminuer (1 938 détections; 4,9 % positifs)."
+French,category4,title,Autres virus respiratoires
+French,category4,listitem1,Le pourcentage de positivité de tous les autres virus respiratoires suit les tendances historiques observées.
+French,category5,title,Nombre de laboratoires déclarants
+French,category5,listitem1,Nombre de laboratoires qui ont fait état de la situation : 34 sur 35
diff --git a/testdata/acquisition/rvdss/RVD_UpdateDate.csv b/testdata/acquisition/rvdss/RVD_UpdateDate.csv
@@ -0,0 +1 @@
+2/20/2025 10:28:16
diff --git a/testdata/acquisition/rvdss/example_update_dates.txt b/testdata/acquisition/rvdss/example_update_dates.txt
@@ -0,0 +1,2 @@
+2025-02-14
+2023-09-01
diff --git a/tests/acquisition/rvdss/test_pull_historic.py b/tests/acquisition/rvdss/test_pull_historic.py
@@ -1,6 +1,13 @@
 """Unit tests for rvdss/pull_historic.py."""
 
 import pytest
+import mock
+
+from delphi.epidata.acquisition.rvdss.pull_historic import (get_report_season_years, add_https_prefix, 
+construct_weekly_report_urls, report_weeks, get_report_date, extract_captions_of_interest, get_modified_dates,
+deduplicate_rows, drop_ah1_columns, create_detections_table, create_number_detections_table, 
+create_percent_positive_detection_table, fetch_one_season_from_report, fetch_archived_dashboard_dates, 
+fetch_report_data, fetch_historical_dashboard_data)
 
 # py3tester coverage target
 __test_target__ = "delphi.epidata.acquisition.rvdss.pull_historic"
@@ -11,3 +18,55 @@ class TestPullHistoric():
     def test_syntax(self):
         """This no-op test ensures that syntax is valid."""
         pass
+    
+    def test_get_report_season_years(self):
+        pass
+
+    def test_add_https_prefix(self):
+       # assert add_https_prefix(["/random.html"]) == "https://www.canada.ca/random.html"
+       # assert add_https_prefix(["http://randomurl2.html"]) == "https://randomurl2.html"
+       # assert add_https_prefix(["https://randomurl3.html"]) == "https://randomurl3.html"
+       pass
+   
+    def test_construct_weekly_report_urls(self):
+        pass
+
+    def test_report_weeks(self):
+        pass
+
+    def test_get_report_date(self):
+        pass
+
+    def test_extract_captions_of_interest(self):
+        pass
+        
+    def test_get_modified_dates(self):
+        pass
+
+    def test_deduplicate_rows(self):
+        pass
+
+    def test_drop_ah1_columns(self):
+        pass
+
+    def test_create_detections_table(self):
+        pass
+
+    def test_create_number_detections_table(self):
+        pass
+
+    def test_create_percent_positive_detection_table(self):
+        pass
+
+    def test_fetch_one_season_from_report(self):
+        pass
+        
+    def test_fetch_archived_dashboard_dates(self):
+        pass
+
+    def test_fetch_report_data(self):
+        pass
+
+    def test_fetch_historical_dashboard_data(self):
+        pass
+        
diff --git a/tests/acquisition/rvdss/test_utils.py b/tests/acquisition/rvdss/test_utils.py
@@ -1,24 +1,187 @@
 """Unit tests for rvdss/utils.py."""
 
 import pytest
+import mock
+import requests
+from requests_file import FileAdapter
+from pathlib import Path
+import pandas as pd
 
-from delphi.epidata.acquisition.rvdss.utils import abbreviate_virus, create_geo_types
+from delphi.epidata.acquisition.rvdss.utils import (abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
+get_dashboard_update_date, check_most_recent_update_date, preprocess_table_columns, add_flu_prefix, 
+make_signal_type_spelling_consistent, get_positive_data, get_detections_data, fetch_dashboard_data) 
 
 # py3tester coverage target
 __test_target__ = "delphi.epidata.acquisition.rvdss.utils"
 
+example_unprocessed_data = [
+    pd.DataFrame({'Reporting\xa0Laboratories':1},index=[0]),
+    pd.DataFrame({'lab':1,'lab.2':2},index=[0]),
+    pd.DataFrame({'Reporting.lab':1},index=[0]),
+    pd.DataFrame({'flucounts (all)':2},index=[0]),
+    pd.DataFrame({'fluah1 (2009)':2},index=[0]),
+    pd.DataFrame({'flucounts       s':2},index=[0]),
+    pd.DataFrame({'lab/tech':3},index=[0]),
+    
+    pd.DataFrame({'at counts':1},index=[0]),
+    pd.DataFrame({'canada counts':2},index=[0]),
+    pd.DataFrame({'cb counts':3},index=[0]),
+    
+    pd.DataFrame({'h1n1 2009 ':3},index=[0]),
+    pd.DataFrame({'h1n12009 counts':3},index=[0]), 
+    pd.DataFrame({'a_h1 counts':3},index=[0]),
+    pd.DataFrame({'ah1 counts':3},index=[0]),
+    pd.DataFrame({'a_uns counts':3},index=[0]),
+    pd.DataFrame({'a_h3 counts':3},index=[0]),
+    
+    pd.DataFrame({'parainfluenza a':4,'piv b':4, "para c":4},index=[0]),
+    pd.DataFrame({'adeno a':4, 'adeno b':4},index=[0]),
+    pd.DataFrame({'human metapneumovirus a':4},index=[0]),
+    pd.DataFrame({'enterovirus_rhinovirus a':4,'rhinovirus b':4, "rhv c":4,"entero_rhino d":4,"rhino e":4, "ev_rv f":4},index=[0]),
+    pd.DataFrame({'coronavirus a':4,'coron b':4, "coro c":4},index=[0]),
+    pd.DataFrame({'respiratory syncytial virus a':4},index=[0]),
+    pd.DataFrame({'influenza counts':4},index=[0]),
+    pd.DataFrame({'sars-cov-2 counts':4},index=[0]),
+    
+    pd.DataFrame({"flu a":5,"flu b":5},index=[0]),
+    pd.DataFrame({"flutest p":5},index=[0]),
+    pd.DataFrame({"other hpiv a":5, "other_hpiv count b":5},index=[0]),
+    
+    
+    pd.DataFrame({"flu apositive":6,"flu bpositive":6},index=[0]),
+    pd.DataFrame({"hpiv_1 counts":6,"hpiv_2 counts":6,"hpiv_3 counts":6,"hpiv_4 counts":6},index=[0]),
+    
+    pd.DataFrame({"num positive tests":7},index=[0]),
+    pd.DataFrame({"num positive a":7,"num pos b":7},index=[0]),
+    pd.DataFrame({"num test a":7,"num tested b":7},index=[0]),
+    pd.DataFrame({"virus% a":7,"virus % b":7},index=[0]),
+    pd.DataFrame({"total counts":7},index=[0])
+]
+
+expected_processed_data = [
+    pd.DataFrame({'reporting laboratories':1},index=[0]),
+    pd.DataFrame({'lab':1,'lab2':2},index=[0]).rename(columns={"lab":"lab","lab2":"lab"}),
+    pd.DataFrame({'reportinglab':1},index=[0]),
+    pd.DataFrame({'flucounts ':2},index=[0]),
+    pd.DataFrame({'fluah12009':2},index=[0]),
+    pd.DataFrame({'flucounts s':2},index=[0]),
+    pd.DataFrame({'lab_tech':3},index=[0]),
+    
+    pd.DataFrame({'atl counts':1},index=[0]),
+    pd.DataFrame({'can counts':2},index=[0]),
+    pd.DataFrame({'bc counts':3},index=[0]),
+    
+    pd.DataFrame({'ah1n1pdm09':3},index=[0]),
+    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]), 
+    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]),
+    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]),
+    pd.DataFrame({'auns counts':3},index=[0]),
+    pd.DataFrame({'ah3 counts':3},index=[0]),
+    
+    pd.DataFrame({'hpiv a':4,'hpiv b':4, "hpiv c":4},index=[0]),
+    pd.DataFrame({'adv a':4, 'adv b':4},index=[0]),
+    pd.DataFrame({'hmpv a':4},index=[0]),
+    pd.DataFrame({'evrv a':4,'evrv b':4, "evrv c":4,"evrv d":4,"evrv e":4, "evrv f":4},index=[0]),
+    pd.DataFrame({'hcov a':4,'hcov b':4, "hcov c":4},index=[0]),
+    pd.DataFrame({'rsv a':4},index=[0]),
+    pd.DataFrame({'flu counts':4},index=[0]),
+    pd.DataFrame({'sarscov2 counts':4},index=[0]),
+    
+    pd.DataFrame({"flua":5,"flub":5},index=[0]),
+    pd.DataFrame({"flu tests p":5},index=[0]),
+    pd.DataFrame({"hpivother a":5, "hpivother count b":5},index=[0]),
+    
+    pd.DataFrame({"flua_positive_tests":6,"flub_positive_tests":6},index=[0]),
+    pd.DataFrame({"hpiv1 counts":6,"hpiv2 counts":6,"hpiv3 counts":6,"hpiv4 counts":6},index=[0]),
+    
+    pd.DataFrame({"num positive_tests":7},index=[0]),
+    pd.DataFrame({"num positive_tests a":7,"num positive_tests b":7},index=[0]),
+    pd.DataFrame({"num tests a":7,"num tests b":7},index=[0]),
+    pd.DataFrame({"virus_pct_positive a":7,"virus_pct_positive b":7},index=[0]),
+    pd.DataFrame({"counts":7},index=[0])
+]
 
 class TestUtils:
     def test_syntax(self):
         """This no-op test ensures that syntax is valid."""
         pass
 
     def test_abbreviate_virus(self):
-    	assert abbreviate_virus("influenza") == "flu" # normal case
-    	assert abbreviate_virus("flu") == "flu" # already abbreviated
+        assert abbreviate_virus("influenza") == "flu" # normal case
+        assert abbreviate_virus("flu") == "flu" # already abbreviated
+        assert abbreviate_virus("parainfluenza") == "hpiv" 
+        assert abbreviate_virus("banana") == "banana" # non-virus names should remain as is
 
+    def test_abbreviate_geo(self):
+        assert abbreviate_geo("british columbia") == "bc"
+        assert abbreviate_geo("québec") == "qc" # recognise accents in provinces
+        assert abbreviate_geo("Région Nord-Est") == "région nord est" # remove dashes, make lowercase
+        assert abbreviate_geo("P.H.O.L. - Sault Ste. Marie") == "phol sault ste marie"
+        assert abbreviate_geo("random lab") == "random lab" #unknown geos remain unchanged     
+        # only province names on their own should be abbreviated, not as part of a larger name
+        assert abbreviate_geo("british columbia lab") == "british columbia lab"
+        
     def test_create_geo_types(self):
         assert create_geo_types("canada","lab") == "nation"
         assert create_geo_types("bc","lab") == "region"
         assert create_geo_types("random lab","lab") == "lab"
-        assert create_geo_types("Canada","province") == "province" #lowercase handling happens upstream
+        assert create_geo_types("Canada","province") == "nation"
+        
+    def test_check_date_format(self):
+        assert check_date_format("2015-09-05") == "2015-09-05"
+        assert check_date_format("01/10/2020") == "2020-10-01" # change d/m/Y to Y-m-d
+        assert check_date_format("02-11-2013") == "2013-11-02" # change d-m-Y to Y-m-d
+        with pytest.raises(AssertionError):
+            check_date_format("02-2005-10") # Invalid date format raises error
+    
+    @mock.patch("requests.get")
+    def test_get_dashboard_update_date(self, mock_requests):
+        # Set up fake data.
+        headers={}
+        url = "testurl.ca"
+        
+        s = requests.Session()
+        s.mount('file://', FileAdapter())
+
+        TEST_DIR = Path(__file__).parent
+        resp = s.get('file://'+ str(TEST_DIR) + "/RVD_UpdateDate.csv") 
+        
+        # Mocks
+        mock_requests.return_value = resp
+        assert get_dashboard_update_date(url, headers) == "2025-02-20"
+        
+    def test_check_most_recent_update_date(self):
+        TEST_DIR = Path(__file__).parent
+        path = str(TEST_DIR) + "/example_update_dates.txt"
+        
+        assert check_most_recent_update_date("2025-02-14",path) == True #date is in the file
+        assert check_most_recent_update_date("2025-03-20",path) == False #date is not in the file
+        
+    def test_preprocess_table_columns(self):
+        for example, expected in zip(example_unprocessed_data, expected_processed_data):
+            assert preprocess_table_columns(example).equals(expected)
+    
+    def test_add_flu_prefix(self):
+        assert add_flu_prefix("ah3_pos") == "fluah3_pos"
+        assert add_flu_prefix("auns") == "fluauns"
+        assert add_flu_prefix("ah1pdm09 tests") == "fluah1pdm09 tests"
+        assert add_flu_prefix("ah1n1pdm09") == "fluah1n1pdm09"
+        assert add_flu_prefix("fluah1n1pdm09") == "fluah1n1pdm09" #if prefix exists, do nothing
+        assert add_flu_prefix("random string") == "random string" #if the string is not a flu subtype, it should do nothing
+        
+    def test_make_signal_type_spelling_consistent(self):
+        assert make_signal_type_spelling_consistent("positive tests") == "positive_tests"
+        assert make_signal_type_spelling_consistent("flu pos") == "flu positive_tests"
+        assert make_signal_type_spelling_consistent("rsv tested") == "rsv tests"
+        assert make_signal_type_spelling_consistent("covid total tested") == "covid tests"
+        assert make_signal_type_spelling_consistent("flua%") == "flua_pct_positive"
+        
+    
+    def test_get_positive_data(self):
+        pass
+        
+    def test_get_detections_data(self):
+        pass
+    
+    def test_fetch_dashboard_data(self):
+        pass