Skip to content

Commit

Permalink
Add tests and testdata
Browse files Browse the repository at this point in the history
  • Loading branch information
cchuong committed Mar 7, 2025
1 parent c0742fc commit b2e5013
Show file tree
Hide file tree
Showing 8 changed files with 1,297 additions and 13 deletions.
3 changes: 3 additions & 0 deletions requirements.dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ selenium==4.7.2
sqlalchemy-stubs>=0.3
tenacity==7.0.0
xlrd==2.0.1
bs4
mock
requests_file
23 changes: 14 additions & 9 deletions src/acquisition/rvdss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,19 @@
)

def abbreviate_virus(full_name):
    """Return ``full_name`` lowercased, with known virus names abbreviated.

    Every whole-word occurrence of a key of ``VIRUSES`` in the lowercased
    input is replaced by the value stored under that key. Text that matches
    no key is returned unchanged apart from the lowercasing.
    """
    lowered = full_name.lower()

    # Escape each key so it is matched literally, then join them into a
    # single alternation anchored on word boundaries.
    escaped_names = [re.escape(name) for name in VIRUSES.keys()]
    matcher = re.compile(r'\b(' + '|'.join(escaped_names) + r')\b')

    def _to_abbreviation(match):
        # The matched text is one of the keys, so it can be looked up directly.
        return VIRUSES[match.group()]

    return matcher.sub(_to_abbreviation, lowered)

def abbreviate_geo(full_name):
"""Abbreviate provincial geo_values and make spelling consistent (i.e. removing extra spaces)"""
lowercase=full_name.lower()
lowercase = re.sub("province of ","",lowercase)
lowercase=re.sub("\.|\*","",lowercase)
lowercase=re.sub(r"\.|\*","",lowercase)
lowercase=re.sub("/territoires","",lowercase)
lowercase=re.sub("^cana$","can",lowercase)
lowercase =lowercase.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation),'.'+"'"))
Expand All @@ -43,7 +46,8 @@ def abbreviate_geo(full_name):
return(result)

def create_geo_types(geo,default_geo):
if geo in NATION:
lowercase_geo = geo.lower()
if lowercase_geo in NATION:
geo_type="nation"
elif geo in REGIONS:
geo_type="region"
Expand Down Expand Up @@ -88,15 +92,15 @@ def preprocess_table_columns(table):
Change some naming of locations in columns (i.e at instead of atl)
"""
table.columns = [re.sub("\xa0"," ", col) for col in table.columns] # \xa0 to space
table.columns = [re.sub("(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
table.columns =[re.sub("\.", "", s)for s in table.columns] #remove periods
table.columns = [re.sub(r"(.*?)(\.\d+)", "\\1", c) for c in table.columns] # remove .# for duplicated columns
table.columns =[re.sub(r"\.", "", s)for s in table.columns] #remove periods
table.columns =[re.sub(r"\((all)\)", "", s)for s in table.columns] # remove (all)
table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns]
table.columns =[re.sub(r"\s*\(|\)", "", s)for s in table.columns] # remove ( )
table.columns = [re.sub(' +', ' ', col) for col in table.columns] # Make any muliple spaces into one space
table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns] # replace () for _
table.columns = [re.sub(r'\(|\)', '', col) for col in table.columns]
table.columns = [re.sub(r'/', '_', col) for col in table.columns] # replace / with _

table.columns = [re.sub(r"^at\b","atl ",t) for t in table.columns]
table.columns = [re.sub(r"^at\b","atl",t) for t in table.columns]
table.columns = [re.sub("canada","can",t) for t in table.columns]
table.columns = [re.sub(r"\bcb\b","bc",t) for t in table.columns]

Expand Down Expand Up @@ -146,7 +150,8 @@ def make_signal_type_spelling_consistent(signal):
pat4 = 'tested'
combined_pat2 = '|'.join((pat3, pat4))

new_signal = re.sub(combined_pat, "positive_tests",signal)
new_signal = re.sub("positive tests", "positive_tests",signal)
new_signal = re.sub(combined_pat, "positive_tests",new_signal)
new_signal = re.sub(combined_pat2, "tests",new_signal)
new_signal =re.sub(" *%", "_pct_positive",new_signal)
new_signal = re.sub("total ", "",new_signal)
Expand Down Expand Up @@ -198,7 +203,7 @@ def get_detections_data(base_url,headers,update_date):
week_df = summary_df[(summary_df['Section'] == "summary") & (summary_df['Type']=="title")]
week_string = week_df.iloc[0]['Text'].lower()
current_week = int(re.search("week (.+?) ", week_string).group(1))
current_year= int(re.search("20\d{2}", week_string).group(0))
current_year= int(re.search(r"20\d{2}", week_string).group(0))

current_epiweek= Week(current_year,current_week)

Expand Down
1,026 changes: 1,026 additions & 0 deletions testdata/acquisition/rvdss/RVD_CurrentWeekTable.csv

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions testdata/acquisition/rvdss/RVD_SummaryText.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Language,Section,Type,Text
English,summary,title,"Summary of laboratory data for Week 7 (week ending February 15, 2025)"
English,summary,text,"In week 7 (week ending February 15, 2025) in Canada, percent positivity is currently highest for influenza (26.9% positive) among respiratory viruses under surveillance. The following results were reported from RVDSS laboratories:"
English,category1,title,Influenza (includes influenza A and B)
English,category1,listitem1,"Influenza percent positivity continues to increase (11,790 detections; 26.9% positive)."
English,category2,title,SARS-CoV-2 (the virus which causes COVID-19)
English,category2,listitem1,"National SARS-CoV-2 percent positivity continues to decrease (1,750 detections; 4.0% positive)."
English,category3,title,RSV (respiratory syncytial virus)
English,category3,listitem1,"National RSV percent positivity continues to decrease (1,938 detections; 4.9% positive)."
English,category4,title,Other respiratory viruses
English,category4,listitem1,Percent positivity of all other respiratory viruses is following historically observed trends.
English,category5,title,Number of reporting laboratories
English,category5,listitem1,34 out of 35 laboratories reported surveillance data.
French,summary,title,Résumé des données de laboratoire pour la semaine 7 (semaine se terminant le 15 février 2025)
French,summary,text,"Au cours de la semaine 7 (se terminant le 15 février 2025) au Canada, le pourcentage de positivité est actuellement le plus élevé pour la grippe (26,9 % positifs) parmi les virus respiratoires sous surveillance. Les résultats suivants ont été rapportés par les laboratoires du SSDVR :"
French,category1,title,La grippe (incluant la grippe A et B)
French,category1,listitem1,"Le pourcentage de positivité pour la grippe continue d’augmenter (11 790 détections; 26,9 % positifs)."
French,category2,title,SRAS-CoV-2 (le virus à l’origine de la COVID-19)
French,category2,listitem1,"Le pourcentage de positivité national pour le SRAS-CoV-2 continue de diminuer (1 750 détections; 4,0 % positifs)."
French,category3,title,Le VRS (virus respiratoire syncytial)
French,category3,listitem1,"Le pourcentage de positivité national du VRS continue de diminuer (1 938 détections; 4,9 % positifs)."
French,category4,title,Autres virus respiratoires
French,category4,listitem1,Le pourcentage de positivité de tous les autres virus respiratoires suit les tendances historiques observées.
French,category5,title,Nombre de laboratoires déclarants
French,category5,listitem1,Nombre de laboratoires qui ont fait état de la situation : 34 sur 35
1 change: 1 addition & 0 deletions testdata/acquisition/rvdss/RVD_UpdateDate.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2/20/2025 10:28:16
2 changes: 2 additions & 0 deletions testdata/acquisition/rvdss/example_update_dates.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
2025-02-14
2023-09-01
59 changes: 59 additions & 0 deletions tests/acquisition/rvdss/test_pull_historic.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""Unit tests for rvdss/pull_historic.py."""

import pytest
import mock

from delphi.epidata.acquisition.rvdss.pull_historic import (get_report_season_years, add_https_prefix,
construct_weekly_report_urls, report_weeks, get_report_date, extract_captions_of_interest, get_modified_dates,
deduplicate_rows, drop_ah1_columns, create_detections_table, create_number_detections_table,
create_percent_positive_detection_table, fetch_one_season_from_report, fetch_archived_dashboard_dates,
fetch_report_data, fetch_historical_dashboard_data)

# py3tester coverage target
__test_target__ = "delphi.epidata.acquisition.rvdss.pull_historic"
Expand All @@ -11,3 +18,55 @@ class TestPullHistoric():
def test_syntax(self):
    """No-op test: it passes as long as this test module's syntax is valid."""
    pass

def test_get_report_season_years(self):
    """Placeholder: no assertions written yet for get_report_season_years."""
    pass

def test_add_https_prefix(self):
    """Placeholder: the assertions for add_https_prefix are disabled below."""
    # TODO: enable once the expected values are confirmed. Each call passes a
    # list but is compared against a plain string -- verify what
    # add_https_prefix actually returns before turning these on.
    # assert add_https_prefix(["/random.html"]) == "https://www.canada.ca/random.html"
    # assert add_https_prefix(["http://randomurl2.html"]) == "https://randomurl2.html"
    # assert add_https_prefix(["https://randomurl3.html"]) == "https://randomurl3.html"
    pass

def test_construct_weekly_report_urls(self):
    """Placeholder: no assertions written yet for construct_weekly_report_urls."""
    pass

def test_report_weeks(self):
    """Placeholder: no assertions written yet for report_weeks."""
    pass

def test_get_report_date(self):
    """Placeholder: no assertions written yet for get_report_date."""
    pass

def test_extract_captions_of_interest(self):
    """Placeholder: no assertions written yet for extract_captions_of_interest."""
    pass

def test_get_modified_dates(self):
    """Placeholder: no assertions written yet for get_modified_dates."""
    pass

def test_deduplicate_rows(self):
    """Placeholder: no assertions written yet for deduplicate_rows."""
    pass

def test_drop_ah1_columns(self):
    """Placeholder: no assertions written yet for drop_ah1_columns."""
    pass

def test_create_detections_table(self):
    """Placeholder: no assertions written yet for create_detections_table."""
    pass

def test_create_number_detections_table(self):
    """Placeholder: no assertions written yet for create_number_detections_table."""
    pass

def test_create_percent_positive_detection_table(self):
    """Placeholder: no assertions written yet for create_percent_positive_detection_table."""
    pass

def test_fetch_one_season_from_report(self):
    """Placeholder: no assertions written yet for fetch_one_season_from_report."""
    pass

def test_fetch_archived_dashboard_dates(self):
    """Placeholder: no assertions written yet for fetch_archived_dashboard_dates."""
    pass

def test_fetch_report_data(self):
    """Placeholder: no assertions written yet for fetch_report_data."""
    pass

def test_fetch_historical_dashboard_data(self):
    """Placeholder: no assertions written yet for fetch_historical_dashboard_data."""
    pass

171 changes: 167 additions & 4 deletions tests/acquisition/rvdss/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,187 @@
"""Unit tests for rvdss/utils.py."""

import pytest
import mock
import requests
from requests_file import FileAdapter
from pathlib import Path
import pandas as pd

from delphi.epidata.acquisition.rvdss.utils import abbreviate_virus, create_geo_types
from delphi.epidata.acquisition.rvdss.utils import (abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
get_dashboard_update_date, check_most_recent_update_date, preprocess_table_columns, add_flu_prefix,
make_signal_type_spelling_consistent, get_positive_data, get_detections_data, fetch_dashboard_data)

# py3tester coverage target
__test_target__ = "delphi.epidata.acquisition.rvdss.utils"

# Inputs for test_preprocess_table_columns: each entry is a one-row DataFrame
# whose column names are the raw strings to be cleaned. Entries are paired by
# position with expected_processed_data, so both lists must stay in the same
# order and have the same length.
example_unprocessed_data = [
    # generic clean-up: non-breaking space, duplicate-column suffix, periods,
    # "(all)", parentheses and slashes
    pd.DataFrame({'Reporting\xa0Laboratories':1},index=[0]),
    pd.DataFrame({'lab':1,'lab.2':2},index=[0]),
    pd.DataFrame({'Reporting.lab':1},index=[0]),
    pd.DataFrame({'flucounts (all)':2},index=[0]),
    pd.DataFrame({'fluah1 (2009)':2},index=[0]),
    pd.DataFrame({'flucounts s':2},index=[0]),
    pd.DataFrame({'lab/tech':3},index=[0]),

    # location spellings: at, canada, cb
    pd.DataFrame({'at counts':1},index=[0]),
    pd.DataFrame({'canada counts':2},index=[0]),
    pd.DataFrame({'cb counts':3},index=[0]),

    # influenza A subtype spellings
    pd.DataFrame({'h1n1 2009 ':3},index=[0]),
    pd.DataFrame({'h1n12009 counts':3},index=[0]),
    pd.DataFrame({'a_h1 counts':3},index=[0]),
    pd.DataFrame({'ah1 counts':3},index=[0]),
    pd.DataFrame({'a_uns counts':3},index=[0]),
    pd.DataFrame({'a_h3 counts':3},index=[0]),

    # full or partial virus names that should collapse to one abbreviation
    pd.DataFrame({'parainfluenza a':4,'piv b':4, "para c":4},index=[0]),
    pd.DataFrame({'adeno a':4, 'adeno b':4},index=[0]),
    pd.DataFrame({'human metapneumovirus a':4},index=[0]),
    pd.DataFrame({'enterovirus_rhinovirus a':4,'rhinovirus b':4, "rhv c":4,"entero_rhino d":4,"rhino e":4, "ev_rv f":4},index=[0]),
    pd.DataFrame({'coronavirus a':4,'coron b':4, "coro c":4},index=[0]),
    pd.DataFrame({'respiratory syncytial virus a':4},index=[0]),
    pd.DataFrame({'influenza counts':4},index=[0]),
    pd.DataFrame({'sars-cov-2 counts':4},index=[0]),

    # flu type and "other hpiv" naming variants
    pd.DataFrame({"flu a":5,"flu b":5},index=[0]),
    pd.DataFrame({"flutest p":5},index=[0]),
    pd.DataFrame({"other hpiv a":5, "other_hpiv count b":5},index=[0]),


    # "positive" glued onto the flu type, and numbered hpiv types
    pd.DataFrame({"flu apositive":6,"flu bpositive":6},index=[0]),
    pd.DataFrame({"hpiv_1 counts":6,"hpiv_2 counts":6,"hpiv_3 counts":6,"hpiv_4 counts":6},index=[0]),

    # signal-type spellings: positive tests, tests, percent sign, "total"
    pd.DataFrame({"num positive tests":7},index=[0]),
    pd.DataFrame({"num positive a":7,"num pos b":7},index=[0]),
    pd.DataFrame({"num test a":7,"num tested b":7},index=[0]),
    pd.DataFrame({"virus% a":7,"virus % b":7},index=[0]),
    pd.DataFrame({"total counts":7},index=[0])
]

# Expected results for test_preprocess_table_columns: entry i is the frame
# that preprocess_table_columns should produce for example_unprocessed_data[i].
# Keep this list in the same order and of the same length as that one.
expected_processed_data = [
    # generic clean-up results
    pd.DataFrame({'reporting laboratories':1},index=[0]),
    # the rename maps both columns to the same name, "lab"
    pd.DataFrame({'lab':1,'lab2':2},index=[0]).rename(columns={"lab":"lab","lab2":"lab"}),
    pd.DataFrame({'reportinglab':1},index=[0]),
    pd.DataFrame({'flucounts ':2},index=[0]),
    pd.DataFrame({'fluah12009':2},index=[0]),
    pd.DataFrame({'flucounts s':2},index=[0]),
    pd.DataFrame({'lab_tech':3},index=[0]),

    # location spellings: atl, can, bc
    pd.DataFrame({'atl counts':1},index=[0]),
    pd.DataFrame({'can counts':2},index=[0]),
    pd.DataFrame({'bc counts':3},index=[0]),

    # influenza A subtype spellings
    pd.DataFrame({'ah1n1pdm09':3},index=[0]),
    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]),
    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]),
    pd.DataFrame({'ah1n1pdm09 counts':3},index=[0]),
    pd.DataFrame({'auns counts':3},index=[0]),
    pd.DataFrame({'ah3 counts':3},index=[0]),

    # virus abbreviations
    pd.DataFrame({'hpiv a':4,'hpiv b':4, "hpiv c":4},index=[0]),
    pd.DataFrame({'adv a':4, 'adv b':4},index=[0]),
    pd.DataFrame({'hmpv a':4},index=[0]),
    pd.DataFrame({'evrv a':4,'evrv b':4, "evrv c":4,"evrv d":4,"evrv e":4, "evrv f":4},index=[0]),
    pd.DataFrame({'hcov a':4,'hcov b':4, "hcov c":4},index=[0]),
    pd.DataFrame({'rsv a':4},index=[0]),
    pd.DataFrame({'flu counts':4},index=[0]),
    pd.DataFrame({'sarscov2 counts':4},index=[0]),

    # flu type and "other hpiv" naming variants
    pd.DataFrame({"flua":5,"flub":5},index=[0]),
    pd.DataFrame({"flu tests p":5},index=[0]),
    pd.DataFrame({"hpivother a":5, "hpivother count b":5},index=[0]),

    # positive-test suffix and numbered hpiv types
    pd.DataFrame({"flua_positive_tests":6,"flub_positive_tests":6},index=[0]),
    pd.DataFrame({"hpiv1 counts":6,"hpiv2 counts":6,"hpiv3 counts":6,"hpiv4 counts":6},index=[0]),

    # signal-type spellings
    pd.DataFrame({"num positive_tests":7},index=[0]),
    pd.DataFrame({"num positive_tests a":7,"num positive_tests b":7},index=[0]),
    pd.DataFrame({"num tests a":7,"num tests b":7},index=[0]),
    pd.DataFrame({"virus_pct_positive a":7,"virus_pct_positive b":7},index=[0]),
    pd.DataFrame({"counts":7},index=[0])
]

class TestUtils:
    """Unit tests for the helper functions in rvdss/utils.py."""

    def test_syntax(self):
        """This no-op test ensures that syntax is valid."""
        pass

    def test_abbreviate_virus(self):
        """Virus names are abbreviated; other strings pass through unchanged."""
        assert abbreviate_virus("influenza") == "flu"  # normal case
        assert abbreviate_virus("flu") == "flu"  # already abbreviated
        assert abbreviate_virus("parainfluenza") == "hpiv"
        assert abbreviate_virus("banana") == "banana"  # non-virus strings should remain as is

    def test_abbreviate_geo(self):
        """Province names are abbreviated and spelling/punctuation is normalised."""
        assert abbreviate_geo("british columbia") == "bc"
        assert abbreviate_geo("québec") == "qc"  # recognise accents in provinces
        assert abbreviate_geo("Région Nord-Est") == "région nord est"  # remove dashes, make lowercase
        assert abbreviate_geo("P.H.O.L. - Sault Ste. Marie") == "phol sault ste marie"
        assert abbreviate_geo("random lab") == "random lab"  # unknown geos remain unchanged
        # only province names on their own should be abbreviated, not as part of a larger name
        assert abbreviate_geo("british columbia lab") == "british columbia lab"

    def test_create_geo_types(self):
        """Geo values map to nation/region, otherwise to the supplied default."""
        assert create_geo_types("canada", "lab") == "nation"
        assert create_geo_types("bc", "lab") == "region"
        assert create_geo_types("random lab", "lab") == "lab"
        # the nation check lowercases its input, so capitalisation must not matter
        assert create_geo_types("Canada", "province") == "nation"

    def test_check_date_format(self):
        """Dates are normalised to Y-m-d; an unrecognised layout raises."""
        assert check_date_format("2015-09-05") == "2015-09-05"
        assert check_date_format("01/10/2020") == "2020-10-01"  # change d/m/Y to Y-m-d
        assert check_date_format("02-11-2013") == "2013-11-02"  # change d-m-Y to Y-m-d
        with pytest.raises(AssertionError):
            check_date_format("02-2005-10")  # Invalid date format raises error

    @mock.patch("requests.get")
    def test_get_dashboard_update_date(self, mock_requests):
        """The update date is read from a mocked ``requests.get`` response."""
        # Set up fake data.
        headers = {}
        url = "testurl.ca"

        # A session with a file:// adapter lets a local file stand in for the
        # HTTP response body.
        session = requests.Session()
        session.mount('file://', FileAdapter())

        # NOTE(review): the fixture is resolved next to this test file, while
        # the commit adds it under testdata/acquisition/rvdss/ -- confirm the
        # test runner places it here.
        test_dir = Path(__file__).parent
        resp = session.get('file://' + str(test_dir) + "/RVD_UpdateDate.csv")

        # Mocks
        mock_requests.return_value = resp
        assert get_dashboard_update_date(url, headers) == "2025-02-20"

    def test_check_most_recent_update_date(self):
        """A date is reported as seen only if it appears in the dates file."""
        test_dir = Path(__file__).parent
        path = str(test_dir) + "/example_update_dates.txt"

        assert check_most_recent_update_date("2025-02-14", path) == True  # date is in the file
        assert check_most_recent_update_date("2025-03-20", path) == False  # date is not in the file

    def test_preprocess_table_columns(self):
        """Each raw example frame is cleaned into its expected counterpart."""
        # zip() stops at the shorter list, so make sure no case is silently dropped
        assert len(example_unprocessed_data) == len(expected_processed_data)
        for example, expected in zip(example_unprocessed_data, expected_processed_data):
            assert preprocess_table_columns(example).equals(expected)

    def test_add_flu_prefix(self):
        """Flu A subtype names gain a ``flu`` prefix; other strings are untouched."""
        assert add_flu_prefix("ah3_pos") == "fluah3_pos"
        assert add_flu_prefix("auns") == "fluauns"
        assert add_flu_prefix("ah1pdm09 tests") == "fluah1pdm09 tests"
        assert add_flu_prefix("ah1n1pdm09") == "fluah1n1pdm09"
        assert add_flu_prefix("fluah1n1pdm09") == "fluah1n1pdm09"  # if prefix exists, do nothing
        assert add_flu_prefix("random string") == "random string"  # if no prefix, it should do nothing

    def test_make_signal_type_spelling_consistent(self):
        """Signal-type spellings are mapped to their canonical forms."""
        assert make_signal_type_spelling_consistent("positive tests") == "positive_tests"
        assert make_signal_type_spelling_consistent("flu pos") == "flu positive_tests"
        assert make_signal_type_spelling_consistent("rsv tested") == "rsv tests"
        assert make_signal_type_spelling_consistent("covid total tested") == "covid tests"
        assert make_signal_type_spelling_consistent("flua%") == "flua_pct_positive"

    def test_get_positive_data(self):
        """Placeholder: no assertions written yet for get_positive_data."""
        pass

    def test_get_detections_data(self):
        """Placeholder: no assertions written yet for get_detections_data."""
        pass

    def test_fetch_dashboard_data(self):
        """Placeholder: no assertions written yet for fetch_dashboard_data."""
        pass

0 comments on commit b2e5013

Please sign in to comment.