Skip to content

Commit d72d59b

Browse files
committed
Implemented Processor class and refactored some preliminary processing
A `Processor` class has been added, subclassing from the `Invoice` class. This is the first step to refactor invoicing in order to seperate the processing and filtering/exporting functionalities of our current Invoice subclasses. Subclasses of `Processor` should only process invoices and manipulate its internal data, while subclasses of `Invoice` should only perform filtering and exporting, never changing any data itself. In addition to implementing `Processor`, two of its subclasses, `ValidatePIAliasProcessor` and `AddInstitutionProcessor` has been added to perform some preliminary processing steps.
1 parent aacb054 commit d72d59b

8 files changed

+147
-91
lines changed

process_report/invoices/billable_invoice.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class BillableInvoice(discount_invoice.DiscountInvoice):
2222

2323
nonbillable_pis: list[str]
2424
nonbillable_projects: list[str]
25-
25+
2626
export_columns_list = [
2727
invoice.INVOICE_DATE_FIELD,
2828
invoice.PROJECT_FIELD,

process_report/process_report.py

+21-60
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import sys
33
import datetime
44

5-
import json
65
import pandas
76
import pyarrow
87

@@ -15,7 +14,10 @@
1514
bu_internal_invoice,
1615
pi_specific_invoice,
1716
)
18-
17+
from process_report.processors import (
18+
validate_pi_alias_processor,
19+
add_institution_processor,
20+
)
1921

2022
### PI file field names
2123
PI_PI_FIELD = "PI"
@@ -51,26 +53,6 @@
5153
ALIAS_S3_FILEPATH = "PIs/alias.csv"
5254

5355

54-
def get_institution_from_pi(institute_map, pi_uname):
55-
institution_domain = pi_uname.split("@")[-1]
56-
for i in range(institution_domain.count(".") + 1):
57-
if institution_name := institute_map.get(institution_domain, ""):
58-
break
59-
institution_domain = institution_domain[institution_domain.find(".") + 1 :]
60-
61-
if institution_name == "":
62-
print(f"Warning: PI name {pi_uname} does not match any institution!")
63-
64-
return institution_name
65-
66-
67-
def load_institute_map() -> dict:
68-
with open("process_report/institute_map.json", "r") as f:
69-
institute_map = json.load(f)
70-
71-
return institute_map
72-
73-
7456
def load_alias(alias_file):
7557
alias_dict = dict()
7658

@@ -220,15 +202,27 @@ def main():
220202

221203
projects = list(set(projects + timed_projects_list))
222204

223-
merged_dataframe = validate_pi_aliases(merged_dataframe, alias_dict)
224-
merged_dataframe = add_institution(merged_dataframe)
205+
### Preliminary processing
206+
207+
validate_pi_alias_proc = validate_pi_alias_processor.ValidatePIAliasProcessor(
208+
"", invoice_month, merged_dataframe, alias_dict
209+
)
210+
validate_pi_alias_proc.process()
211+
212+
add_institute_proc = add_institution_processor.AddInstitutionProcessor(
213+
"", invoice_month, validate_pi_alias_proc.data
214+
)
215+
add_institute_proc.process()
216+
217+
### Finish preliminary processing
218+
225219
lenovo_inv = lenovo_invoice.LenovoInvoice(
226-
name=args.Lenovo_file, invoice_month=invoice_month, data=merged_dataframe.copy()
220+
name=args.Lenovo_file, invoice_month=invoice_month, data=add_institute_proc.data
227221
)
228222
nonbillable_inv = nonbillable_invoice.NonbillableInvoice(
229223
name=args.nonbillable_file,
230224
invoice_month=invoice_month,
231-
data=merged_dataframe.copy(),
225+
data=add_institute_proc.data,
232226
nonbillable_pis=pi,
233227
nonbillable_projects=projects,
234228
)
@@ -239,7 +233,7 @@ def main():
239233
billable_inv = billable_invoice.BillableInvoice(
240234
name=args.output_file,
241235
invoice_month=invoice_month,
242-
data=merged_dataframe.copy(),
236+
data=add_institute_proc.data.copy(),
243237
nonbillable_pis=pi,
244238
nonbillable_projects=projects,
245239
old_pi_filepath=old_pi_file,
@@ -330,13 +324,6 @@ def timed_projects(timed_projects_file, invoice_date):
330324
return dataframe[mask]["Project"].to_list()
331325

332326

333-
def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
334-
for pi, pi_aliases in alias_dict.items():
335-
dataframe.loc[dataframe[PI_FIELD].isin(pi_aliases), PI_FIELD] = pi
336-
337-
return dataframe
338-
339-
340327
def fetch_s3_alias_file():
341328
local_name = "alias.csv"
342329
invoice_bucket = get_invoice_bucket()
@@ -356,32 +343,6 @@ def backup_to_s3_old_pi_file(old_pi_file):
356343
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")
357344

358345

359-
def add_institution(dataframe: pandas.DataFrame):
360-
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
361-
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
362-
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
363-
the username is not an email address.
364-
365-
Exact matches are then mapped to the corresponding institution name.
366-
367-
I.e "[email protected]" would match with "bu.edu", which maps to the instition name "Boston University"
368-
369-
The list of mappings are defined in `institute_map.json`.
370-
"""
371-
institute_map = load_institute_map()
372-
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
373-
for i, row in dataframe.iterrows():
374-
pi_name = row[PI_FIELD]
375-
if pandas.isna(pi_name):
376-
print(f"Project {row[PROJECT_FIELD]} has no PI")
377-
else:
378-
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
379-
institute_map, pi_name
380-
)
381-
382-
return dataframe
383-
384-
385346
def export_billables(dataframe, output_file):
386347
dataframe.to_csv(output_file, index=False)
387348

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from dataclasses import dataclass
2+
import json
3+
4+
import pandas
5+
6+
from process_report.invoices import invoice
7+
from process_report.processors import processor
8+
9+
10+
@dataclass
11+
class AddInstitutionProcessor(processor.Processor):
12+
@staticmethod
13+
def _load_institute_map() -> dict:
14+
with open("process_report/institute_map.json", "r") as f:
15+
institute_map = json.load(f)
16+
17+
return institute_map
18+
19+
@staticmethod
20+
def _get_institution_from_pi(institute_map, pi_uname):
21+
institution_domain = pi_uname.split("@")[-1]
22+
for i in range(institution_domain.count(".") + 1):
23+
if institution_name := institute_map.get(institution_domain, ""):
24+
break
25+
institution_domain = institution_domain[institution_domain.find(".") + 1 :]
26+
27+
if institution_name == "":
28+
print(f"Warning: PI name {pi_uname} does not match any institution!")
29+
30+
return institution_name
31+
32+
def _add_institution(self, dataframe: pandas.DataFrame):
33+
"""Determine every PI's institution name, logging any PI whose institution cannot be determined
34+
This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
35+
a list of known institution email domains (i.e bu.edu), or to several edge cases (i.e rudolph) if
36+
the username is not an email address.
37+
38+
Exact matches are then mapped to the corresponding institution name.
39+
40+
I.e "[email protected]" would match with "bu.edu", which maps to the instition name "Boston University"
41+
42+
The list of mappings are defined in `institute_map.json`.
43+
"""
44+
institute_map = self._load_institute_map()
45+
dataframe = dataframe.astype({invoice.INSTITUTION_FIELD: "str"})
46+
for i, row in dataframe.iterrows():
47+
pi_name = row[invoice.PI_FIELD]
48+
if pandas.isna(pi_name):
49+
print(f"Project {row[invoice.PROJECT_FIELD]} has no PI")
50+
else:
51+
dataframe.at[
52+
i, invoice.INSTITUTION_FIELD
53+
] = self._get_institution_from_pi(institute_map, pi_name)
54+
55+
return dataframe
56+
57+
def _process(self):
58+
self.data = self._add_institution(self.data)
+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from dataclasses import dataclass
2+
3+
from process_report.invoices import invoice
4+
5+
6+
@dataclass
7+
class Processor(invoice.Invoice):
8+
pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from dataclasses import dataclass
2+
3+
import pandas
4+
5+
from process_report.invoices import invoice
6+
from process_report.processors import processor
7+
8+
9+
@dataclass
10+
class ValidatePIAliasProcessor(processor.Processor):
11+
alias_map: dict
12+
13+
@staticmethod
14+
def _validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
15+
for pi, pi_aliases in alias_dict.items():
16+
dataframe.loc[
17+
dataframe[invoice.PI_FIELD].isin(pi_aliases), invoice.PI_FIELD
18+
] = pi
19+
20+
return dataframe
21+
22+
def _process(self):
23+
self.data = self._validate_pi_aliases(self.data, self.alias_map)

process_report/tests/unit_tests.py

+15-12
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def test_export_pi(self):
219219
self.assertNotIn("ProjectC", pi_df["Project - Allocation"].tolist())
220220

221221

222-
class TestGetInstitute(TestCase):
222+
class TestAddInstituteProcessor(TestCase):
223223
def test_get_pi_institution(self):
224224
institute_map = {
225225
"harvard.edu": "Harvard University",
@@ -248,31 +248,34 @@ def test_get_pi_institution(self):
248248
"[email protected]": "Beth Israel Deaconess Medical Center",
249249
}
250250

251+
add_institute_proc = test_utils.new_add_institution_processor()
252+
251253
for pi_email, answer in answers.items():
252254
self.assertEqual(
253-
process_report.get_institution_from_pi(institute_map, pi_email), answer
255+
add_institute_proc._get_institution_from_pi(institute_map, pi_email),
256+
answer,
254257
)
255258

256259

257-
class TestAlias(TestCase):
258-
def setUp(self):
259-
self.alias_dict = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}
260-
261-
self.data = pandas.DataFrame(
260+
class TestValidateAliasProcessor(TestCase):
261+
def test_validate_alias(self):
262+
alias_map = {"PI1": ["PI1_1", "PI1_2"], "PI2": ["PI2_1"]}
263+
test_data = pandas.DataFrame(
262264
{
263265
"Manager (PI)": ["PI1", "PI1_1", "PI1_2", "PI2_1", "PI2_1"],
264266
}
265267
)
266-
267-
self.answer = pandas.DataFrame(
268+
answer_data = pandas.DataFrame(
268269
{
269270
"Manager (PI)": ["PI1", "PI1", "PI1", "PI2", "PI2"],
270271
}
271272
)
272273

273-
def test_validate_alias(self):
274-
output = process_report.validate_pi_aliases(self.data, self.alias_dict)
275-
self.assertTrue(self.answer.equals(output))
274+
validate_pi_alias_proc = test_utils.new_validate_pi_alias_processor(
275+
data=test_data, alias_map=alias_map
276+
)
277+
validate_pi_alias_proc.process()
278+
self.assertTrue(answer_data.equals(validate_pi_alias_proc.data))
276279

277280

278281
class TestMonthUtils(TestCase):

process_report/tests/util.py

+21
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
pi_specific_invoice,
88
)
99

10+
from process_report.processors import (
11+
add_institution_processor,
12+
validate_pi_alias_processor,
13+
)
14+
1015

1116
def new_base_invoice(
1217
name="",
@@ -52,3 +57,19 @@ def new_pi_specific_invoice(
5257
invoice_month,
5358
data,
5459
)
60+
61+
62+
def new_add_institution_processor(
63+
name="",
64+
invoice_month="0000-00",
65+
data=pandas.DataFrame(),
66+
):
67+
return add_institution_processor.AddInstitutionProcessor(name, invoice_month, data)
68+
69+
70+
def new_validate_pi_alias_processor(
71+
name="", invoice_month="0000-00", data=pandas.DataFrame(), alias_map={}
72+
):
73+
return validate_pi_alias_processor.ValidatePIAliasProcessor(
74+
name, invoice_month, data, alias_map
75+
)

process_report/util.py

-18
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import datetime
3-
import json
43
import logging
54
import functools
65

@@ -27,23 +26,6 @@ def get_invoice_bucket():
2726
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))
2827

2928

30-
def get_institution_from_pi(institute_map, pi_uname):
31-
institution_key = pi_uname.split("@")[-1]
32-
institution_name = institute_map.get(institution_key, "")
33-
34-
if institution_name == "":
35-
logger.warn(f"PI name {pi_uname} does not match any institution!")
36-
37-
return institution_name
38-
39-
40-
def load_institute_map() -> dict:
41-
with open("process_report/institute_map.json", "r") as f:
42-
institute_map = json.load(f)
43-
44-
return institute_map
45-
46-
4729
def get_iso8601_time():
4830
return datetime.datetime.now().strftime("%Y%m%dT%H%M%SZ")
4931

0 commit comments

Comments
 (0)