22import sys
33import datetime
44
5- import json
65import pandas
76import pyarrow
87
1514 bu_internal_invoice ,
1615 pi_specific_invoice ,
1716)
18-
17+ from process_report .processors import (
18+ validate_pi_alias_processor ,
19+ add_institution_processor ,
20+ )
1921
2022### PI file field names
2123PI_PI_FIELD = "PI"
5153ALIAS_S3_FILEPATH = "PIs/alias.csv"
5254
5355
54- def get_institution_from_pi (institute_map , pi_uname ):
55- institution_domain = pi_uname .split ("@" )[- 1 ]
56- for i in range (institution_domain .count ("." ) + 1 ):
57- if institution_name := institute_map .get (institution_domain , "" ):
58- break
59- institution_domain = institution_domain [institution_domain .find ("." ) + 1 :]
60-
61- if institution_name == "" :
62- print (f"Warning: PI name { pi_uname } does not match any institution!" )
63-
64- return institution_name
65-
66-
67- def load_institute_map () -> dict :
68- with open ("process_report/institute_map.json" , "r" ) as f :
69- institute_map = json .load (f )
70-
71- return institute_map
72-
73-
7456def load_alias (alias_file ):
7557 alias_dict = dict ()
7658
@@ -220,15 +202,27 @@ def main():
220202
221203 projects = list (set (projects + timed_projects_list ))
222204
223- merged_dataframe = validate_pi_aliases (merged_dataframe , alias_dict )
224- merged_dataframe = add_institution (merged_dataframe )
205+ ### Preliminary processing
206+
207+ validate_pi_alias_proc = validate_pi_alias_processor .ValidatePIAliasProcessor (
208+ "" , invoice_month , merged_dataframe , alias_dict
209+ )
210+ validate_pi_alias_proc .process ()
211+
212+ add_institute_proc = add_institution_processor .AddInstitutionProcessor (
213+ "" , invoice_month , validate_pi_alias_proc .data
214+ )
215+ add_institute_proc .process ()
216+
217+ ### Finish preliminary processing
218+
225219 lenovo_inv = lenovo_invoice .LenovoInvoice (
226- name = args .Lenovo_file , invoice_month = invoice_month , data = merged_dataframe . copy ()
220+ name = args .Lenovo_file , invoice_month = invoice_month , data = add_institute_proc . data
227221 )
228222 nonbillable_inv = nonbillable_invoice .NonbillableInvoice (
229223 name = args .nonbillable_file ,
230224 invoice_month = invoice_month ,
231- data = merged_dataframe . copy () ,
225+ data = add_institute_proc . data ,
232226 nonbillable_pis = pi ,
233227 nonbillable_projects = projects ,
234228 )
@@ -239,7 +233,7 @@ def main():
239233 billable_inv = billable_invoice .BillableInvoice (
240234 name = args .output_file ,
241235 invoice_month = invoice_month ,
242- data = merged_dataframe .copy (),
236+ data = add_institute_proc . data .copy (),
243237 nonbillable_pis = pi ,
244238 nonbillable_projects = projects ,
245239 old_pi_filepath = old_pi_file ,
@@ -330,13 +324,6 @@ def timed_projects(timed_projects_file, invoice_date):
330324 return dataframe [mask ]["Project" ].to_list ()
331325
332326
333- def validate_pi_aliases (dataframe : pandas .DataFrame , alias_dict : dict ):
334- for pi , pi_aliases in alias_dict .items ():
335- dataframe .loc [dataframe [PI_FIELD ].isin (pi_aliases ), PI_FIELD ] = pi
336-
337- return dataframe
338-
339-
340327def fetch_s3_alias_file ():
341328 local_name = "alias.csv"
342329 invoice_bucket = get_invoice_bucket ()
@@ -356,32 +343,6 @@ def backup_to_s3_old_pi_file(old_pi_file):
356343 invoice_bucket .upload_file (old_pi_file , f"PIs/Archive/PI { get_iso8601_time ()} .csv" )
357344
358345
359- def add_institution (dataframe : pandas .DataFrame ):
360- """Determine every PI's institution name, logging any PI whose institution cannot be determined
361- This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
362- a list of known institution email domains (e.g. bu.edu), or to several edge cases (e.g. rudolph) if
363- the username is not an email address.
364-
365- Exact matches are then mapped to the corresponding institution name.
366-
367- E.g. "foo@bu.edu" would match with "bu.edu", which maps to the institution name "Boston University"
368-
369- The list of mappings is defined in `institute_map.json`.
370- """
371- institute_map = load_institute_map ()
372- dataframe = dataframe .astype ({INSTITUTION_FIELD : "str" })
373- for i , row in dataframe .iterrows ():
374- pi_name = row [PI_FIELD ]
375- if pandas .isna (pi_name ):
376- print (f"Project { row [PROJECT_FIELD ]} has no PI" )
377- else :
378- dataframe .at [i , INSTITUTION_FIELD ] = get_institution_from_pi (
379- institute_map , pi_name
380- )
381-
382- return dataframe
383-
384-
385346def export_billables (dataframe , output_file ):
386347 dataframe .to_csv (output_file , index = False )
387348
0 commit comments