2
2
import sys
3
3
import datetime
4
4
5
- import json
6
5
import pandas
7
6
import pyarrow
8
7
15
14
bu_internal_invoice ,
16
15
pi_specific_invoice ,
17
16
)
18
-
17
+ from process_report .processors import (
18
+ validate_pi_alias_processor ,
19
+ add_institution_processor ,
20
+ )
19
21
20
22
### PI file field names
21
23
PI_PI_FIELD = "PI"
51
53
ALIAS_S3_FILEPATH = "PIs/alias.csv"
52
54
53
55
54
- def get_institution_from_pi (institute_map , pi_uname ):
55
- institution_domain = pi_uname .split ("@" )[- 1 ]
56
- for i in range (institution_domain .count ("." ) + 1 ):
57
- if institution_name := institute_map .get (institution_domain , "" ):
58
- break
59
- institution_domain = institution_domain [institution_domain .find ("." ) + 1 :]
60
-
61
- if institution_name == "" :
62
- print (f"Warning: PI name { pi_uname } does not match any institution!" )
63
-
64
- return institution_name
65
-
66
-
67
- def load_institute_map () -> dict :
68
- with open ("process_report/institute_map.json" , "r" ) as f :
69
- institute_map = json .load (f )
70
-
71
- return institute_map
72
-
73
-
74
56
def load_alias (alias_file ):
75
57
alias_dict = dict ()
76
58
@@ -220,15 +202,27 @@ def main():
220
202
221
203
projects = list (set (projects + timed_projects_list ))
222
204
223
- merged_dataframe = validate_pi_aliases (merged_dataframe , alias_dict )
224
- merged_dataframe = add_institution (merged_dataframe )
205
+ ### Preliminary processing
206
+
207
+ validate_pi_alias_proc = validate_pi_alias_processor .ValidatePIAliasProcessor (
208
+ "" , invoice_month , merged_dataframe , alias_dict
209
+ )
210
+ validate_pi_alias_proc .process ()
211
+
212
+ add_institute_proc = add_institution_processor .AddInstitutionProcessor (
213
+ "" , invoice_month , validate_pi_alias_proc .data
214
+ )
215
+ add_institute_proc .process ()
216
+
217
+ ### Finish preliminary processing
218
+
225
219
lenovo_inv = lenovo_invoice .LenovoInvoice (
226
- name = args .Lenovo_file , invoice_month = invoice_month , data = merged_dataframe . copy ()
220
+ name = args .Lenovo_file , invoice_month = invoice_month , data = add_institute_proc . data
227
221
)
228
222
nonbillable_inv = nonbillable_invoice .NonbillableInvoice (
229
223
name = args .nonbillable_file ,
230
224
invoice_month = invoice_month ,
231
- data = merged_dataframe . copy () ,
225
+ data = add_institute_proc . data ,
232
226
nonbillable_pis = pi ,
233
227
nonbillable_projects = projects ,
234
228
)
@@ -239,7 +233,7 @@ def main():
239
233
billable_inv = billable_invoice .BillableInvoice (
240
234
name = args .output_file ,
241
235
invoice_month = invoice_month ,
242
- data = merged_dataframe .copy (),
236
+ data = add_institute_proc . data .copy (),
243
237
nonbillable_pis = pi ,
244
238
nonbillable_projects = projects ,
245
239
old_pi_filepath = old_pi_file ,
@@ -330,13 +324,6 @@ def timed_projects(timed_projects_file, invoice_date):
330
324
return dataframe [mask ]["Project" ].to_list ()
331
325
332
326
333
- def validate_pi_aliases (dataframe : pandas .DataFrame , alias_dict : dict ):
334
- for pi , pi_aliases in alias_dict .items ():
335
- dataframe .loc [dataframe [PI_FIELD ].isin (pi_aliases ), PI_FIELD ] = pi
336
-
337
- return dataframe
338
-
339
-
340
327
def fetch_s3_alias_file ():
341
328
local_name = "alias.csv"
342
329
invoice_bucket = get_invoice_bucket ()
@@ -356,32 +343,6 @@ def backup_to_s3_old_pi_file(old_pi_file):
356
343
invoice_bucket .upload_file (old_pi_file , f"PIs/Archive/PI { get_iso8601_time ()} .csv" )
357
344
358
345
359
- def add_institution (dataframe : pandas .DataFrame ):
360
- """Determine every PI's institution name, logging any PI whose institution cannot be determined
361
- This is performed by `get_institution_from_pi()`, which tries to match the PI's username to
362
- a list of known institution email domains (e.g. bu.edu), or to several edge cases (e.g. rudolph) if
363
- the username is not an email address.
364
-
365
- Exact matches are then mapped to the corresponding institution name.
366
-
367
- E.g. "user@bu.edu" would match with "bu.edu", which maps to the institution name "Boston University"
368
-
369
- The list of mappings are defined in `institute_map.json`.
370
- """
371
- institute_map = load_institute_map ()
372
- dataframe = dataframe .astype ({INSTITUTION_FIELD : "str" })
373
- for i , row in dataframe .iterrows ():
374
- pi_name = row [PI_FIELD ]
375
- if pandas .isna (pi_name ):
376
- print (f"Project { row [PROJECT_FIELD ]} has no PI" )
377
- else :
378
- dataframe .at [i , INSTITUTION_FIELD ] = get_institution_from_pi (
379
- institute_map , pi_name
380
- )
381
-
382
- return dataframe
383
-
384
-
385
346
def export_billables (dataframe , output_file ):
386
347
dataframe .to_csv (output_file , index = False )
387
348
0 commit comments