Skip to content

Commit 9a9a24b

Browse files
committed
Restructured institute_map.json into a YAML list
As a prerequisite to adding metadata about our list of institutions, such as whether MGHPCC has a partnership with them, and to make the file more human-friendly, `institute_map.json` has been converted to YAML (`institute_list.yaml`) and formatted as a list. Each element of the YAML list must be a dict with 2 attributes: - `display_name`: The name of the institute, as it will appear on invoices - `domains`: A list containing domains for that institute. Additional metadata can be freely added to each institute dict as our billing needs change. There is also some small cleanup: some functions in `process_report.py` that had already been moved to `util.py` were not removed during refactoring, and are removed here.
1 parent f43f2a8 commit 9a9a24b

File tree

6 files changed

+107
-73
lines changed

6 files changed

+107
-73
lines changed

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ RUN pip install -r requirements.txt
99

1010
COPY tools/ tools/
1111
COPY process_report/process_report.py process_report/
12-
COPY process_report/institute_map.json process_report/
12+
COPY process_report/institute_list.yaml process_report/
1313

1414
CMD ["tools/clone_nonbillables_and_process_invoice.sh"]

process_report/institute_list.yaml

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
- display_name: Northeastern University
2+
domains:
3+
- northeastern.edu
4+
- display_name: Boston University
5+
domains:
6+
- bu.edu
7+
- robbaron
8+
- display_name: Bentley
9+
domains:
10+
- bentley.edu
11+
- display_name: University of Rhode Island
12+
domains:
13+
- uri.edu
14+
- display_name: Red Hat
15+
domains:
16+
- redhat.com
17+
- display_name: Boston Childrens Hospital
18+
domains:
19+
- childrens.harvard.edu
20+
- rudolph
21+
- display_name: McLean Hospital
22+
domains:
23+
- mclean.harvard.edu
24+
- display_name: Massachusetts Eye & Ear
25+
domains:
26+
- meei.harvard.edu
27+
- display_name: Dana-Farber Cancer Institute
28+
domains:
29+
- dfci.harvard.edu
30+
- display_name: Brigham and Women's Hospital
31+
domains:
32+
- bwh.harvard.edu
33+
- display_name: Beth Israel Deaconess Medical Center
34+
domains:
35+
- bidmc.harvard.edu
36+
- display_name: Harvard University
37+
domains:
38+
- harvard.edu
39+
- mmsh
40+
- kmdalton
41+
- francesco.pontiggia
42+
- chemistry.harvard.edu
43+
- display_name: Worcester Polytechnic Institute
44+
domains:
45+
- wpi.edu
46+
- display_name: Massachusetts Institute of Technology
47+
domains:
48+
- mit.edu
49+
- display_name: University of Massachusetts Amherst
50+
domains:
51+
- umass.edu
52+
- gstuart
53+
- mzink
54+
- display_name: University of Massachusetts Lowell
55+
domains:
56+
- uml.edu
57+
- display_name: Code For Boston
58+
domains:
59+
- codeforboston.org
60+
- display_name: Yale University
61+
domains:
62+
- yale.edu
63+
- display_name: Dartmouth College
64+
domains:
65+
- dartmouth.edu
66+
- display_name: Photrek
67+
domains:
68+
69+
- display_name: Positron Networks
70+
domains:
71+
72+
- display_name: Next Generation Justice
73+
domains:
74+

process_report/institute_map.json

-32
This file was deleted.

process_report/process_report.py

+10-30
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,10 @@
22
import sys
33
import datetime
44

5-
import json
65
import pandas
76
import pyarrow
87

9-
from process_report.util import get_invoice_bucket, process_and_export_invoices
8+
from process_report import util
109
from process_report.invoices import (
1110
lenovo_invoice,
1211
nonbillable_invoice,
@@ -51,26 +50,6 @@
5150
ALIAS_S3_FILEPATH = "PIs/alias.csv"
5251

5352

54-
def get_institution_from_pi(institute_map, pi_uname):
55-
institution_domain = pi_uname.split("@")[-1]
56-
for i in range(institution_domain.count(".") + 1):
57-
if institution_name := institute_map.get(institution_domain, ""):
58-
break
59-
institution_domain = institution_domain[institution_domain.find(".") + 1 :]
60-
61-
if institution_name == "":
62-
print(f"Warning: PI name {pi_uname} does not match any institution!")
63-
64-
return institution_name
65-
66-
67-
def load_institute_map() -> dict:
68-
with open("process_report/institute_map.json", "r") as f:
69-
institute_map = json.load(f)
70-
71-
return institute_map
72-
73-
7453
def load_alias(alias_file):
7554
alias_dict = dict()
7655

@@ -245,7 +224,7 @@ def main():
245224
old_pi_filepath=old_pi_file,
246225
)
247226

248-
process_and_export_invoices(
227+
util.process_and_export_invoices(
249228
[lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3
250229
)
251230

@@ -266,15 +245,15 @@ def main():
266245
name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data
267246
)
268247

269-
process_and_export_invoices(
248+
util.process_and_export_invoices(
270249
[nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3
271250
)
272251

273252

274253
def fetch_s3_invoices(invoice_month):
275254
"""Fetches usage invoices from S3 given invoice month"""
276255
s3_invoice_list = list()
277-
invoice_bucket = get_invoice_bucket()
256+
invoice_bucket = util.get_invoice_bucket()
278257
for obj in invoice_bucket.objects.filter(
279258
Prefix=f"Invoices/{invoice_month}/Service Invoices/"
280259
):
@@ -339,20 +318,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
339318

340319
def fetch_s3_alias_file():
341320
local_name = "alias.csv"
342-
invoice_bucket = get_invoice_bucket()
321+
invoice_bucket = util.get_invoice_bucket()
343322
invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name)
344323
return local_name
345324

346325

347326
def fetch_s3_old_pi_file():
348327
local_name = "PI.csv"
349-
invoice_bucket = get_invoice_bucket()
328+
invoice_bucket = util.get_invoice_bucket()
350329
invoice_bucket.download_file(PI_S3_FILEPATH, local_name)
351330
return local_name
352331

353332

354333
def backup_to_s3_old_pi_file(old_pi_file):
355-
invoice_bucket = get_invoice_bucket()
334+
invoice_bucket = util.get_invoice_bucket()
356335
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")
357336

358337

@@ -368,14 +347,15 @@ def add_institution(dataframe: pandas.DataFrame):
368347
369348
The list of mappings is defined in `institute_list.yaml`.
370349
"""
371-
institute_map = load_institute_map()
350+
institute_list = util.load_institute_list()
351+
institute_map = util.get_institute_mapping(institute_list)
372352
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
373353
for i, row in dataframe.iterrows():
374354
pi_name = row[PI_FIELD]
375355
if pandas.isna(pi_name):
376356
print(f"Project {row[PROJECT_FIELD]} has no PI")
377357
else:
378-
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
358+
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
379359
institute_map, pi_name
380360
)
381361

process_report/tests/unit_tests.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def test_get_pi_institution(self):
250250

251251
for pi_email, answer in answers.items():
252252
self.assertEqual(
253-
process_report.get_institution_from_pi(institute_map, pi_email), answer
253+
util.get_institution_from_pi(institute_map, pi_email), answer
254254
)
255255

256256

@@ -789,7 +789,7 @@ def test_process_lenovo(self):
789789

790790

791791
class TestUploadToS3(TestCase):
792-
@mock.patch("process_report.process_report.get_invoice_bucket")
792+
@mock.patch("process_report.util.get_invoice_bucket")
793793
@mock.patch("process_report.util.get_iso8601_time")
794794
def test_upload_to_s3(self, mock_get_time, mock_get_bucket):
795795
mock_bucket = mock.MagicMock()

process_report/util.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
import datetime
3-
import json
3+
import yaml
44
import logging
55
import functools
66

@@ -27,21 +27,33 @@ def get_invoice_bucket():
2727
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))
2828

2929

30+
def get_institute_mapping(institute_list: list):
    """Flatten the institute list into a ``{domain: display_name}`` lookup.

    Args:
        institute_list: A list of dicts, each with a ``display_name`` string
            and a ``domains`` entry listing the domains for that institute.

    Returns:
        A dict mapping every listed domain to its institute's display name.
        If the same domain appears under several institutes, the last one
        in ``institute_list`` wins.

    Raises:
        KeyError: If an entry lacks ``display_name`` (when it has domains)
            or lacks the ``domains`` key entirely.
    """
    institute_map = dict()
    for institute_info in institute_list:
        # A bare `domains:` key in YAML loads as None rather than an empty
        # list; treat it as "no domains" instead of failing on iteration.
        for domain in institute_info["domains"] or ():
            institute_map[domain] = institute_info["display_name"]

    return institute_map
37+
38+
3039
def get_institution_from_pi(institute_map, pi_uname):
    """Return the institution display name for a PI username or email.

    The text after the last ``@`` in ``pi_uname`` (or the whole string when
    there is no ``@``) is looked up in ``institute_map``. If it is not found,
    the leftmost dot-separated label is dropped and the lookup is retried,
    so ``user@cs.bu.edu`` falls back to ``bu.edu``.

    Args:
        institute_map: Dict mapping a domain (or bare username) to the
            institution display name.
        pi_uname: The PI's username or email address.

    Returns:
        The matching display name, or ``""`` when nothing matches (a warning
        is printed in that case).
    """
    institution_domain = pi_uname.split("@")[-1]
    while True:
        institution_name = institute_map.get(institution_domain, "")
        # Stop on the first hit, or once there is no parent domain left.
        if institution_name or "." not in institution_domain:
            break
        institution_domain = institution_domain.split(".", 1)[1]

    if institution_name == "":
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institution_name
3850

3951

40-
def load_institute_map() -> dict:
41-
with open("process_report/institute_map.json", "r") as f:
42-
institute_map = json.load(f)
52+
def load_institute_list():
    """Load the institute definitions from ``process_report/institute_list.yaml``.

    The path is relative to the current working directory.

    Returns:
        The parsed YAML document. An empty file, which ``yaml.safe_load``
        parses as ``None``, yields an empty list so callers can always
        iterate over the result.
    """
    with open("process_report/institute_list.yaml", "r", encoding="utf-8") as f:
        institute_list = yaml.safe_load(f)

    return institute_list or []
4557

4658

4759
def get_iso8601_time():

0 commit comments

Comments
 (0)