Skip to content

Commit 2746b67

Browse files
committed
Restructured institute_map.json into a YAML list
As a prerequisite to adding metadata about our list of institutions (such as whether MGHPCC has a partnership with them), and to make the file more human-friendly, `institute_map.json` has been converted to a YAML file, `institute_list.yaml`, formatted as a list. Each element of the YAML list must be a dict with two attributes: `display_name`, the name of the institute as it will appear on invoices; and `domains`, a list containing the domains for that institute. Additional metadata can be freely added to each institute dict as our billing needs change. There is also some small cleanup: some functions in `process_report.py` that had already been moved to `util.py` were not removed during an earlier refactoring, and are removed here.
1 parent 19ca40b commit 2746b67

File tree

6 files changed

+110
-74
lines changed

6 files changed

+110
-74
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ RUN pip install -r requirements.txt
99

1010
COPY tools/ tools/
1111
COPY process_report/process_report.py process_report/
12-
COPY process_report/institute_map.json process_report/
12+
COPY process_report/institute_list.yaml process_report/
1313

1414
CMD ["tools/clone_nonbillables_and_process_invoice.sh"]

process_report/institute_list.yaml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
- display_name: Northeastern University
2+
domains:
3+
- northeastern.edu
4+
- display_name: Boston University
5+
domains:
6+
- bu.edu
7+
- robbaron
8+
- display_name: Bentley
9+
domains:
10+
- bentley.edu
11+
- display_name: University of Rhode Island
12+
domains:
13+
- uri.edu
14+
- display_name: Red Hat
15+
domains:
16+
- redhat.com
17+
- display_name: Boston Childrens Hospital
18+
domains:
19+
- childrens.harvard.edu
20+
- rudolph
21+
- display_name: McLean Hospital
22+
domains:
23+
- mclean.harvard.edu
24+
- display_name: Massachusetts Eye & Ear
25+
domains:
26+
- meei.harvard.edu
27+
- display_name: Dana-Farber Cancer Institute
28+
domains:
29+
- dfci.harvard.edu
30+
- display_name: Brigham and Women's Hospital
31+
domains:
32+
- bwh.harvard.edu
33+
- display_name: Beth Israel Deaconess Medical Center
34+
domains:
35+
- bidmc.harvard.edu
36+
- display_name: Harvard University
37+
domains:
38+
- harvard.edu
39+
- mmsh
40+
- kmdalton
41+
- francesco.pontiggia
42+
- chemistry.harvard.edu
43+
- display_name: Worcester Polytechnic Institute
44+
domains:
45+
- wpi.edu
46+
- display_name: Massachusetts Institute of Technology
47+
domains:
48+
- mit.edu
49+
- display_name: University of Massachusetts Amherst
50+
domains:
51+
- umass.edu
52+
- gstuart
53+
- mzink
54+
- display_name: University of Massachusetts Lowell
55+
domains:
56+
- uml.edu
57+
- display_name: Code For Boston
58+
domains:
59+
- codeforboston.org
60+
- display_name: Yale University
61+
domains:
62+
- yale.edu
63+
- display_name: Dartmouth College
64+
domains:
65+
- dartmouth.edu
66+
- display_name: Photrek
67+
domains:
68+
69+
- display_name: Positron Networks
70+
domains:
71+
72+
- display_name: Next Generation Justice
73+
domains:
74+

process_report/institute_map.json

Lines changed: 0 additions & 32 deletions
This file was deleted.

process_report/process_report.py

Lines changed: 13 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
import sys
44
import datetime
55

6-
import json
76
import pandas
87
import pyarrow
98

10-
from process_report.util import get_invoice_bucket, process_and_export_invoices
9+
from process_report import util
1110
from process_report.invoices import (
1211
lenovo_invoice,
1312
nonbillable_invoice,
@@ -51,26 +50,6 @@
5150
ALIAS_S3_FILEPATH = "PIs/alias.csv"
5251

5352

54-
def get_institution_from_pi(institute_map, pi_uname):
55-
institution_domain = pi_uname.split("@")[-1]
56-
for i in range(institution_domain.count(".") + 1):
57-
if institution_name := institute_map.get(institution_domain, ""):
58-
break
59-
institution_domain = institution_domain[institution_domain.find(".") + 1 :]
60-
61-
if institution_name == "":
62-
print(f"Warning: PI name {pi_uname} does not match any institution!")
63-
64-
return institution_name
65-
66-
67-
def load_institute_map() -> dict:
68-
with open("process_report/institute_map.json", "r") as f:
69-
institute_map = json.load(f)
70-
71-
return institute_map
72-
73-
7453
def load_alias(alias_file):
7554
alias_dict = dict()
7655

@@ -245,7 +224,7 @@ def main():
245224
old_pi_filepath=old_pi_file,
246225
)
247226

248-
process_and_export_invoices(
227+
util.process_and_export_invoices(
249228
[lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3
250229
)
251230

@@ -262,7 +241,9 @@ def main():
262241
subsidy_amount=args.BU_subsidy_amount,
263242
)
264243

265-
process_and_export_invoices([nerc_total_inv, bu_internal_inv], args.upload_to_s3)
244+
util.process_and_export_invoices(
245+
[nerc_total_inv, bu_internal_inv], args.upload_to_s3
246+
)
266247

267248
export_pi_billables(billable_inv.data.copy(), args.output_folder, invoice_month)
268249

@@ -278,7 +259,7 @@ def main():
278259
def fetch_s3_invoices(invoice_month):
279260
"""Fetches usage invoices from S3 given invoice month"""
280261
s3_invoice_list = list()
281-
invoice_bucket = get_invoice_bucket()
262+
invoice_bucket = util.get_invoice_bucket()
282263
for obj in invoice_bucket.objects.filter(
283264
Prefix=f"Invoices/{invoice_month}/Service Invoices/"
284265
):
@@ -343,20 +324,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):
343324

344325
def fetch_s3_alias_file():
345326
local_name = "alias.csv"
346-
invoice_bucket = get_invoice_bucket()
327+
invoice_bucket = util.get_invoice_bucket()
347328
invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name)
348329
return local_name
349330

350331

351332
def fetch_s3_old_pi_file():
352333
local_name = "PI.csv"
353-
invoice_bucket = get_invoice_bucket()
334+
invoice_bucket = util.get_invoice_bucket()
354335
invoice_bucket.download_file(PI_S3_FILEPATH, local_name)
355336
return local_name
356337

357338

358339
def backup_to_s3_old_pi_file(old_pi_file):
359-
invoice_bucket = get_invoice_bucket()
340+
invoice_bucket = util.get_invoice_bucket()
360341
invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")
361342

362343

@@ -372,14 +353,15 @@ def add_institution(dataframe: pandas.DataFrame):
372353
373354
The list of mappings is defined in `institute_list.yaml`.
374355
"""
375-
institute_map = load_institute_map()
356+
institute_list = util.load_institute_list()
357+
institute_map = util.get_institute_mapping(institute_list)
376358
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
377359
for i, row in dataframe.iterrows():
378360
pi_name = row[PI_FIELD]
379361
if pandas.isna(pi_name):
380362
print(f"Project {row[PROJECT_FIELD]} has no PI")
381363
else:
382-
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
364+
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
383365
institute_map, pi_name
384366
)
385367

@@ -407,7 +389,7 @@ def export_pi_billables(dataframe: pandas.DataFrame, output_folder, invoice_mont
407389

408390

409391
def upload_to_s3(invoice_list: list, invoice_month):
410-
invoice_bucket = get_invoice_bucket()
392+
invoice_bucket = util.get_invoice_bucket()
411393
for invoice_filename in invoice_list:
412394
striped_filename = os.path.splitext(invoice_filename)[0]
413395
invoice_s3_path = (

process_report/tests/unit_tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def test_get_pi_institution(self):
249249

250250
for pi_email, answer in answers.items():
251251
self.assertEqual(
252-
process_report.get_institution_from_pi(institute_map, pi_email), answer
252+
util.get_institution_from_pi(institute_map, pi_email), answer
253253
)
254254

255255

@@ -788,7 +788,7 @@ def test_process_lenovo(self):
788788

789789

790790
class TestUploadToS3(TestCase):
791-
@mock.patch("process_report.process_report.get_invoice_bucket")
791+
@mock.patch("process_report.util.get_invoice_bucket")
792792
@mock.patch("process_report.process_report.get_iso8601_time")
793793
def test_remove_prefix(self, mock_get_time, mock_get_bucket):
794794
mock_bucket = mock.MagicMock()

process_report/util.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
import datetime
3-
import json
3+
import yaml
44
import logging
55
import functools
66

@@ -27,21 +27,33 @@ def get_invoice_bucket():
2727
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))
2828

2929

30+
def get_institute_mapping(institute_list: list) -> dict:
    """Flatten the institute list into a domain -> display name lookup.

    Args:
        institute_list: List of dicts, each with a ``display_name`` string
            and a ``domains`` entry (a list of domain strings, or ``None``
            when the institute has no domains listed).

    Returns:
        A dict mapping every listed domain to the ``display_name`` of the
        institute it belongs to. If the same domain appears under several
        institutes, the last one in the list wins.

    Raises:
        KeyError: If an institute dict lacks ``display_name`` (and has at
            least one domain) or lacks the ``domains`` key entirely.
    """
    institute_map = dict()
    for institute_info in institute_list:
        # A bare `domains:` key in YAML loads as None rather than an empty
        # list; treat it as "no domains" instead of failing on iteration.
        # Indexing (not .get) keeps a missing key a hard error.
        domains = institute_info["domains"] or []
        for domain in domains:
            institute_map[domain] = institute_info["display_name"]

    return institute_map
37+
38+
3039
def get_institution_from_pi(institute_map: dict, pi_uname: str):
    """Look up the institution a PI belongs to from their username.

    The text after the last ``@`` in ``pi_uname`` (or the whole string when
    there is no ``@``) is looked up in ``institute_map``. If it is not found,
    the leading dot-separated label is dropped and the lookup is retried, so
    ``user@mail.bu.edu`` falls back to ``bu.edu``. More specific entries are
    therefore tried before less specific ones.

    Args:
        institute_map: Dict mapping a domain (or bare username) to the
            institution's display name.
        pi_uname: The PI's username, typically an email address.

    Returns:
        The matching institution name, or ``""`` when nothing matches. A
        warning is printed in the no-match case.
    """
    institution_domain = pi_uname.split("@")[-1]
    while True:
        institution_name = institute_map.get(institution_domain, "")
        # Stop on the first hit, or once there is no label left to strip.
        if institution_name or "." not in institution_domain:
            break
        institution_domain = institution_domain.partition(".")[2]

    # Same truthiness test as the lookup loop, so any "no match" outcome
    # is reported.
    if not institution_name:
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institution_name
3850

3951

40-
def load_institute_map() -> dict:
41-
with open("process_report/institute_map.json", "r") as f:
42-
institute_map = json.load(f)
52+
def load_institute_list():
    """Load the institute definitions from ``process_report/institute_list.yaml``.

    The path is relative to the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` parses from the file. For the expected
        file layout this is a list of dicts, one per institute.
    """
    # Pin the encoding so that decoding of the file does not depend on
    # the locale of the environment the job runs in.
    with open("process_report/institute_list.yaml", "r", encoding="utf-8") as f:
        institute_list = yaml.safe_load(f)

    return institute_list
4557

4658

4759
def get_iso8601_time():

0 commit comments

Comments
 (0)