Skip to content

Commit

Permalink
Restructured institute_map.json into a YAML list
Browse files Browse the repository at this point in the history
As a prerequisite to adding metadata about our list of institutions,
such as whether MGHPCC has a partnership with them, and to make the
file more human-friendly, `institute_map.json` has been converted to a YAML
file, `institute_list.yaml`, formatted as a list. Each element of the YAML list must be a dict with
2 attributes:
- `display_name`: The name of the institute, as will appear on invoices
- `domains`: A list containing domains for that institute

Additional metadata can be freely added to each institute dict as our
billing needs change.

This commit also includes a small cleanup: some functions in `process_report.py`
that had already been moved to `util.py` were not removed during refactoring, so they are removed here.
  • Loading branch information
QuanMPhm committed Sep 19, 2024
1 parent f43f2a8 commit 9a9a24b
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 73 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ RUN pip install -r requirements.txt

COPY tools/ tools/
COPY process_report/process_report.py process_report/
COPY process_report/institute_map.json process_report/
COPY process_report/institute_list.yaml process_report/

CMD ["tools/clone_nonbillables_and_process_invoice.sh"]
74 changes: 74 additions & 0 deletions process_report/institute_list.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
- display_name: Northeastern University
domains:
- northeastern.edu
- display_name: Boston University
domains:
- bu.edu
- robbaron
- display_name: Bentley
domains:
- bentley.edu
- display_name: University of Rhode Island
domains:
- uri.edu
- display_name: Red Hat
domains:
- redhat.com
- display_name: Boston Childrens Hospital
domains:
- childrens.harvard.edu
- rudolph
- display_name: McLean Hospital
domains:
- mclean.harvard.edu
- display_name: Massachusetts Eye & Ear
domains:
- meei.harvard.edu
- display_name: Dana-Farber Cancer Institute
domains:
- dfci.harvard.edu
- display_name: Brigham and Women's Hospital
domains:
- bwh.harvard.edu
- display_name: Beth Israel Deaconess Medical Center
domains:
- bidmc.harvard.edu
- display_name: Harvard University
domains:
- harvard.edu
- mmsh
- kmdalton
- francesco.pontiggia
- chemistry.harvard.edu
- display_name: Worcester Polytechnic Institute
domains:
- wpi.edu
- display_name: Massachusetts Institute of Technology
domains:
- mit.edu
- display_name: University of Massachusetts Amherst
domains:
- umass.edu
- gstuart
- mzink
- display_name: University of Massachusetts Lowell
domains:
- uml.edu
- display_name: Code For Boston
domains:
- codeforboston.org
- display_name: Yale University
domains:
- yale.edu
- display_name: Dartmouth College
domains:
- dartmouth.edu
- display_name: Photrek
domains:
- [email protected]
- display_name: Positron Networks
domains:
- [email protected]
- display_name: Next Generation Justice
domains:
- [email protected]
32 changes: 0 additions & 32 deletions process_report/institute_map.json

This file was deleted.

40 changes: 10 additions & 30 deletions process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
import sys
import datetime

import json
import pandas
import pyarrow

from process_report.util import get_invoice_bucket, process_and_export_invoices
from process_report import util
from process_report.invoices import (
lenovo_invoice,
nonbillable_invoice,
Expand Down Expand Up @@ -51,26 +50,6 @@
ALIAS_S3_FILEPATH = "PIs/alias.csv"


def get_institution_from_pi(institute_map, pi_uname):
    """Look up a PI's institution from their username.

    The text after the last "@" in `pi_uname` is tried as a key of
    `institute_map`. On a miss, the leftmost dot-separated label is dropped
    and the shorter suffix is tried, until a truthy value is found or no
    labels remain. A warning is printed when the result is the empty string.

    Returns the matched institution name, or the result of the final
    (unsuccessful) lookup when nothing matched.
    """
    labels = pi_uname.split("@")[-1].split(".")
    institution_name = ""
    for start in range(len(labels)):
        # Candidates go from the full domain down to its last label.
        institution_name = institute_map.get(".".join(labels[start:]), "")
        if institution_name:
            break

    if institution_name == "":
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institution_name


def load_institute_map() -> dict:
    """Read `process_report/institute_map.json` and return the parsed JSON."""
    with open("process_report/institute_map.json", "r") as map_file:
        return json.load(map_file)


def load_alias(alias_file):
alias_dict = dict()

Expand Down Expand Up @@ -245,7 +224,7 @@ def main():
old_pi_filepath=old_pi_file,
)

process_and_export_invoices(
util.process_and_export_invoices(
[lenovo_inv, nonbillable_inv, billable_inv], args.upload_to_s3
)

Expand All @@ -266,15 +245,15 @@ def main():
name=args.output_folder, invoice_month=invoice_month, data=billable_inv.data
)

process_and_export_invoices(
util.process_and_export_invoices(
[nerc_total_inv, bu_internal_inv, pi_inv], args.upload_to_s3
)


def fetch_s3_invoices(invoice_month):
"""Fetches usage invoices from S3 given invoice month"""
s3_invoice_list = list()
invoice_bucket = get_invoice_bucket()
invoice_bucket = util.get_invoice_bucket()
for obj in invoice_bucket.objects.filter(
Prefix=f"Invoices/{invoice_month}/Service Invoices/"
):
Expand Down Expand Up @@ -339,20 +318,20 @@ def validate_pi_aliases(dataframe: pandas.DataFrame, alias_dict: dict):

def fetch_s3_alias_file():
    """Download the alias file at `ALIAS_S3_FILEPATH` from the invoice bucket.

    Returns:
        The local filename ("alias.csv") the object was downloaded to.
    """
    local_name = "alias.csv"
    # Resolve the bucket exactly once, through `util`.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.download_file(ALIAS_S3_FILEPATH, local_name)
    return local_name


def fetch_s3_old_pi_file():
    """Download the PI file at `PI_S3_FILEPATH` from the invoice bucket.

    Returns:
        The local filename ("PI.csv") the object was downloaded to.
    """
    local_name = "PI.csv"
    # Resolve the bucket exactly once, through `util`.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.download_file(PI_S3_FILEPATH, local_name)
    return local_name


def backup_to_s3_old_pi_file(old_pi_file):
    """Upload `old_pi_file` to the invoice bucket's `PIs/Archive/` prefix.

    The object key embeds the value of `get_iso8601_time()` at call time:
    `PIs/Archive/PI <time>.csv`.

    Args:
        old_pi_file: Path of the local file to upload.
    """
    # Resolve the bucket exactly once, through `util`.
    invoice_bucket = util.get_invoice_bucket()
    invoice_bucket.upload_file(old_pi_file, f"PIs/Archive/PI {get_iso8601_time()}.csv")


Expand All @@ -368,14 +347,15 @@ def add_institution(dataframe: pandas.DataFrame):
The list of mappings is defined in `institute_list.yaml`.
"""
institute_map = load_institute_map()
institute_list = util.load_institute_list()
institute_map = util.get_institute_mapping(institute_list)
dataframe = dataframe.astype({INSTITUTION_FIELD: "str"})
for i, row in dataframe.iterrows():
pi_name = row[PI_FIELD]
if pandas.isna(pi_name):
print(f"Project {row[PROJECT_FIELD]} has no PI")
else:
dataframe.at[i, INSTITUTION_FIELD] = get_institution_from_pi(
dataframe.at[i, INSTITUTION_FIELD] = util.get_institution_from_pi(
institute_map, pi_name
)

Expand Down
4 changes: 2 additions & 2 deletions process_report/tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def test_get_pi_institution(self):

for pi_email, answer in answers.items():
self.assertEqual(
process_report.get_institution_from_pi(institute_map, pi_email), answer
util.get_institution_from_pi(institute_map, pi_email), answer
)


Expand Down Expand Up @@ -789,7 +789,7 @@ def test_process_lenovo(self):


class TestUploadToS3(TestCase):
@mock.patch("process_report.process_report.get_invoice_bucket")
@mock.patch("process_report.util.get_invoice_bucket")
@mock.patch("process_report.util.get_iso8601_time")
def test_upload_to_s3(self, mock_get_time, mock_get_bucket):
mock_bucket = mock.MagicMock()
Expand Down
28 changes: 20 additions & 8 deletions process_report/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import datetime
import json
import yaml
import logging
import functools

Expand All @@ -27,21 +27,33 @@ def get_invoice_bucket():
return s3_resource.Bucket(os.environ.get("S3_BUCKET_NAME", "nerc-invoicing"))


def get_institute_mapping(institute_list: list) -> dict:
    """Flatten the institute list into a domain -> display name lookup.

    Args:
        institute_list: Institute dicts, each carrying a `display_name` and
            a `domains` entry.

    Returns:
        A dict mapping every listed domain to the `display_name` of the
        institute it is listed under. If the same domain appears under more
        than one institute, the later entry wins.

    Raises:
        KeyError: If an institute has no `domains` key, or lists a domain
            but has no `display_name` key.
    """
    institute_map = dict()
    for institute_info in institute_list:
        # A `domains:` key with nothing under it is loaded from YAML as None;
        # treat that as "no domains" instead of failing on iteration.
        for domain in institute_info["domains"] or []:
            institute_map[domain] = institute_info["display_name"]

    return institute_map


def get_institution_from_pi(institute_map, pi_uname):
    """Return the institution name for a PI username, or "" if none matches.

    The part of `pi_uname` after the last "@" is looked up in
    `institute_map`. If that misses, the leftmost dot-separated label is
    stripped and the remaining suffix is tried, so a subdomain such as
    `chem.harvard.edu` falls back to `harvard.edu`. Usernames without an "@"
    are looked up as-is.

    Args:
        institute_map: Mapping of domain (or bare username) to institution
            display name.
        pi_uname: The PI's username, typically an email address.

    Returns:
        The first truthy value found while walking the suffixes, otherwise
        the result of the last lookup ("" when the key is absent). A single
        warning is printed when the result is "".
    """
    institution_domain = pi_uname.split("@")[-1]
    while True:
        institution_name = institute_map.get(institution_domain, "")
        # Stop on a hit, or once there is no further label to strip.
        if institution_name or "." not in institution_domain:
            break
        institution_domain = institution_domain.partition(".")[2]

    if institution_name == "":
        print(f"Warning: PI name {pi_uname} does not match any institution!")

    return institution_name


def load_institute_list():
    """Load the institute definitions from `process_report/institute_list.yaml`.

    Returns:
        Whatever `yaml.safe_load` parses from the file. The callers in this
        project iterate it as a list of institute dicts.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    with open("process_report/institute_list.yaml", "r", encoding="utf-8") as f:
        institute_list = yaml.safe_load(f)

    return institute_list


def get_iso8601_time():
Expand Down

0 comments on commit 9a9a24b

Please sign in to comment.