[Issue 2482] Migrate delivery metrics transform and load from simpler-grants-sandbox #2617

Merged: 68 commits from issue-2482-migrate-delivery-metrics into main, Nov 5, 2024. Changes shown from 49 commits.

Commits:
76aea7a  copy files from simpler-grants-sandbox (DavidDudas-Intuitial, Oct 17, 2024)
ebdb816  updated readme (DavidDudas-Intuitial, Oct 18, 2024)
320df01  update readme (DavidDudas-Intuitial, Oct 18, 2024)
5a91616  cherry pick changes from https://github.com/agilesix/simpler-grants-s… (DavidDudas-Intuitial, Oct 18, 2024)
0f983f7  stub out etl command (DavidDudas-Intuitial, Oct 28, 2024)
b837ddf  created dataset for delivery metrics (DavidDudas-Intuitial, Oct 28, 2024)
10166a9  remove unnecessary files (DavidDudas-Intuitial, Oct 28, 2024)
d5a9d12  remove unnecessary files (DavidDudas-Intuitial, Oct 28, 2024)
dbadc9f  protect against null ghid (DavidDudas-Intuitial, Oct 28, 2024)
eb3a3a9  get effective date from command line (DavidDudas-Intuitial, Oct 29, 2024)
b13c7dc  updated comments (DavidDudas-Intuitial, Oct 29, 2024)
38aa7ca  created abstraction in integrations dir to encapsulate db logic (DavidDudas-Intuitial, Oct 29, 2024)
505f1cc  corrected comment (DavidDudas-Intuitial, Oct 29, 2024)
7d8300a  added create table sql file (DavidDudas-Intuitial, Oct 29, 2024)
949882f  created cli entry point to init db (DavidDudas-Intuitial, Oct 29, 2024)
710e629  stubbed out command to init db (DavidDudas-Intuitial, Oct 29, 2024)
92fd950  add gh_ prefix to table names; remove DROP statements from sql (DavidDudas-Intuitial, Oct 29, 2024)
0230ff8  add 'if not exists' clause to create statements' (DavidDudas-Intuitial, Oct 29, 2024)
44433c3  renamed classes and paths (DavidDudas-Intuitial, Oct 30, 2024)
e8c20ae  update sql to use postgres syntax (DavidDudas-Intuitial, Oct 30, 2024)
5a9ec75  added class to encapsulate db connection (DavidDudas-Intuitial, Oct 30, 2024)
82c51f5  finish implementaiton of init_db (DavidDudas-Intuitial, Oct 30, 2024)
cb2fc7e  update output message (DavidDudas-Intuitial, Oct 30, 2024)
a733e1a  port model classes from sandbox to represent etldb entities (DavidDudas-Intuitial, Oct 30, 2024)
f2bc5fd  port quad insert/update from sandbox (DavidDudas-Intuitial, Oct 30, 2024)
9d9ca31  finished implementing insert/select/update for each eltdb entity (DavidDudas-Intuitial, Oct 31, 2024)
12aaa95  fixed lint errors (DavidDudas-Intuitial, Oct 31, 2024)
6de57d1  fixed more linter errors (DavidDudas-Intuitial, Oct 31, 2024)
1257ff6  added docstrings (DavidDudas-Intuitial, Oct 31, 2024)
f5fae44  fixed more lint issues (DavidDudas-Intuitial, Oct 31, 2024)
d227c90  Merge branch 'main' into issue-2482-migrate-delivery-metrics (DavidDudas-Intuitial, Oct 31, 2024)
72b4a06  fixed verbose output (DavidDudas-Intuitial, Oct 31, 2024)
268a44b  minor code cleanup (DavidDudas-Intuitial, Oct 31, 2024)
15efd80  Update usage.md (DavidDudas-Intuitial, Oct 31, 2024)
195593a  remove blank lines (DavidDudas-Intuitial, Oct 31, 2024)
81355e8  Merge branch 'issue-2482-migrate-delivery-metrics' of github.com:HHS/… (DavidDudas-Intuitial, Oct 31, 2024)
8f9de75  change constant name (DavidDudas-Intuitial, Oct 31, 2024)
39bf886  ran black to format code (DavidDudas-Intuitial, Oct 31, 2024)
244c187  Merge branch 'main' into issue-2482-migrate-delivery-metrics (DavidDudas-Intuitial, Oct 31, 2024)
5d0097e  fixed formatting to pass CI checks (DavidDudas-Intuitial, Oct 31, 2024)
122ce66  fixed more formatting issues to pass ci checks (DavidDudas-Intuitial, Oct 31, 2024)
93edd79  fixed another formatting issues to pass ci checks (DavidDudas-Intuitial, Oct 31, 2024)
a6248c3  simplified string transformation (DavidDudas-Intuitial, Oct 31, 2024)
9928d06  fixed type hint issues (DavidDudas-Intuitial, Nov 1, 2024)
8b2bafb  more formatting (DavidDudas-Intuitial, Nov 1, 2024)
75f902a  created select method for each model class; updated format of execute… (DavidDudas-Intuitial, Nov 1, 2024)
62d45cd  formatting (DavidDudas-Intuitial, Nov 1, 2024)
1b5e74a  feat: Adds Makefile targets for init-db and gh-etl (widal001, Nov 1, 2024)
823aeb9  refactor: Removes statement printing db connection string (widal001, Nov 1, 2024)
186941d  added db dependency injection and connection reuse (DavidDudas-Intuitial, Nov 1, 2024)
b068e80  add type hint for dbh params (DavidDudas-Intuitial, Nov 1, 2024)
3726af9  formatting (DavidDudas-Intuitial, Nov 4, 2024)
36c3f22  unit tests for EtlDataset (DavidDudas-Intuitial, Nov 5, 2024)
190d1cd  add cli tests (DavidDudas-Intuitial, Nov 5, 2024)
7ce883b  Merge branch 'main' into issue-2482-migrate-delivery-metrics (DavidDudas-Intuitial, Nov 5, 2024)
4b9f590  formatted tests (DavidDudas-Intuitial, Nov 5, 2024)
a960540  add missing import (DavidDudas-Intuitial, Nov 5, 2024)
9be9504  formatting (DavidDudas-Intuitial, Nov 5, 2024)
8946b80  add t_created field to each table that does not already have it (DavidDudas-Intuitial, Nov 5, 2024)
69fcf63  fixed path issue (DavidDudas-Intuitial, Nov 5, 2024)
d7bd8fa  fixed path issue (DavidDudas-Intuitial, Nov 5, 2024)
3b3e516  formatting (DavidDudas-Intuitial, Nov 5, 2024)
69164c9  attempt to fix path problem (DavidDudas-Intuitial, Nov 5, 2024)
89e67ff  move json file to tests directory so CI can find it (DavidDudas-Intuitial, Nov 5, 2024)
2ce2217  Merge branch 'main' into issue-2482-migrate-delivery-metrics (DavidDudas-Intuitial, Nov 5, 2024)
487aaca  remove unused import (DavidDudas-Intuitial, Nov 5, 2024)
e71cff1  formatting (DavidDudas-Intuitial, Nov 5, 2024)
afced78  restored load_json_data_as_df in utils (DavidDudas-Intuitial, Nov 5, 2024)
19 changes: 18 additions & 1 deletion analytics/Makefile
@@ -13,12 +13,13 @@ ISSUE_FILE ?= $(OUTPUT_DIR)/issue-data.json
DELIVERY_FILE ?= $(OUTPUT_DIR)/delivery-data.json
SPRINT ?= @current
# Names of the points and sprint fields in the GitHub project
-POINTS_FIELD ?= Points
+POINTS_FIELD ?= Story Points
SPRINT_FIELD ?= Sprint
UNIT ?= points
ACTION ?= show-results
MIN_TEST_COVERAGE ?= 80
APP_NAME ?= grants-analytics
EFFECTIVE_DATE ?= $(shell date +"%Y-%m-%d")

# Required for CI to work properly
SHELL = /bin/bash -o pipefail
@@ -144,6 +145,20 @@ lint: ## runs code quality checks
# Data Commands #
#################

init-db:
	@echo "=> Initializing the database schema"
	@echo "====================================================="
	$(POETRY) analytics etl initialize_database
	@echo "====================================================="

Review comment (Collaborator), on the banner echoes: FYI this ends up looking very messy when you see it in the AWS Console.

gh-transform-and-load:
	@echo "=> Transforming and loading GitHub data into the database"
	@echo "====================================================="
	$(POETRY) analytics etl transform_and_load \
		--deliverable-file $(DELIVERY_FILE) \
		--effective-date $(EFFECTIVE_DATE)
	@echo "====================================================="

Comment on lines +147 to +160 (Collaborator): I also added these because I wasn't able to trigger the command from the natively installed Python application, because I don't have the psycopg_c binding installed on my computer (it's needed by psycopg, but doesn't get distributed directly with the Python library).

Reply (Collaborator Author): Nice add. I assumed this would be needed in the near future, but had not spent any time on it yet. Thanks for adding it!
sprint-data-export:
	@echo "=> Exporting project data from the sprint board"
	@echo "====================================================="
@@ -186,6 +201,8 @@ issue-data-export:

gh-data-export: sprint-data-export issue-data-export roadmap-data-export delivery-data-export

gh-etl: delivery-data-export gh-transform-and-load

sprint-burndown:
	@echo "=> Running sprint burndown report"
	@echo "====================================================="
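Taken together, gh-etl chains delivery-data-export with gh-transform-and-load. For orientation, a rough Python equivalent of the transform-and-load leg, assuming $(POETRY) expands to `poetry run` and using an illustrative file path (the export leg's flags are not shown in this diff):

import subprocess
from datetime import date

# EFFECTIVE_DATE defaults to today's date, like `date +"%Y-%m-%d"` in the Makefile
effective_date = date.today().strftime("%Y-%m-%d")

subprocess.run(
    [
        "poetry", "run", "analytics", "etl", "transform_and_load",
        "--deliverable-file", "data/delivery-data.json",  # hypothetical DELIVERY_FILE
        "--effective-date", effective_date,
    ],
    check=True,
)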
51 changes: 50 additions & 1 deletion analytics/src/analytics/cli.py
@@ -1,7 +1,9 @@
# pylint: disable=C0415
"""Expose a series of CLI entrypoints for the analytics package."""

import logging
import logging.config
from datetime import datetime
from pathlib import Path
from typing import Annotated, Optional

@@ -10,8 +12,9 @@
from sqlalchemy import text

from analytics.datasets.deliverable_tasks import DeliverableTasks
from analytics.datasets.etl_dataset import EtlDataset
from analytics.datasets.issues import GitHubIssues
-from analytics.integrations import db, github, slack
+from analytics.integrations import db, etldb, github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
from analytics.metrics.burnup import SprintBurnup
@@ -39,6 +42,8 @@
STATUS_ARG = typer.Option(
    help="Deliverable status to include in report, can be passed multiple times",
)
DELIVERABLE_FILE_ARG = typer.Option(help="Path to file with exported deliverable data")
EFFECTIVE_DATE_ARG = typer.Option(help="YYYY-MM-DD effective date to apply to each imported row")
# fmt: on

# instantiate the main CLI entrypoint
@@ -47,10 +52,12 @@
export_app = typer.Typer()
metrics_app = typer.Typer()
import_app = typer.Typer()
etl_app = typer.Typer()
# add sub-commands to main entrypoint
app.add_typer(export_app, name="export", help="Export data needed to calculate metrics")
app.add_typer(metrics_app, name="calculate", help="Calculate key project metrics")
app.add_typer(import_app, name="import", help="Import data into the database")
app.add_typer(etl_app, name="etl", help="Transform and load local file")


@app.callback()
@@ -292,3 +299,45 @@ def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> None:
    )
    rows = len(issues.to_dict())
    logger.info("Number of rows in table: %s", rows)


# ===========================================================
# Etl commands
# ===========================================================


@etl_app.command(name="initialize_database")
def initialize_database() -> None:
    """Initialize etl database."""
    print("initializing database")
    etldb.init_db()
    print("done")


@etl_app.command(name="transform_and_load")
def transform_and_load(
    deliverable_file: Annotated[str, DELIVERABLE_FILE_ARG],
    effective_date: Annotated[str, EFFECTIVE_DATE_ARG],
) -> None:
    """Transform and load etl data."""
    # validate effective date arg
    try:
        dateformat = "%Y-%m-%d"
        datestamp = (
            datetime.strptime(effective_date, dateformat)
            .astimezone()
            .strftime(dateformat)
        )
        print(f"running transform and load with effective date {datestamp}")
    except ValueError:
        print("FATAL ERROR: malformed effective date, expected YYYY-MM-DD format")
        return

    # hydrate a dataset instance from the input data
    dataset = EtlDataset.load_from_json_file(file_path=deliverable_file)

    # sync data to db
    etldb.sync_db(dataset, datestamp)

    # finish
    print("transform and load is done")
145 changes: 145 additions & 0 deletions analytics/src/analytics/datasets/etl_dataset.py
@@ -0,0 +1,145 @@
"""
Implement the EtlDataset class.

This is a sub-class of BaseDataset that models
quad, deliverable, epic, issue, and sprint data.
"""

from enum import Enum
from typing import Any, Self

import pandas as pd
from numpy.typing import NDArray

from analytics.datasets.base import BaseDataset
from analytics.datasets.utils import load_json_data_as_df


class EtlEntityType(Enum):
"""Define entity types in the db schema."""

DELIVERABLE = "deliverable"
EPIC = "epic"
ISSUE = "issue"
SPRINT = "sprint"
QUAD = "quad"


class EtlDataset(BaseDataset):
"""Encapsulate data exported from github."""

COLUMN_MAP = {
"deliverable_url": "deliverable_ghid",
"deliverable_title": "deliverable_title",
"deliverable_pillar": "deliverable_pillar",
"epic_url": "epic_ghid",
"epic_title": "epic_title",
"issue_url": "issue_ghid",
"issue_title": "issue_title",
"issue_parent": "issue_parent",
"issue_type": "issue_type",
"issue_is_closed": "issue_is_closed",
"issue_opened_at": "issue_opened_at",
"issue_closed_at": "issue_closed_at",
"issue_points": "issue_points",
"issue_status": "issue_status",
"sprint_id": "sprint_ghid",
"sprint_name": "sprint_name",
"sprint_start": "sprint_start",
"sprint_length": "sprint_length",
"sprint_end": "sprint_end",
"quad_id": "quad_ghid",
"quad_name": "quad_name",
"quad_start": "quad_start",
"quad_length": "quad_length",
"quad_end": "quad_end",
}

@classmethod
def load_from_json_file(cls, file_path: str) -> Self:
"""
Load the input json file and instantiates an instance of EtlDataset.

Parameters
----------
file_path: str
Path to the local json file containing data exported from GitHub

Returns
-------
Self:
An instance of the EtlDataset dataset class
"""
# load input datasets
df = load_json_data_as_df(
file_path=file_path,
column_map=cls.COLUMN_MAP,
date_cols=None,
)

# transform entity id columns
prefix = "https://github.com/"
for col in ("deliverable_ghid", "epic_ghid", "issue_ghid", "issue_parent"):
df[col] = df[col].str.replace(prefix, "")

return cls(df)

# QUAD getters

def get_quad(self, quad_ghid: str) -> pd.Series:
"""Fetch data about a given quad."""
query_string = f"quad_ghid == '{quad_ghid}'"
return self.df.query(query_string).iloc[0]

def get_quad_ghids(self) -> NDArray[Any]:
"""Fetch an array of unique non-null quad ghids."""
df = self.df[self.df.quad_ghid.notna()]
return df.quad_ghid.unique()

# DELIVERABLE getters

def get_deliverable(self, deliverable_ghid: str) -> pd.Series:
"""Fetch data about a given deliverable."""
query_string = f"deliverable_ghid == '{deliverable_ghid}'"
return self.df.query(query_string).iloc[0]

def get_deliverable_ghids(self) -> NDArray[Any]:
"""Fetch an array of unique non-null deliverable ghids."""
df = self.df[self.df.deliverable_ghid.notna()]
return df.deliverable_ghid.unique()

# SPRINT getters

def get_sprint(self, sprint_ghid: str) -> pd.Series:
"""Fetch data about a given sprint."""
query_string = f"sprint_ghid == '{sprint_ghid}'"
return self.df.query(query_string).iloc[0]

def get_sprint_ghids(self) -> NDArray[Any]:
"""Fetch an array of unique non-null sprint ghids."""
df = self.df[self.df.sprint_ghid.notna()]
return df.sprint_ghid.unique()

# EPIC getters

def get_epic(self, epic_ghid: str) -> pd.Series:
"""Fetch data about a given epic."""
query_string = f"epic_ghid == '{epic_ghid}'"
return self.df.query(query_string).iloc[0]

def get_epic_ghids(self) -> NDArray[Any]:
"""Fetch an array of unique non-null epic ghids."""
df = self.df[self.df.epic_ghid.notna()]
return df.epic_ghid.unique()

# ISSUE getters

def get_issue(self, issue_ghid: str) -> pd.Series:
"""Fetch data about a given issue."""
query_string = f"issue_ghid == '{issue_ghid}'"
return self.df.query(query_string).iloc[0]

def get_issue_ghids(self) -> NDArray[Any]:
"""Fetch an array of unique non-null issue ghids."""
df = self.df[self.df.issue_ghid.notna()]
return df.issue_ghid.unique()
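For reference, a minimal usage sketch of EtlDataset against a toy in-memory frame. The rows and ghids below are invented for illustration, and it assumes BaseDataset accepts a DataFrame directly, as the `return cls(df)` above suggests:

import pandas as pd

from analytics.datasets.etl_dataset import EtlDataset

# toy frame with already-mapped column names (invented rows)
df = pd.DataFrame(
    {
        "issue_ghid": ["owner/repo/issues/1", "owner/repo/issues/2"],
        "issue_title": ["First issue", "Second issue"],
        "sprint_ghid": ["sprint-1", None],
    }
)

dataset = EtlDataset(df)  # assumes the BaseDataset constructor takes a DataFrame

# the get_*_ghids() helpers drop nulls before deduplicating
print(dataset.get_issue_ghids())   # ['owner/repo/issues/1' 'owner/repo/issues/2']
print(dataset.get_sprint_ghids())  # ['sprint-1']  (the None row is dropped)

# get_issue() returns the first matching row as a pandas Series
print(dataset.get_issue("owner/repo/issues/1")["issue_title"])  # First issue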
1 change: 0 additions & 1 deletion analytics/src/analytics/integrations/db.py
@@ -22,7 +22,6 @@ def get_db() -> Engine:
        A SQLAlchemy engine object representing the connection to the database.
    """
    db = get_db_settings()
-   print(f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}")
    return create_engine(
        f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}",
        pool_pre_ping=True,

Review comment (Collaborator): Yeah... please don't print out the password...

Reply (Collaborator Author): @coilysiren I agree with you. This is not my code; it was already there when I started.
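If a connection log line is still wanted for debugging, the URL can be logged with the password masked rather than removed entirely. A minimal sketch, assuming SQLAlchemy 1.4+ (whose URL.render_as_string(hide_password=True) masks the password); the function name and parameters here are hypothetical, not part of the PR:

import logging

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

logger = logging.getLogger(__name__)


def get_engine_with_safe_logging(user: str, password: str, host: str, port: int):
    """Create an engine and log the connection target without leaking the password."""
    url = URL.create(
        "postgresql+psycopg",
        username=user,
        password=password,
        host=host,
        port=port,
    )
    # hide_password=True renders the password component as '***'
    logger.info("connecting to %s", url.render_as_string(hide_password=True))
    return create_engine(url, pool_pre_ping=True)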
11 changes: 11 additions & 0 deletions analytics/src/analytics/integrations/etldb/__init__.py
@@ -0,0 +1,11 @@
"""Read and write data from/to delivery metrics database."""

__all__ = [
    "init_db",
    "sync_db",
]

from analytics.integrations.etldb.main import (
    init_db,
    sync_db,
)
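The two exported names mirror what the CLI commands above do; a sketch of driving them directly from Python, with the file path invented for illustration:

from analytics.datasets.etl_dataset import EtlDataset
from analytics.integrations import etldb

# create the gh_* tables if they do not already exist
etldb.init_db()

# hydrate a dataset from an exported JSON file (hypothetical path), then sync it
# with an effective date string, matching the call pattern in cli.py
dataset = EtlDataset.load_from_json_file(file_path="data/delivery-data.json")
etldb.sync_db(dataset, "2024-11-05")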
96 changes: 96 additions & 0 deletions analytics/src/analytics/integrations/etldb/create_etl_db.sql
@@ -0,0 +1,96 @@
CREATE TABLE IF NOT EXISTS gh_deliverable (
    id SERIAL PRIMARY KEY,
    ghid TEXT UNIQUE NOT NULL,
    title TEXT NOT NULL,
    pillar TEXT,
    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    t_modified TIMESTAMP
);

CREATE TABLE IF NOT EXISTS gh_deliverable_quad_map (
    id SERIAL PRIMARY KEY,
    deliverable_id INTEGER NOT NULL,
    quad_id INTEGER,
    d_effective DATE NOT NULL,
    t_modified TIMESTAMP,
    UNIQUE(deliverable_id, d_effective)
);
CREATE INDEX IF NOT EXISTS gh_dqm_i1 on gh_deliverable_quad_map(quad_id, d_effective);

CREATE TABLE IF NOT EXISTS gh_epic (
    id SERIAL PRIMARY KEY,
    ghid TEXT UNIQUE NOT NULL,
    title TEXT NOT NULL,
    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    t_modified TIMESTAMP
);

CREATE TABLE IF NOT EXISTS gh_epic_deliverable_map (
    id SERIAL PRIMARY KEY,
    epic_id INTEGER NOT NULL,
    deliverable_id INTEGER,
    d_effective DATE NOT NULL,
    t_modified TIMESTAMP,
    UNIQUE(epic_id, d_effective)
);
CREATE INDEX IF NOT EXISTS gh_edm_i1 on gh_epic_deliverable_map(deliverable_id, d_effective);

CREATE TABLE IF NOT EXISTS gh_issue (
    id SERIAL PRIMARY KEY,
    ghid TEXT UNIQUE NOT NULL,
    title TEXT NOT NULL,
    type TEXT NOT NULL,
    opened_date DATE,
    closed_date DATE,
    parent_issue_ghid TEXT,
    epic_id INTEGER,
    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    t_modified TIMESTAMP
);
CREATE INDEX IF NOT EXISTS gh_issue_i1 on gh_issue(epic_id);

CREATE TABLE IF NOT EXISTS gh_issue_history (
    id SERIAL PRIMARY KEY,
    issue_id INTEGER NOT NULL,
    status TEXT,
    is_closed INTEGER NOT NULL,
    points INTEGER NOT NULL DEFAULT 0,
    d_effective DATE NOT NULL,
    t_modified TIMESTAMP,
    UNIQUE(issue_id, d_effective)
);
CREATE INDEX IF NOT EXISTS gh_ih_i1 on gh_issue_history(issue_id, d_effective);

Review comment on gh_issue_history (Collaborator): One thought on this and other tables -- it might be helpful to have a t_created column as well for debugging purposes. That can be scoped into a future ticket though!

Reply (Collaborator Author): Good suggestion, and done: 8946b80

CREATE TABLE IF NOT EXISTS gh_issue_sprint_map (
    id SERIAL PRIMARY KEY,
    issue_id INTEGER NOT NULL,
    sprint_id INTEGER,
    d_effective DATE NOT NULL,
    t_modified TIMESTAMP,
    UNIQUE(issue_id, d_effective)
);

CREATE TABLE IF NOT EXISTS gh_sprint (
    id SERIAL PRIMARY KEY,
    ghid TEXT UNIQUE NOT NULL,
    name TEXT NOT NULL,
    start_date DATE,
    end_date DATE,
    duration INTEGER,
    quad_id INTEGER,
    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    t_modified TIMESTAMP
);

CREATE TABLE IF NOT EXISTS gh_quad (
    id SERIAL PRIMARY KEY,
    ghid TEXT UNIQUE NOT NULL,
    name TEXT NOT NULL,
    start_date DATE,
    end_date DATE,
    duration INTEGER,
    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    t_modified TIMESTAMP
);
CREATE INDEX IF NOT EXISTS gh_quad_i1 on gh_quad(start_date);