diff --git a/analytics/Makefile b/analytics/Makefile index 036fdfc6b..a9b1bd07d 100644 --- a/analytics/Makefile +++ b/analytics/Makefile @@ -12,12 +12,13 @@ PROJECT_CONFIG_FILE ?= $(CONFIG_DIR)/github-projects.json ISSUE_FILE ?= $(OUTPUT_DIR)/delivery-data.json SPRINT ?= @current # Names of the points and sprint fields in the GitHub project -POINTS_FIELD ?= Points +POINTS_FIELD ?= Story Points SPRINT_FIELD ?= Sprint UNIT ?= points ACTION ?= show-results MIN_TEST_COVERAGE ?= 80 APP_NAME ?= grants-analytics +EFFECTIVE_DATE ?= $(shell date +"%Y-%m-%d") # Required for CI to work properly SHELL = /bin/bash -o pipefail @@ -143,6 +144,20 @@ lint: ## runs code quality checks # Data Commands # ################# +init-db: + @echo "=> Initializing the database schema" + @echo "=====================================================" + $(POETRY) analytics etl initialize_database + @echo "=====================================================" + +gh-transform-and-load: + @echo "=> Transforming and loading GitHub data into the database" + @echo "=====================================================" + $(POETRY) analytics etl transform_and_load \ + --deliverable-file $(DELIVERY_FILE) \ + --effective-date $(EFFECTIVE_DATE) + @echo "=====================================================" + gh-db-data-import: @echo "=> Importing sprint data to the database" @echo "=====================================================" diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 37bea4334..1d9bde629 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -1,7 +1,9 @@ # pylint: disable=C0415 """Expose a series of CLI entrypoints for the analytics package.""" + import logging import logging.config +from datetime import datetime from pathlib import Path from typing import Annotated, Optional @@ -9,10 +11,11 @@ from slack_sdk import WebClient from sqlalchemy import text +from analytics.datasets.etl_dataset import EtlDataset from analytics.datasets.issues import GitHubIssues from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL from analytics.etl.utils import load_config -from analytics.integrations import db, slack +from analytics.integrations import db, etldb, slack from analytics.metrics.base import BaseMetric, Unit from analytics.metrics.burndown import SprintBurndown from analytics.metrics.burnup import SprintBurnup @@ -37,6 +40,8 @@ STATUS_ARG = typer.Option( help="Deliverable status to include in report, can be passed multiple times", ) +DELIVERABLE_FILE_ARG = typer.Option(help="Path to file with exported deliverable data") +EFFECTIVE_DATE_ARG = typer.Option(help="YYYY-MM-DD effective date to apply to each imported row") # fmt: on # instantiate the main CLI entrypoint @@ -45,10 +50,12 @@ export_app = typer.Typer() metrics_app = typer.Typer() import_app = typer.Typer() +etl_app = typer.Typer() # add sub-commands to main entrypoint app.add_typer(export_app, name="export", help="Export data needed to calculate metrics") app.add_typer(metrics_app, name="calculate", help="Calculate key project metrics") app.add_typer(import_app, name="import", help="Import data into the database") +app.add_typer(etl_app, name="etl", help="Transform and load local file") @app.callback() @@ -240,3 +247,45 @@ def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> No ) rows = len(issues.to_dict()) logger.info("Number of rows in table: %s", rows) + + +# =========================================================== +# Etl commands +# 
===========================================================
+
+
+@etl_app.command(name="initialize_database")
+def initialize_database() -> None:
+    """Initialize etl database."""
+    print("initializing database")
+    etldb.init_db()
+    print("done")
+
+
+@etl_app.command(name="transform_and_load")
+def transform_and_load(
+    deliverable_file: Annotated[str, DELIVERABLE_FILE_ARG],
+    effective_date: Annotated[str, EFFECTIVE_DATE_ARG],
+) -> None:
+    """Transform the exported GitHub data and load it into the etl database."""
+    # validate effective date arg
+    try:
+        dateformat = "%Y-%m-%d"
+        datestamp = (
+            datetime.strptime(effective_date, dateformat)
+            .astimezone()
+            .strftime(dateformat)
+        )
+        print(f"running transform and load with effective date {datestamp}")
+    except ValueError:
+        print("FATAL ERROR: malformed effective date, expected YYYY-MM-DD format")
+        return
+
+    # hydrate a dataset instance from the input data
+    dataset = EtlDataset.load_from_json_file(file_path=deliverable_file)
+
+    # sync data to db
+    etldb.sync_db(dataset, datestamp)
+
+    # finish
+    print("transform and load is done")
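Reviewer note: `initialize_database` and `transform_and_load` are thin wrappers over the `etldb` integration layer introduced below. A minimal sketch of the equivalent programmatic calls (the export path here is hypothetical):

```python
from analytics.datasets.etl_dataset import EtlDataset
from analytics.integrations import etldb

# one-time schema setup, same as: analytics etl initialize_database
etldb.init_db()

# same as: analytics etl transform_and_load \
#   --deliverable-file data/delivery-data.json --effective-date 2024-11-01
dataset = EtlDataset.load_from_json_file(file_path="data/delivery-data.json")
etldb.sync_db(dataset, "2024-11-01")
```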
diff --git a/analytics/src/analytics/datasets/etl_dataset.py b/analytics/src/analytics/datasets/etl_dataset.py
new file mode 100644
index 000000000..8469ab6e4
--- /dev/null
+++ b/analytics/src/analytics/datasets/etl_dataset.py
@@ -0,0 +1,145 @@
+"""
+Implement the EtlDataset class.
+
+This is a sub-class of BaseDataset that models
+quad, deliverable, epic, issue, and sprint data.
+"""
+
+from enum import Enum
+from typing import Any, Self
+
+import pandas as pd
+from numpy.typing import NDArray
+
+from analytics.datasets.base import BaseDataset
+from analytics.datasets.utils import load_json_data_as_df
+
+
+class EtlEntityType(Enum):
+    """Define entity types in the db schema."""
+
+    DELIVERABLE = "deliverable"
+    EPIC = "epic"
+    ISSUE = "issue"
+    SPRINT = "sprint"
+    QUAD = "quad"
+
+
+class EtlDataset(BaseDataset):
+    """Encapsulate data exported from GitHub."""
+
+    COLUMN_MAP = {
+        "deliverable_url": "deliverable_ghid",
+        "deliverable_title": "deliverable_title",
+        "deliverable_pillar": "deliverable_pillar",
+        "epic_url": "epic_ghid",
+        "epic_title": "epic_title",
+        "issue_url": "issue_ghid",
+        "issue_title": "issue_title",
+        "issue_parent": "issue_parent",
+        "issue_type": "issue_type",
+        "issue_is_closed": "issue_is_closed",
+        "issue_opened_at": "issue_opened_at",
+        "issue_closed_at": "issue_closed_at",
+        "issue_points": "issue_points",
+        "issue_status": "issue_status",
+        "sprint_id": "sprint_ghid",
+        "sprint_name": "sprint_name",
+        "sprint_start": "sprint_start",
+        "sprint_length": "sprint_length",
+        "sprint_end": "sprint_end",
+        "quad_id": "quad_ghid",
+        "quad_name": "quad_name",
+        "quad_start": "quad_start",
+        "quad_length": "quad_length",
+        "quad_end": "quad_end",
+    }
+
+    @classmethod
+    def load_from_json_file(cls, file_path: str) -> Self:
+        """
+        Load the input JSON file and instantiate an instance of EtlDataset.
+
+        Parameters
+        ----------
+        file_path: str
+            Path to the local json file containing data exported from GitHub
+
+        Returns
+        -------
+        Self:
+            An instance of the EtlDataset class
+        """
+        # load input datasets
+        df = load_json_data_as_df(
+            file_path=file_path,
+            column_map=cls.COLUMN_MAP,
+            date_cols=None,
+        )
+
+        # transform entity id columns
+        prefix = "https://github.com/"
+        for col in ("deliverable_ghid", "epic_ghid", "issue_ghid", "issue_parent"):
+            df[col] = df[col].str.replace(prefix, "")
+
+        return cls(df)
+
+    # QUAD getters
+
+    def get_quad(self, quad_ghid: str) -> pd.Series:
+        """Fetch data about a given quad."""
+        query_string = f"quad_ghid == '{quad_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_quad_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null quad ghids."""
+        df = self.df[self.df.quad_ghid.notna()]
+        return df.quad_ghid.unique()
+
+    # DELIVERABLE getters
+
+    def get_deliverable(self, deliverable_ghid: str) -> pd.Series:
+        """Fetch data about a given deliverable."""
+        query_string = f"deliverable_ghid == '{deliverable_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_deliverable_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null deliverable ghids."""
+        df = self.df[self.df.deliverable_ghid.notna()]
+        return df.deliverable_ghid.unique()
+
+    # SPRINT getters
+
+    def get_sprint(self, sprint_ghid: str) -> pd.Series:
+        """Fetch data about a given sprint."""
+        query_string = f"sprint_ghid == '{sprint_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_sprint_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null sprint ghids."""
+        df = self.df[self.df.sprint_ghid.notna()]
+        return df.sprint_ghid.unique()
+
+    # EPIC getters
+
+    def get_epic(self, epic_ghid: str) -> pd.Series:
+        """Fetch data about a given epic."""
+        query_string = f"epic_ghid == '{epic_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_epic_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null epic ghids."""
+        df = self.df[self.df.epic_ghid.notna()]
+        return df.epic_ghid.unique()
+
+    # ISSUE getters
+
+    def get_issue(self, issue_ghid: str) -> pd.Series:
+        """Fetch data about a given issue."""
+        query_string = f"issue_ghid == '{issue_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_issue_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null issue ghids."""
+        df = self.df[self.df.issue_ghid.notna()]
+        return df.issue_ghid.unique()
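The getters above all follow one pattern: filter to rows with a non-null ghid for the entity, then return the first matching row as a `pandas.Series`. A short usage sketch (the file path is hypothetical; ghids are GitHub URLs with the `https://github.com/` prefix stripped by `load_from_json_file`):

```python
from analytics.datasets.etl_dataset import EtlDataset

dataset = EtlDataset.load_from_json_file(file_path="data/delivery-data.json")

for ghid in dataset.get_epic_ghids():  # unique, non-null epic ghids
    epic = dataset.get_epic(ghid)  # first row matching this ghid
    print(ghid, "->", epic["epic_title"])
```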
+ Only needed if the JSON loaded is an object instead of a list + + Returns + ------- + pd.DataFrame + Pandas dataframe with columns renamed to match the values of the column map + """ + # load json data from the local file + with open(file_path, encoding="utf-8") as f: + json_data = json.loads(f.read()) + # if the items we want to convert are nested under a key extract them + if key_for_nested_items: + json_data = json_data[key_for_nested_items] + # flatten the nested json into a dataframe + df = pd.json_normalize(json_data) + # reorder and rename the columns + df = df[column_map.keys()] + df = df.rename(columns=column_map) + # convert datetime columns to date + if date_cols: + for col in date_cols: + # strip off the timestamp portion of the date + df[col] = pd.to_datetime(df[col]).dt.floor("d") + return df + def load_json_file(path: str) -> list[dict]: """Load contents of a JSON file into a dictionary.""" diff --git a/analytics/src/analytics/integrations/db.py b/analytics/src/analytics/integrations/db.py index 89bdeaa09..e3314ec0b 100644 --- a/analytics/src/analytics/integrations/db.py +++ b/analytics/src/analytics/integrations/db.py @@ -22,7 +22,6 @@ def get_db() -> Engine: A SQLAlchemy engine object representing the connection to the database. """ db = get_db_settings() - print(f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}") return create_engine( f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}", pool_pre_ping=True, diff --git a/analytics/src/analytics/integrations/etldb/__init__.py b/analytics/src/analytics/integrations/etldb/__init__.py new file mode 100644 index 000000000..c1afd0946 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/__init__.py @@ -0,0 +1,11 @@ +"""Read and write data from/to delivery metrics database.""" + +__all__ = [ + "init_db", + "sync_db", +] + +from analytics.integrations.etldb.main import ( + init_db, + sync_db, +) diff --git a/analytics/src/analytics/integrations/etldb/create_etl_db.sql b/analytics/src/analytics/integrations/etldb/create_etl_db.sql new file mode 100644 index 000000000..304c4c95b --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/create_etl_db.sql @@ -0,0 +1,100 @@ +CREATE TABLE IF NOT EXISTS gh_deliverable ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + pillar TEXT, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_deliverable_quad_map ( + id SERIAL PRIMARY KEY, + deliverable_id INTEGER NOT NULL, + quad_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(deliverable_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_dqm_i1 on gh_deliverable_quad_map(quad_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_epic ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_epic_deliverable_map ( + id SERIAL PRIMARY KEY, + epic_id INTEGER NOT NULL, + deliverable_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(epic_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_edm_i1 on gh_epic_deliverable_map(deliverable_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_issue ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + type TEXT NOT NULL, + opened_date DATE, + closed_date DATE, + 
parent_issue_ghid TEXT, + epic_id INTEGER, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); +CREATE INDEX IF NOT EXISTS gh_issue_i1 on gh_issue(epic_id); + +CREATE TABLE IF NOT EXISTS gh_issue_history ( + id SERIAL PRIMARY KEY, + issue_id INTEGER NOT NULL, + status TEXT, + is_closed INTEGER NOT NULL, + points INTEGER NOT NULL DEFAULT 0, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(issue_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_ih_i1 on gh_issue_history(issue_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_issue_sprint_map ( + id SERIAL PRIMARY KEY, + issue_id INTEGER NOT NULL, + sprint_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(issue_id, d_effective) +); + +CREATE TABLE IF NOT EXISTS gh_sprint ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, + start_date DATE, + end_date DATE, + duration INTEGER, + quad_id INTEGER, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_quad ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, + start_date DATE, + end_date DATE, + duration INTEGER, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); +CREATE INDEX IF NOT EXISTS gh_quad_i1 on gh_quad(start_date); + diff --git a/analytics/src/analytics/integrations/etldb/deliverable_model.py b/analytics/src/analytics/integrations/etldb/deliverable_model.py new file mode 100644 index 000000000..0f0d8cd35 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/deliverable_model.py @@ -0,0 +1,149 @@ +"""Define EtlDeliverableModel class to encapsulate db CRUD operations.""" + +from pandas import Series +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlEntityType +from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb + + +class EtlDeliverableModel: + """Encapsulate CRUD operations for deliverable entity.""" + + def __init__(self, dbh: EtlDb) -> None: + """Instantiate a class instance.""" + self.dbh = dbh + + def sync_deliverable( + self, + deliverable_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Write deliverable data to etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # insert dimensions + deliverable_id = self._insert_dimensions(deliverable_df) + if deliverable_id is not None: + change_type = EtlChangeType.INSERT + + # if insert failed, select and update + if deliverable_id is None: + deliverable_id, change_type = self._update_dimensions(deliverable_df) + + # insert facts + if deliverable_id is not None: + self._insert_facts(deliverable_id, deliverable_df, ghid_map) + + return deliverable_id, change_type + + def _insert_dimensions(self, deliverable_df: Series) -> int | None: + """Write deliverable dimension data to etl database.""" + # insert into dimension table: deliverable + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_deliverable(ghid, title, pillar) " + "values (:ghid, :title, :pillar) " + "on conflict(ghid) do nothing returning id", + ), + { + "ghid": deliverable_df["deliverable_ghid"], + "title": deliverable_df["deliverable_title"], + "pillar": deliverable_df["deliverable_pillar"], + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def 
_insert_facts(
+        self,
+        deliverable_id: int,
+        deliverable_df: Series,
+        ghid_map: dict,
+    ) -> int | None:
+        """Write deliverable fact data to etl database."""
+        # insert into fact table: deliverable_quad_map
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_deliverable_quad_map(deliverable_id, quad_id, d_effective) "
+                "values (:deliverable_id, :quad_id, :effective) "
+                "on conflict(deliverable_id, d_effective) do update "
+                "set (quad_id, t_modified) = (:quad_id, current_timestamp) returning id",
+            ),
+            {
+                "deliverable_id": deliverable_id,
+                "quad_id": ghid_map[EtlEntityType.QUAD].get(
+                    deliverable_df["quad_ghid"],
+                ),
+                "effective": self.dbh.effective_date,
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _update_dimensions(
+        self,
+        deliverable_df: Series,
+    ) -> tuple[int | None, EtlChangeType]:
+        """Update deliverable dimension data in etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # get new values
+        new_title = deliverable_df["deliverable_title"]
+        new_pillar = deliverable_df["deliverable_pillar"]
+        new_values = (new_title, new_pillar)
+
+        # select old values
+        deliverable_id, old_title, old_pillar = self._select(
+            deliverable_df["deliverable_ghid"],
+        )
+        old_values = (old_title, old_pillar)
+
+        # compare
+        if deliverable_id is not None and new_values != old_values:
+            change_type = EtlChangeType.UPDATE
+            cursor = self.dbh.connection()
+            update_sql = text(
+                "update gh_deliverable set title = :new_title, pillar = :new_pillar, "
+                "t_modified = current_timestamp where id = :deliverable_id",
+            )
+            update_values = {
+                "new_title": new_title,
+                "new_pillar": new_pillar,
+                "deliverable_id": deliverable_id,
+            }
+            cursor.execute(update_sql, update_values)
+            self.dbh.commit(cursor)
+
+        return deliverable_id, change_type
+
+    def _select(self, ghid: str) -> tuple[int | None, str | None, str | None]:
+        """Select deliverable data from etl database."""
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text("select id, title, pillar from gh_deliverable where ghid = :ghid"),
+            {"ghid": ghid},
+        )
+        row = result.fetchone()
+        if row:
+            return row[0], row[1], row[2]
+
+        return None, None, None
diff --git a/analytics/src/analytics/integrations/etldb/epic_model.py b/analytics/src/analytics/integrations/etldb/epic_model.py
new file mode 100644
index 000000000..af0fdf45d
--- /dev/null
+++ b/analytics/src/analytics/integrations/etldb/epic_model.py
@@ -0,0 +1,137 @@
+"""Define EtlEpicModel class to encapsulate db CRUD operations."""
+
+from pandas import Series
+from sqlalchemy import text
+
+from analytics.datasets.etl_dataset import EtlEntityType
+from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb
+
+
+class EtlEpicModel:
+    """Encapsulate CRUD operations for epic entity."""
+
+    def __init__(self, dbh: EtlDb) -> None:
+        """Instantiate a class instance."""
+        self.dbh = dbh
+
+    def sync_epic(
+        self,
+        epic_df: Series,
+        ghid_map: dict,
+    ) -> tuple[int | None, EtlChangeType]:
+        """Write epic data to etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # insert dimensions
+        epic_id = self._insert_dimensions(epic_df)
+        if epic_id is not None:
+            change_type = EtlChangeType.INSERT
+
+        # if insert failed, select and update
+        if epic_id is None:
+            epic_id, change_type = self._update_dimensions(epic_df)
+
+        # insert facts
+        if epic_id is not None:
+            
self._insert_facts(epic_id, epic_df, ghid_map) + + return epic_id, change_type + + def _insert_dimensions(self, epic_df: Series) -> int | None: + """Write epic dimension data to etl database.""" + # insert into dimension table: epic + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_epic(ghid, title) values (:ghid, :title) " + "on conflict(ghid) do nothing returning id", + ), + { + "ghid": epic_df["epic_ghid"], + "title": epic_df["epic_title"], + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _insert_facts( + self, + epic_id: int, + epic_df: Series, + ghid_map: dict, + ) -> int | None: + """Write epic fact data to etl database.""" + # insert into fact table: epic_deliverable_map + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_epic_deliverable_map(epic_id, deliverable_id, d_effective) " + "values (:epic_id, :deliverable_id, :effective) " + "on conflict(epic_id, d_effective) do update " + "set (deliverable_id, t_modified) = (:deliverable_id, current_timestamp) " + "returning id", + ), + { + "deliverable_id": ghid_map[EtlEntityType.DELIVERABLE].get( + epic_df["deliverable_ghid"], + ), + "epic_id": epic_id, + "effective": self.dbh.effective_date, + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _update_dimensions(self, epic_df: Series) -> tuple[int | None, EtlChangeType]: + """Update epic dimension data in etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # get new values + new_title = epic_df["epic_title"] + + # select old values + epic_id, old_title = self._select(epic_df["epic_ghid"]) + + # compare + if epic_id is not None and (new_title,) != (old_title,): + change_type = EtlChangeType.UPDATE + cursor = self.dbh.connection() + cursor.execute( + text( + "update gh_epic set title = :new_title, t_modified = current_timestamp " + "where id = :epic_id", + ), + {"new_title": new_title, "epic_id": epic_id}, + ) + self.dbh.commit(cursor) + + return epic_id, change_type + + def _select(self, ghid: str) -> tuple[int | None, str | None]: + """Select epic data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text("select id, title from gh_epic where ghid = :ghid"), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1] + + return None, None diff --git a/analytics/src/analytics/integrations/etldb/etldb.py b/analytics/src/analytics/integrations/etldb/etldb.py new file mode 100644 index 000000000..7a25faed3 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/etldb.py @@ -0,0 +1,44 @@ +"""Define EtlDb as an abstraction layer for database connections.""" + +from enum import Enum + +from sqlalchemy import Connection + +from analytics.integrations import db + + +class EtlDb: + """Encapsulate etl database connections.""" + + def __init__(self, effective: str | None = None) -> None: + """Construct instance.""" + self._db_engine = db.get_db() + self._connection: Connection | None = None + self.effective_date = effective + self.dateformat = "%Y-%m-%d" + + def __del__(self) -> None: + """Destroy instance.""" + self.disconnect() + + def connection(self) -> Connection: + """Get a connection object from the db engine.""" + if self._connection is None: + self._connection = self._db_engine.connect() + return self._connection + + def 
commit(self, connection: Connection) -> None: + """Commit an open transaction.""" + connection.commit() + + def disconnect(self) -> None: + """Dispose of db connection.""" + self._db_engine.dispose() + + +class EtlChangeType(Enum): + """An enum to describe ETL change types.""" + + NONE = 0 + INSERT = 1 + UPDATE = 2 diff --git a/analytics/src/analytics/integrations/etldb/issue_model.py b/analytics/src/analytics/integrations/etldb/issue_model.py new file mode 100644 index 000000000..36740438d --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/issue_model.py @@ -0,0 +1,202 @@ +"""Define EtlIssueModel class to encapsulate db CRUD operations.""" + +from datetime import datetime + +from pandas import Series +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlEntityType +from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb + + +class EtlIssueModel: + """Encapsulate CRUD operations for issue entity.""" + + def __init__(self, dbh: EtlDb) -> None: + """Instantiate a class instance.""" + self.dbh = dbh + + def sync_issue( + self, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Write issue data to etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # insert dimensions + issue_id = self._insert_dimensions(issue_df, ghid_map) + if issue_id is not None: + change_type = EtlChangeType.INSERT + + # if insert failed, select and update + if issue_id is None: + issue_id, change_type = self._update_dimensions(issue_df, ghid_map) + + # insert facts + if issue_id is not None: + self._insert_facts(issue_id, issue_df, ghid_map) + + return issue_id, change_type + + def _insert_dimensions(self, issue_df: Series, ghid_map: dict) -> int | None: + """Write issue dimension data to etl database.""" + # insert into dimension table: issue + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_issue " + "(ghid, title, type, opened_date, closed_date, parent_issue_ghid, epic_id) " + "values (:ghid, :title, :type, :opened_date, :closed_date, :parent_ghid, :epic_id) " + "on conflict(ghid) do nothing returning id", + ), + { + "ghid": issue_df["issue_ghid"], + "title": issue_df["issue_title"], + "type": issue_df["issue_type"] or "None", + "opened_date": issue_df["issue_opened_at"], + "closed_date": issue_df["issue_closed_at"], + "parent_ghid": issue_df["issue_parent"], + "epic_id": ghid_map[EtlEntityType.EPIC].get(issue_df["epic_ghid"]), + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _insert_facts( + self, + issue_id: int, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, int | None]: + """Write issue fact data to etl database.""" + # get values needed for sql statement + issue_df = issue_df.fillna(0) + insert_values = { + "issue_id": issue_id, + "status": issue_df["issue_status"], + "is_closed": int(issue_df["issue_is_closed"]), + "points": issue_df["issue_points"], + "sprint_id": ghid_map[EtlEntityType.SPRINT].get(issue_df["sprint_ghid"]), + "effective": self.dbh.effective_date, + } + history_id = None + map_id = None + + # insert into fact table: issue_history + cursor = self.dbh.connection() + insert_sql1 = text( + "insert into gh_issue_history (issue_id, status, is_closed, points, d_effective) " + "values (:issue_id, :status, :is_closed, :points, :effective) " + "on conflict (issue_id, d_effective) " + "do update set (status, is_closed, points, t_modified) 
= " + "(:status, :is_closed, :points, current_timestamp) " + "returning id", + ) + result1 = cursor.execute(insert_sql1, insert_values) + row1 = result1.fetchone() + if row1: + history_id = row1[0] + + # insert into fact table: issue_sprint_map + insert_sql2 = text( + "insert into gh_issue_sprint_map (issue_id, sprint_id, d_effective) " + "values (:issue_id, :sprint_id, :effective) " + "on conflict (issue_id, d_effective) " + "do update set (sprint_id, t_modified) = " + "(:sprint_id, current_timestamp) returning id", + ) + result2 = cursor.execute(insert_sql2, insert_values) + row2 = result2.fetchone() + if row2: + map_id = row2[0] + + # commit + self.dbh.commit(cursor) + + return history_id, map_id + + def _update_dimensions( + self, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Update issue dimension data in etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # get new values + new_values = ( + issue_df["issue_title"], + issue_df["issue_type"] or "None", + issue_df["issue_opened_at"], + issue_df["issue_closed_at"], + issue_df["issue_parent"], + ghid_map[EtlEntityType.EPIC].get(issue_df["epic_ghid"]), + ) + + # select old values + issue_id, o_title, o_type, o_opened, o_closed, o_parent, o_epic_id = ( + self._select(issue_df["issue_ghid"]) + ) + old_values = (o_title, o_type, o_opened, o_closed, o_parent, o_epic_id) + + # compare + if issue_id is not None and new_values != old_values: + change_type = EtlChangeType.UPDATE + cursor = self.dbh.connection() + cursor.execute( + text( + "update gh_issue set " + "title = :new_title, type = :new_type, opened_date = :new_opened, " + "closed_date = :new_closed, parent_issue_ghid = :new_parent, " + "epic_id = :new_epic_id, t_modified = current_timestamp " + "where id = :issue_id", + ), + { + "new_title": issue_df["issue_title"], + "new_type": issue_df["issue_type"] or "None", + "new_opened": issue_df["issue_opened_at"], + "new_closed": issue_df["issue_closed_at"], + "new_parent": issue_df["issue_parent"], + "new_epic_id": ghid_map[EtlEntityType.EPIC].get( + issue_df["epic_ghid"], + ), + "issue_id": issue_id, + }, + ) + self.dbh.commit(cursor) + + return issue_id, change_type + + def _select(self, ghid: str) -> tuple[ + int | None, + str | None, + str | None, + datetime | None, + datetime | None, + str | None, + int | None, + ]: + """Select issue data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text( + "select id, title, type, opened_date, closed_date, parent_issue_ghid, epic_id " + "from gh_issue where ghid = :ghid", + ), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1], row[2], row[3], row[4], row[5], row[6] + + return None, None, None, None, None, None, None diff --git a/analytics/src/analytics/integrations/etldb/main.py b/analytics/src/analytics/integrations/etldb/main.py new file mode 100644 index 000000000..11f790bda --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/main.py @@ -0,0 +1,134 @@ +"""Integrate with database to read and write etl data.""" + +from pathlib import Path + +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlDataset, EtlEntityType +from analytics.integrations.etldb.deliverable_model import EtlDeliverableModel +from analytics.integrations.etldb.epic_model import EtlEpicModel +from analytics.integrations.etldb.etldb import EtlDb +from analytics.integrations.etldb.issue_model import EtlIssueModel +from analytics.integrations.etldb.quad_model import 
EtlQuadModel
+from analytics.integrations.etldb.sprint_model import EtlSprintModel
+
+VERBOSE = False
+
+
+def init_db() -> None:
+    """Initialize etl database."""
+    # define the path to the sql file
+    parent_path = Path(__file__).resolve().parent
+    sql_path = f"{parent_path}/create_etl_db.sql"
+
+    # read sql file
+    with open(sql_path, encoding="utf-8") as f:
+        sql = f.read()
+
+    # execute sql
+    db = EtlDb()
+    cursor = db.connection()
+    cursor.execute(
+        text(sql),
+    )
+    db.commit(cursor)
+
+
+def sync_db(dataset: EtlDataset, effective: str) -> None:
+    """Write GitHub data to etl database."""
+    # initialize a map of github id to db row id
+    ghid_map: dict[EtlEntityType, dict[str, int]] = {
+        EtlEntityType.DELIVERABLE: {},
+        EtlEntityType.EPIC: {},
+        EtlEntityType.SPRINT: {},
+        EtlEntityType.QUAD: {},
+    }
+
+    # initialize db connection
+    db = EtlDb(effective)
+
+    # sync quad data to db resulting in row id for each quad
+    ghid_map[EtlEntityType.QUAD] = sync_quads(db, dataset)
+    print(f"quad row(s) processed: {len(ghid_map[EtlEntityType.QUAD])}")
+
+    # sync deliverable data to db resulting in row id for each deliverable
+    ghid_map[EtlEntityType.DELIVERABLE] = sync_deliverables(
+        db,
+        dataset,
+        ghid_map,
+    )
+    print(f"deliverable row(s) processed: {len(ghid_map[EtlEntityType.DELIVERABLE])}")
+
+    # sync sprint data to db resulting in row id for each sprint
+    ghid_map[EtlEntityType.SPRINT] = sync_sprints(db, dataset, ghid_map)
+    print(f"sprint row(s) processed: {len(ghid_map[EtlEntityType.SPRINT])}")
+
+    # sync epic data to db resulting in row id for each epic
+    ghid_map[EtlEntityType.EPIC] = sync_epics(db, dataset, ghid_map)
+    print(f"epic row(s) processed: {len(ghid_map[EtlEntityType.EPIC])}")
+
+    # sync issue data to db resulting in row id for each issue
+    issue_map = sync_issues(db, dataset, ghid_map)
+    print(f"issue row(s) processed: {len(issue_map)}")
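Reviewer note: `sync_db` orders the entity syncs so that surrogate keys exist before they are referenced: quads before deliverables and sprints, epics before issues. Each step fills one entry of `ghid_map`, which translates GitHub ids into database row ids for the later steps. Illustrative shape after the first two steps (the row-id values here are hypothetical):

```python
from analytics.datasets.etl_dataset import EtlEntityType

ghid_map = {
    EtlEntityType.QUAD: {"de5f962b": 1},
    EtlEntityType.DELIVERABLE: {"agilesix/simpler-grants-sandbox/issues/2": 7},
    EtlEntityType.SPRINT: {},
    EtlEntityType.EPIC: {},
}
```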
+
+
+def sync_deliverables(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict:
+    """Insert or update (if necessary) a row for each deliverable and return a map of row ids."""
+    result = {}
+    model = EtlDeliverableModel(db)
+    for ghid in dataset.get_deliverable_ghids():
+        deliverable_df = dataset.get_deliverable(ghid)
+        result[ghid], _ = model.sync_deliverable(deliverable_df, ghid_map)
+        if VERBOSE:
+            print(f"DELIVERABLE '{ghid}' row_id = {result[ghid]}")
+    return result
+
+
+def sync_epics(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict:
+    """Insert or update (if necessary) a row for each epic and return a map of row ids."""
+    result = {}
+    model = EtlEpicModel(db)
+    for ghid in dataset.get_epic_ghids():
+        epic_df = dataset.get_epic(ghid)
+        result[ghid], _ = model.sync_epic(epic_df, ghid_map)
+        if VERBOSE:
+            print(f"EPIC '{ghid}' row_id = {result[ghid]}")
+    return result
+
+
+def sync_issues(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict:
+    """Insert or update (if necessary) a row for each issue and return a map of row ids."""
+    result = {}
+    model = EtlIssueModel(db)
+    for ghid in dataset.get_issue_ghids():
+        issue_df = dataset.get_issue(ghid)
+        result[ghid], _ = model.sync_issue(issue_df, ghid_map)
+        if VERBOSE:
+            print(f"ISSUE '{ghid}' issue_id = {result[ghid]}")
+    return result
+
+
+def sync_sprints(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict:
+    """Insert or update (if necessary) a row for each sprint and return a map of row ids."""
+    result = {}
+    model = EtlSprintModel(db)
+    for ghid in dataset.get_sprint_ghids():
+        sprint_df = dataset.get_sprint(ghid)
+        result[ghid], _ = model.sync_sprint(sprint_df, ghid_map)
+        if VERBOSE:
+            print(f"SPRINT '{ghid}' row_id = {result[ghid]}")
+    return result
+
+
+def sync_quads(db: EtlDb, dataset: EtlDataset) -> dict:
+    """Insert or update (if necessary) a row for each quad and return a map of row ids."""
+    result = {}
+    model = EtlQuadModel(db)
+    for ghid in dataset.get_quad_ghids():
+        quad_df = dataset.get_quad(ghid)
+        result[ghid], _ = model.sync_quad(quad_df)
+        if VERBOSE:
+            print(
+                f"QUAD '{ghid}' title = '{quad_df['quad_name']}', row_id = {result[ghid]}",
+            )
+    return result
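Reviewer note: every model class below takes an `EtlDb` handle, pulls its shared connection, and commits explicitly. A minimal sketch of that lifecycle (the query itself is illustrative only):

```python
from sqlalchemy import text

from analytics.integrations.etldb.etldb import EtlDb

dbh = EtlDb(effective="2024-11-01")
cursor = dbh.connection()  # lazily opens and caches a single connection
count = cursor.execute(text("select count(*) from gh_quad")).fetchone()[0]
print(f"quads on record: {count}")
dbh.commit(cursor)  # the models call this after each insert/update
```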
diff --git a/analytics/src/analytics/integrations/etldb/quad_model.py b/analytics/src/analytics/integrations/etldb/quad_model.py
new file mode 100644
index 000000000..6324710ec
--- /dev/null
+++ b/analytics/src/analytics/integrations/etldb/quad_model.py
@@ -0,0 +1,129 @@
+"""Define EtlQuadModel class to encapsulate db CRUD operations."""
+
+from datetime import datetime
+
+from pandas import Series
+from sqlalchemy import text
+
+from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb
+
+
+class EtlQuadModel:
+    """Encapsulate CRUD operations for quad entity."""
+
+    def __init__(self, dbh: EtlDb) -> None:
+        """Instantiate a class instance."""
+        self.dbh = dbh
+
+    def sync_quad(self, quad_df: Series) -> tuple[int | None, EtlChangeType]:
+        """Write quad data to etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # insert dimensions
+        quad_id = self._insert_dimensions(quad_df)
+        if quad_id is not None:
+            change_type = EtlChangeType.INSERT
+
+        # if insert failed, then select and update
+        if quad_id is None:
+            quad_id, change_type = self._update_dimensions(quad_df)
+
+        return quad_id, change_type
+
+    def _insert_dimensions(self, quad_df: Series) -> int | None:
+        """Write quad dimension data to etl database."""
+        # insert into dimension table: quad
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_quad(ghid, name, start_date, end_date, duration) "
+                "values (:ghid, :name, :start_date, :end_date, :duration) "
+                "on conflict(ghid) do nothing returning id",
+            ),
+            {
+                "ghid": quad_df["quad_ghid"],
+                "name": quad_df["quad_name"],
+                "start_date": quad_df["quad_start"],
+                "end_date": quad_df["quad_end"],
+                "duration": quad_df["quad_length"],
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _update_dimensions(self, quad_df: Series) -> tuple[int | None, EtlChangeType]:
+        """Update quad dimension data in etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # get new values
+        new_values = (
+            quad_df["quad_name"],
+            quad_df["quad_start"],
+            quad_df["quad_end"],
+            int(quad_df["quad_length"]),
+        )
+
+        # select old values
+        quad_id, old_name, old_start, old_end, old_duration = self._select(
+            quad_df["quad_ghid"],
+        )
+        old_values = (
+            old_name,
+            old_start.strftime(self.dbh.dateformat) if old_start is not None else None,
+            old_end.strftime(self.dbh.dateformat) if old_end is not None else None,
+            old_duration,
+        )
+
+        # compare
+        if quad_id is not None and new_values != old_values:
+            change_type = EtlChangeType.UPDATE
+            cursor = self.dbh.connection()
+            cursor.execute(
+                text(
+                    "update gh_quad set name = :new_name, "
+                    "start_date = :new_start, end_date = :new_end, "
+                    "duration = :new_duration, t_modified = current_timestamp "
+                    "where id = :quad_id",
+                ),
+                {
+                    "new_name": new_values[0],
+                    "new_start": new_values[1],
+                    "new_end": new_values[2],
+                    "new_duration": new_values[3],
+                    "quad_id": quad_id,
+                },
+            )
+            self.dbh.commit(cursor)
+
+        return quad_id, change_type
+
+    def _select(self, ghid: str) -> tuple[
+        int | None,
+        str | None,
+        datetime | None,
+        datetime | None,
+        int | None,
+    ]:
+        """Select quad data from etl database."""
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "select id, name, start_date, end_date, duration "
+                "from gh_quad where ghid = :ghid",
+            ),
+            {"ghid": ghid},
+        )
+        row = result.fetchone()
+        if row:
+            return row[0], row[1], row[2], row[3], row[4]
+
+        return None, None, None, None, None
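Reviewer note: `EtlQuadModel` and `EtlSprintModel` (next file) share the same write pattern for dimension rows: try `INSERT ... ON CONFLICT (ghid) DO NOTHING RETURNING id`; when no id comes back, the row already exists, so select it, compare old and new values, and update only on a real change. A distilled sketch of that control flow (the helper is hypothetical and narrowed to two `gh_quad` columns for illustration):

```python
from sqlalchemy import text

from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb


def upsert_quad_name(
    dbh: EtlDb,
    ghid: str,
    name: str,
) -> tuple[int | None, EtlChangeType]:
    """Hypothetical distillation of the insert-then-update pattern above."""
    cursor = dbh.connection()
    row = cursor.execute(
        text(
            "insert into gh_quad(ghid, name) values (:ghid, :name) "
            "on conflict(ghid) do nothing returning id",
        ),
        {"ghid": ghid, "name": name},
    ).fetchone()
    dbh.commit(cursor)
    if row:
        return row[0], EtlChangeType.INSERT  # brand-new row
    # conflict: row already exists; a model would now select, diff, and update
    return None, EtlChangeType.NONE
```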
"end_date = :new_end, duration = :new_duration, quad_id = :quad_id, " + "t_modified = current_timestamp where id = :sprint_id", + ), + { + "new_name": new_values[0], + "new_start": new_values[1], + "new_end": new_values[2], + "new_duration": new_values[3], + "quad_id": new_values[4], + "sprint_id": sprint_id, + }, + ) + self.dbh.commit(cursor) + + return sprint_id, change_type + + def _select(self, ghid: str) -> tuple[ + int | None, + str | None, + str | None, + str | None, + int | None, + int | None, + ]: + """Select epic data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text( + "select id, name, start_date, end_date, duration, quad_id " + "from gh_sprint where ghid = :ghid", + ), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1], row[2], row[3], row[4], row[5] + + return None, None, None, None, None, None diff --git a/analytics/tests/datasets/test_etldb.py b/analytics/tests/datasets/test_etldb.py new file mode 100644 index 000000000..042022a89 --- /dev/null +++ b/analytics/tests/datasets/test_etldb.py @@ -0,0 +1,83 @@ +"""Tests the code in datasets/etl_dataset.py.""" + +from analytics.datasets.etl_dataset import EtlDataset + + +class TestEtlDataset: + """Test EtlDataset methods.""" + + TEST_FILE_1 = "./tests/etldb_test_01.json" + + def test_load_from_json_files(self): + """Class method should return the correctly transformed data.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + row_count = dataset.df.shape[0] + col_count = dataset.df.shape[1] + assert row_count == 22 + assert col_count == 24 + + def test_deliverable_fetchers(self): + """Deliverable fetchers should return expected values.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + unique_ghids = dataset.get_deliverable_ghids() + assert len(unique_ghids) == 2 + + ghid = unique_ghids[0] + assert ghid == "agilesix/simpler-grants-sandbox/issues/2" + + deliverable = dataset.get_deliverable(ghid) + assert deliverable["deliverable_title"] == "Opportunity listing page" + + def test_epic_fetchers(self): + """Epic fetchers should return expected values.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + unique_ghids = dataset.get_epic_ghids() + assert len(unique_ghids) == 4 + + ghid = unique_ghids[0] + assert ghid == "agilesix/simpler-grants-sandbox/issues/8" + + epic = dataset.get_epic(ghid) + assert epic["epic_title"] == "Deploy opportunity listing behind a feature flag" + + def test_issue_fetchers(self): + """Issue fetchers should return expected values.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + unique_ghids = dataset.get_issue_ghids() + assert len(unique_ghids) == 22 + + ghid = unique_ghids[0] + assert ghid == "agilesix/simpler-grants-sandbox/issues/46" + + issue = dataset.get_issue(ghid) + assert issue["issue_opened_at"] == "2024-09-27T15:29:37Z" + + def test_sprint_fetchers(self): + """Deliverable fetchers should return expected values.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + unique_ghids = dataset.get_sprint_ghids() + assert len(unique_ghids) == 5 + + ghid = unique_ghids[0] + assert ghid == "74402b12" + + sprint = dataset.get_sprint(ghid) + assert sprint["sprint_name"] == "Sprint 2" + + def test_quad_fetchers(self): + """Quad fetchers should return expected values.""" + dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1) + + unique_ghids = dataset.get_quad_ghids() + assert len(unique_ghids) == 1 + + ghid = unique_ghids[0] + assert ghid == "de5f962b" + 
+ quad = dataset.get_quad(ghid) + assert quad["quad_name"] == "BY1 Quad 1" diff --git a/analytics/tests/etldb_test_01.json b/analytics/tests/etldb_test_01.json new file mode 100644 index 000000000..2c3801b44 --- /dev/null +++ b/analytics/tests/etldb_test_01.json @@ -0,0 +1,574 @@ +[ + { + "issue_title": "exampel that doesn't ask type when created", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/46", + "issue_parent": null, + "issue_type": null, + "issue_is_closed": false, + "issue_opened_at": "2024-09-27T15:29:37Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "Implement opportunity listing UI", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/11", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:17Z", + "issue_closed_at": null, + "issue_points": 5, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Implement opportunity listing API", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:03Z", + "issue_closed_at": null, + "issue_points": 5, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "exampel creating from project interface", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/45", + "issue_parent": null, + "issue_type": null, + "issue_is_closed": false, + "issue_opened_at": "2024-09-26T23:23:31Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + 
"quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "Implement search API", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/5", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": true, + "issue_opened_at": "2024-09-18T15:41:49Z", + "issue_closed_at": "2024-09-18T19:40:40Z", + "issue_points": 3, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": "Enable feature flag for 1000 users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/15", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:41Z", + "issue_closed_at": null, + "issue_points": 3, + "issue_status": "Todo", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Load test for 10k active users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/14", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:20Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "8a6d26a4", + "sprint_name": "Sprint 4", + "sprint_start": "2024-10-21", + "sprint_length": 14, + "sprint_end": "2024-11-04", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Enable feature flag for first 100 users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/13", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:02Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "11878b69", + "sprint_name": 
"Sprint 5", + "sprint_start": "2024-11-04", + "sprint_length": 14, + "sprint_end": "2024-11-18", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Conduct first usability test for opportunity listing", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/12", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:32Z", + "issue_closed_at": null, + "issue_points": 3, + "issue_status": "Todo", + "sprint_id": "0a9ff409", + "sprint_name": "Sprint 3", + "sprint_start": "2024-10-07", + "sprint_length": 14, + "sprint_end": "2024-10-21", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Implement Search UI", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/6", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:41:58Z", + "issue_closed_at": null, + "issue_points": 8, + "issue_status": "Todo", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": "Host first usability test for search", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/7", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:42:24Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "8a6d26a4", + "sprint_name": "Sprint 4", + "sprint_start": "2024-10-21", + "sprint_length": 14, + "sprint_end": "2024-11-04", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": "[Bug] DD test 01", + "issue_url": 
"https://github.com/agilesix/simpler-grants-sandbox/issues/25", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:09:08Z", + "issue_closed_at": "2024-09-21T01:26:21Z", + "issue_points": 2, + "issue_status": "Done", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Bug] DD test 02", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/26", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:14:39Z", + "issue_closed_at": "2024-09-21T01:26:39Z", + "issue_points": 1, + "issue_status": "Done", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[BUG] DD test 03", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/27", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:22:52Z", + "issue_closed_at": "2024-09-21T01:26:45Z", + "issue_points": 5, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[BUG] DD test 04 with screenshot", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/28", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:24:39Z", + "issue_closed_at": "2024-09-21T01:26:52Z", + "issue_points": 2, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "Sub-issue 1", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/32", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-24T17:06:03Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + 
"deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Sub issue 2", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/33", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-24T17:06:18Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "[Bug] Sample bug created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/35", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T17:58:41Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Feature] Sample feature request created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/36", + "issue_parent": null, + "issue_type": "Enhancement", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T17:59:29Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[ADR] Sample decision created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/37", + "issue_parent": null, + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T18:00:34Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Task] Sample task created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/40", + "issue_parent": null, + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T18:02:47Z", + "issue_closed_at": 
+    "issue_points": null,
+    "issue_status": "Todo",
+    "sprint_id": null,
+    "sprint_name": null,
+    "sprint_start": null,
+    "sprint_length": null,
+    "sprint_end": null,
+    "quad_id": null,
+    "quad_name": null,
+    "quad_start": null,
+    "quad_length": null,
+    "quad_end": null,
+    "deliverable_pillar": null,
+    "deliverable_url": null,
+    "deliverable_title": null,
+    "epic_url": null,
+    "epic_title": null
+  },
+  {
+    "issue_title": "[Task] Bar 1",
+    "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/42",
+    "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/39",
+    "issue_type": "Task",
+    "issue_is_closed": false,
+    "issue_opened_at": "2024-09-25T19:14:49Z",
+    "issue_closed_at": null,
+    "issue_points": null,
+    "issue_status": "Todo",
+    "sprint_id": null,
+    "sprint_name": null,
+    "sprint_start": null,
+    "sprint_length": null,
+    "sprint_end": null,
+    "quad_id": null,
+    "quad_name": null,
+    "quad_start": null,
+    "quad_length": null,
+    "quad_end": null,
+    "deliverable_pillar": null,
+    "deliverable_url": null,
+    "deliverable_title": null,
+    "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/39",
+    "epic_title": "[Epic] Sample epic created with issue template"
+  }
+]
\ No newline at end of file
diff --git a/analytics/tests/test_cli.py b/analytics/tests/test_cli.py
index 8c230aaae..da7dbffc8 100644
--- a/analytics/tests/test_cli.py
+++ b/analytics/tests/test_cli.py
@@ -272,3 +272,70 @@ def test_stdout_message_includes_issues_if_unit_set_to_issues(
     # validation - check that slack message is printed and includes 'points'
     assert "Slack message" in result.stdout
     assert "issues" in result.stdout
+
+
+class TestEtlEntryPoint:
+    """Test the etl entry point."""
+
+    TEST_FILE_1 = "./tests/etldb_test_01.json"
+    EFFECTIVE_DATE = "2024-10-07"
+
+    def test_init_db(self):
+        """Test the db initialization command."""
+        # setup - create command
+        command = [
+            "etl",
+            "initialize_database",
+        ]
+        # execution
+        result = runner.invoke(app, command)
+        print(result.stdout)
+        # validation - check there wasn't an error
+        assert result.exit_code == 0
+        assert "initializing database" in result.stdout
+        assert "done" in result.stdout
+
+    def test_transform_and_load_with_valid_parameters(self):
+        """Test the transform and load command."""
+        # setup - create command
+        command = [
+            "etl",
+            "transform_and_load",
+            "--deliverable-file",
+            self.TEST_FILE_1,
+            "--effective-date",
+            str(self.EFFECTIVE_DATE),
+        ]
+        # execution
+        result = runner.invoke(app, command)
+        print(result.stdout)
+        # validation - check there wasn't an error
+        assert result.exit_code == 0
+        assert (
+            f"running transform and load with effective date {self.EFFECTIVE_DATE}"
+            in result.stdout
+        )
+        assert "quad row(s) processed: 1" in result.stdout
+        assert "deliverable row(s) processed: 2" in result.stdout
+        assert "sprint row(s) processed: 5" in result.stdout
+        assert "epic row(s) processed: 4" in result.stdout
+        assert "issue row(s) processed: 22" in result.stdout
+        assert "transform and load is done" in result.stdout
+
+    def test_transform_and_load_with_malformed_effective_date_parameter(self):
+        """Test the transform and load command with a malformed effective date."""
+        # setup - create command
+        command = [
+            "etl",
+            "transform_and_load",
+            "--deliverable-file",
+            self.TEST_FILE_1,
+            "--effective-date",
+            "2024-Oct-07",
+        ]
+        # execution
+        result = runner.invoke(app, command)
+        print(result.stdout)
+        # validation - check the fatal error message is printed and the command exits cleanly
+        assert result.exit_code == 0
+        assert "FATAL ERROR: malformed effective date" in result.stdout
diff --git a/documentation/analytics/usage.md b/documentation/analytics/usage.md
index c84a4f021..801b8cbb3 100644
--- a/documentation/analytics/usage.md
+++ b/documentation/analytics/usage.md
@@ -223,3 +223,40 @@ poetry run analytics calculate deliverable_percent_complete \
   --show-results \
   --unit points
 ```
+
+### Extract and Load
+
+Development is underway on new as-is/as-was reporting capabilities, the foundation of which is an extract-and-load workflow that writes to an ETL DB.
+
+Initialize the ETL DB:
+```bash
+poetry run analytics etl initialize_database
+```
+
+Transform and load a JSON file into the ETL DB:
+```bash
+poetry run analytics etl transform_and_load --deliverable-file ./data/test-etl-01.json --effective-date 2024-10-28
+```
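+
+Each record in the deliverable file is a flat JSON object that combines issue, sprint, quad, deliverable, and epic fields. As a rough sketch of the expected shape, here is one record abridged from the repo's test fixture (`tests/etldb_test_01.json`); fields that are unknown for a given issue are simply `null`:
+```json
+[
+  {
+    "issue_title": "Implement Search UI",
+    "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/6",
+    "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3",
+    "issue_type": "Task",
+    "issue_is_closed": false,
+    "issue_opened_at": "2024-09-18T15:41:58Z",
+    "issue_closed_at": null,
+    "issue_points": 8,
+    "issue_status": "Todo",
+    "sprint_id": "74402b12",
+    "sprint_name": "Sprint 2",
+    "quad_id": "de5f962b",
+    "quad_name": "BY1 Quad 1",
+    "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1",
+    "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3"
+  }
+]
+```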