From d69d56dec6ec30e7af7f55e1000227139419c1fd Mon Sep 17 00:00:00 2001
From: Billy Daly
Date: Mon, 4 Nov 2024 17:49:09 -0500
Subject: [PATCH 01/13] [Issue #2489] Updates `DeliverablePercentComplete` to
 use `GitHubIssues` dataset (#2710)

Updates the percent complete by deliverable metric to use the new
`GitHubIssues` dataset:

- Adds `deliverable_status` and `issue_state` attributes to the
  `GitHubIssues` dataset, which are needed to calculate percent complete
  by deliverable
- Replaces `DeliverableTasks` with `GitHubIssues` as the dataset used to
  calculate the `DeliverablePercentComplete` metric
- Updates the entry point to calculate deliverable percent complete
---
 analytics/Makefile                            |  4 +-
 analytics/src/analytics/cli.py                | 17 +-----
 analytics/src/analytics/datasets/issues.py    | 19 +++-
 analytics/src/analytics/etl/github.py         |  3 +-
 .../github/getRoadmapData.graphql             |  3 +
 .../src/analytics/integrations/github/main.py |  2 +
 .../src/analytics/metrics/percent_complete.py | 13 ++--
 .../tests/metrics/test_percent_complete.py    | 60 ++++++++++---------
 analytics/tests/test_cli.py                   | 16 ++---
 9 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/analytics/Makefile b/analytics/Makefile
index bbca614892..2cfb02512c 100644
--- a/analytics/Makefile
+++ b/analytics/Makefile
@@ -210,9 +210,7 @@ percent-complete:
 	@echo "=> Running percent complete deliverable"
 	@echo "====================================================="
 	$(POETRY) analytics calculate deliverable_percent_complete \
-	--sprint-file $(SPRINT_FILE) \
-	--roadmap-file $(ROADMAP_FILE) \
-	--issue-file $(ISSUE_FILE) \
+	--issue-file $(DELIVERY_FILE) \
 	--output-dir $(OUTPUT_DIR) \
 	--include-status "In Progress" \
 	--include-status "Planning" \
diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py
index b70950b268..8ec11b82ee 100644
--- a/analytics/src/analytics/cli.py
+++ b/analytics/src/analytics/cli.py
@@ -9,7 +9,6 @@
 from slack_sdk import WebClient
 from sqlalchemy import text
 
-from analytics.datasets.deliverable_tasks import DeliverableTasks
 from analytics.datasets.issues import GitHubIssues
 from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL
 from analytics.etl.utils import load_config
@@ -165,7 +164,6 @@ def calculate_sprint_burnup(
 
 @metrics_app.command(name="deliverable_percent_complete")
 def calculate_deliverable_percent_complete(
-    sprint_file: Annotated[str, SPRINT_FILE_ARG],
     issue_file: Annotated[str, ISSUE_FILE_ARG],
     # Typer uses the Unit enum to validate user inputs from the CLI
     # but the default arg must be a string or the CLI will throw an error
@@ -174,23 +172,10 @@ def calculate_deliverable_percent_complete(
     show_results: Annotated[bool, SHOW_RESULTS_ARG] = False,
     post_results: Annotated[bool, POST_RESULTS_ARG] = False,
     output_dir: Annotated[str, OUTPUT_DIR_ARG] = "data",
-    roadmap_file: Annotated[Optional[str], ROADMAP_FILE_ARG] = None,  # noqa: UP007
     include_status: Annotated[Optional[list[str]], STATUS_ARG] = None,  # noqa: UP007
 ) -> None:
     """Calculate percentage completion by deliverable."""
-    if roadmap_file:
-        # load the input data using the new join path with roadmap data
-        task_data = DeliverableTasks.load_from_json_files_with_roadmap_data(
-            sprint_file=sprint_file,
-            issue_file=issue_file,
-            roadmap_file=roadmap_file,
-        )
-    else:
-        # load the input data using the original join path without roadmap data
-        task_data = DeliverableTasks.load_from_json_files(
-            sprint_file=sprint_file,
-            issue_file=issue_file,
-        )
+    task_data = GitHubIssues.from_json(issue_file)
 
     # calculate percent complete
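    # NOTE: a minimal sketch of the new load-and-calculate path, assuming the
    # default DELIVERY_FILE location written by `make delivery-data-export`:
    #
    #   task_data = GitHubIssues.from_json("data/delivery-data.json")
    #   metric = DeliverablePercentComplete(task_data, unit=Unit.points)
    #   metric.results  # one row per deliverable with completion totals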
metric = DeliverablePercentComplete( dataset=task_data, diff --git a/analytics/src/analytics/datasets/issues.py b/analytics/src/analytics/datasets/issues.py index 704fa4db5a..1b206941e9 100644 --- a/analytics/src/analytics/datasets/issues.py +++ b/analytics/src/analytics/datasets/issues.py @@ -4,7 +4,7 @@ from enum import Enum import pandas as pd -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, computed_field from analytics.datasets.base import BaseDataset @@ -26,6 +26,13 @@ class IssueType(Enum): NONE = None +class IssueState(Enum): + """Whether the issue is open or closed.""" + + OPEN = "open" + CLOSED = "closed" + + class IssueMetadata(BaseModel): """Stores information about issue type and parent (if applicable).""" @@ -58,9 +65,19 @@ class IssueMetadata(BaseModel): # Parent metadata -- attributes about parent issues populated via lookup deliverable_url: str | None = Field(default=None) deliverable_title: str | None = Field(default=None) + deliverable_status: str | None = Field(default=None) epic_url: str | None = Field(default=None) epic_title: str | None = Field(default=None) + # See https://docs.pydantic.dev/2.0/usage/computed_fields/ + @computed_field # type: ignore[misc] + @property + def issue_state(self) -> str: + """Whether the issue is open or closed.""" + if self.issue_is_closed: + return IssueState.CLOSED.value + return IssueState.OPEN.value + # =============================================================== # Dataset class diff --git a/analytics/src/analytics/etl/github.py b/analytics/src/analytics/etl/github.py index 0cd48a845f..3d5494c503 100644 --- a/analytics/src/analytics/etl/github.py +++ b/analytics/src/analytics/etl/github.py @@ -288,6 +288,7 @@ def flatten_issue_data(lookup: dict[str, IssueMetadata]) -> list[dict]: issue.deliverable_title = deliverable.issue_title issue.deliverable_url = deliverable.issue_url issue.deliverable_pillar = deliverable.deliverable_pillar + issue.deliverable_status = deliverable.issue_status # Set quad metadata issue.quad_id = deliverable.quad_id issue.quad_name = deliverable.quad_name @@ -306,7 +307,7 @@ def flatten_issue_data(lookup: dict[str, IssueMetadata]) -> list[dict]: issue.epic_url = epic.issue_url # Add the issue to the results - result.append(issue.__dict__) + result.append(issue.model_dump()) # Return the results return result diff --git a/analytics/src/analytics/integrations/github/getRoadmapData.graphql b/analytics/src/analytics/integrations/github/getRoadmapData.graphql index 866753215f..5b4bd1aaf0 100644 --- a/analytics/src/analytics/integrations/github/getRoadmapData.graphql +++ b/analytics/src/analytics/integrations/github/getRoadmapData.graphql @@ -27,6 +27,9 @@ query ( pillar: fieldValueByName(name: $pillarField) { ...singleSelectContent } + status: fieldValueByName(name: "Status") { + ...singleSelectContent + } } } } diff --git a/analytics/src/analytics/integrations/github/main.py b/analytics/src/analytics/integrations/github/main.py index 1cf702c7a4..7a5e4aa531 100644 --- a/analytics/src/analytics/integrations/github/main.py +++ b/analytics/src/analytics/integrations/github/main.py @@ -73,6 +73,7 @@ def export_sprint_data( issue_url: .content.url, issue_parent: .content.parent.url, issue_type: .content.issueType.name, + issue_status: .status.name, issue_is_closed: .content.closed, issue_opened_at: .content.createdAt, issue_closed_at: .content.closedAt, @@ -146,6 +147,7 @@ def export_roadmap_data( issue_url: .content.url, issue_parent: .content.parent.url, issue_type: 
.content.issueType.name, + issue_status: .status.name, issue_is_closed: .content.closed, issue_opened_at: .content.createdAt, issue_closed_at: .content.closedAt, diff --git a/analytics/src/analytics/metrics/percent_complete.py b/analytics/src/analytics/metrics/percent_complete.py index 6064532c16..113e36f7d0 100644 --- a/analytics/src/analytics/metrics/percent_complete.py +++ b/analytics/src/analytics/metrics/percent_complete.py @@ -6,25 +6,26 @@ import plotly.express as px from plotly.graph_objects import Figure -from analytics.datasets.deliverable_tasks import DeliverableTasks +from analytics.datasets.issues import GitHubIssues from analytics.metrics.base import BaseMetric, Statistic, Unit -class DeliverablePercentComplete(BaseMetric[DeliverableTasks]): +class DeliverablePercentComplete(BaseMetric[GitHubIssues]): """Calculate the percentage of issues or points completed per deliverable.""" def __init__( self, - dataset: DeliverableTasks, + dataset: GitHubIssues, unit: Unit, statuses_to_include: list[str] | None = None, ) -> None: """Initialize the DeliverablePercentComplete metric.""" self.dataset = dataset self.deliverable_col = "deliverable_title" - self.status_col = "status" + self.status_col = "issue_state" self.deliverable_status_col = "deliverable_status" self.unit = unit + self.unit_col = dataset.points_col if unit == Unit.points else unit.value self.statuses_to_include = statuses_to_include self.deliverable_data = self._isolate_deliverables_by_status() super().__init__(dataset) @@ -80,7 +81,7 @@ def get_stats(self) -> dict[str, Statistic]: """Calculate stats for this metric.""" df_src = self.deliverable_data # get the total number of issues and the number of issues with points per deliverable - is_pointed = df_src[Unit.points.value] >= 1 + is_pointed = df_src[self.dataset.points_col] >= 1 issues_total = df_src.value_counts(self.deliverable_col).to_frame() issues_pointed = ( df_src[is_pointed].value_counts(self.deliverable_col).to_frame() @@ -127,7 +128,7 @@ def _get_count_by_deliverable( """Get the count of issues (or points) by deliverable and status.""" # create local copies of the dataset and key column names df = self.deliverable_data.copy() - unit_col = self.unit.value + unit_col = self.unit_col key_cols = [self.deliverable_col, unit_col] # create a dummy column to sum per row if the unit is issues if self.unit == Unit.issues: diff --git a/analytics/tests/metrics/test_percent_complete.py b/analytics/tests/metrics/test_percent_complete.py index ea5cd0bc76..5a819706e3 100644 --- a/analytics/tests/metrics/test_percent_complete.py +++ b/analytics/tests/metrics/test_percent_complete.py @@ -4,9 +4,9 @@ import pytest -from analytics.datasets.deliverable_tasks import DeliverableTasks +from analytics.datasets.issues import GitHubIssues, IssueMetadata, IssueType from analytics.metrics.percent_complete import DeliverablePercentComplete, Unit -from tests.conftest import MockSlackbot +from tests.conftest import MockSlackbot, DAY_0, DAY_1 def task_row( @@ -17,15 +17,21 @@ def task_row( status: str | None = "open", ) -> dict: """Create a sample row of the DeliverableTasks dataset.""" - return { - "deliverable_number": deliverable, - "deliverable_title": f"Deliverable {deliverable}", - "deliverable_status": deliverable_status, - "issue_number": task, - "issue_title": f"Task {task}" if task else None, - "points": points, - "status": status, - } + issue = IssueMetadata( + project_owner="HHS", + project_number=1, + issue_title=f"Task {task}", + issue_url=f"task{task}", + 
issue_type=IssueType.TASK.value, + issue_parent=None, + issue_points=points, + issue_is_closed=status == "closed", + issue_opened_at=DAY_0, + issue_closed_at=DAY_1 if status == "closed" else None, + deliverable_title=f"Deliverable {deliverable}", + deliverable_status=deliverable_status, + ) + return issue.model_dump() @pytest.fixture(name="percent_complete", scope="module") @@ -37,7 +43,7 @@ def sample_percent_complete() -> DeliverablePercentComplete: task_row(deliverable=1, task=2, status="closed"), task_row(deliverable=2, task=3, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # return sprint burndown by points return DeliverablePercentComplete(test_data, unit=Unit.points) @@ -53,7 +59,7 @@ def test_percent_complete_based_on_task_count(self): task_row(deliverable=1, task=2, status="closed"), task_row(deliverable=2, task=3, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution df = DeliverablePercentComplete(test_data, unit=Unit.issues).results df = df.set_index("deliverable_title") @@ -80,7 +86,7 @@ def test_percent_complete_based_on_points(self): task_row(deliverable=1, task=2, points=3, status="closed"), task_row(deliverable=2, task=3, points=5, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution df = DeliverablePercentComplete(test_data, unit=Unit.points).results df = df.set_index("deliverable_title") @@ -106,7 +112,7 @@ def test_show_0_pct_for_deliverables_without_tasks(self): task_row(deliverable=1, task=2, status="closed"), task_row(deliverable=2, task=None, status=None), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution - use tasks as the unit df = DeliverablePercentComplete(test_data, unit=Unit.issues).results df = df.set_index("deliverable_title") @@ -132,7 +138,7 @@ def test_show_0_pct_for_deliverables_without_points(self): task_row(deliverable=1, task=2, points=2, status="closed"), task_row(deliverable=2, task=None, points=None, status=None), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution - use points as the unit df = DeliverablePercentComplete(test_data, unit=Unit.points).results df = df.set_index("deliverable_title") @@ -164,7 +170,7 @@ class TestFilteringReportByDeliverableStatus: def test_filter_out_deliverables_with_excluded_status(self): """The results should exclude deliverables with a status that wasn't passed.""" # setup - create test dataset - test_data = DeliverableTasks.from_dict(self.TEST_ROWS) + test_data = GitHubIssues.from_dict(self.TEST_ROWS) # execution df = DeliverablePercentComplete( test_data, @@ -180,7 +186,7 @@ def test_filter_out_deliverables_with_excluded_status(self): def test_invert_statuses_selected(self): """We should filter out the other deliverable if invert statuses selected.""" # setup - create test dataset - test_data = DeliverableTasks.from_dict(self.TEST_ROWS) + test_data = GitHubIssues.from_dict(self.TEST_ROWS) # execution df = DeliverablePercentComplete( test_data, @@ -196,7 +202,7 @@ def test_invert_statuses_selected(self): def test_list_selected_statuses_in_slack_message(self): """If we filter on status, those statuses should be listed in the slack message.""" # setup - create test dataset - test_data = DeliverableTasks.from_dict(self.TEST_ROWS) + test_data = 
GitHubIssues.from_dict(self.TEST_ROWS) # execution metric = DeliverablePercentComplete( test_data, @@ -211,7 +217,7 @@ def test_list_selected_statuses_in_slack_message(self): def test_stats_also_filter_out_deliverables_with_excluded_status(self): """Filtered deliverables should also be excluded from get_stats().""" # setup - create test dataset - test_data = DeliverableTasks.from_dict(self.TEST_ROWS) + test_data = GitHubIssues.from_dict(self.TEST_ROWS) # execution metric = DeliverablePercentComplete( test_data, @@ -236,7 +242,7 @@ def test_all_issues_are_pointed(self): task_row(deliverable=2, task=3, points=3, status="open"), task_row(deliverable=2, task=3, points=1, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) # validation @@ -256,7 +262,7 @@ def test_some_issues_are_not_pointed(self): task_row(deliverable=2, task=3, points=3, status="open"), task_row(deliverable=2, task=3, points=None, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) # validation @@ -275,7 +281,7 @@ def test_deliverables_without_tasks_have_0_pct_pointed(self): task_row(deliverable=1, task=2, points=1, status="closed"), task_row(deliverable=2, task=None, points=None, status=None), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) # validation @@ -295,7 +301,7 @@ def test_slack_message_contains_right_number_of_lines(self): task_row(deliverable=2, task=2, points=1, status="closed"), task_row(deliverable=3, task=3, points=3, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) lines = output.format_slack_message().splitlines() @@ -309,7 +315,7 @@ def test_title_includes_issues_when_unit_is_issue(self): task_row(deliverable=1, task=1, points=2, status="open"), task_row(deliverable=2, task=2, points=1, status=None), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) title = output.format_slack_message().splitlines()[0] @@ -323,7 +329,7 @@ def test_title_includes_points_when_unit_is_points(self): task_row(deliverable=1, task=1, points=2, status="open"), task_row(deliverable=2, task=2, points=1, status=None), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.points) title = output.format_slack_message().splitlines()[0] @@ -343,7 +349,7 @@ def test_plot_results_output_stored_in_chart_property(self): task_row(deliverable=2, task=3, points=3, status="open"), task_row(deliverable=2, task=3, points=None, status="open"), ] - test_data = DeliverableTasks.from_dict(test_rows) + test_data = GitHubIssues.from_dict(test_rows) # execution output = DeliverablePercentComplete(test_data, unit=Unit.issues) # validation - check that the chart attribute matches output of plot_results() diff --git a/analytics/tests/test_cli.py b/analytics/tests/test_cli.py index 0c323eb934..8c230aaaec 100644 --- a/analytics/tests/test_cli.py +++ 
b/analytics/tests/test_cli.py
@@ -40,8 +40,8 @@ def test_file_fixtures(tmp_path: Path) -> MockFiles:
         json_issue_row(issue=2, labels=["deliverable: 30k ft"]),
     ]
     delivery_data = [
-        issue(issue=1).__dict__,
-        issue(issue=2).__dict__,
+        issue(issue=1).model_dump(),
+        issue(issue=2).model_dump(),
     ]
     # write test data to json files
     write_test_data_to_file(issue_data, issue_file)
@@ -217,10 +217,8 @@ def test_calculate_deliverable_percent_complete(self, mock_files: MockFiles):
         command = [
             "calculate",
             "deliverable_percent_complete",
-            "--sprint-file",
-            str(mock_files.sprint_file),
             "--issue-file",
-            str(mock_files.issue_file),
+            str(mock_files.delivery_file),
         ]
         # execution
         result = runner.invoke(app, command)
@@ -238,10 +236,8 @@ def test_stdout_message_includes_points_if_no_unit_is_set(
         command = [
             "calculate",
             "deliverable_percent_complete",
-            "--sprint-file",
-            str(mock_files.sprint_file),
             "--issue-file",
-            str(mock_files.issue_file),
+            str(mock_files.delivery_file),
             "--show-results",
         ]
         # execution
@@ -262,10 +258,8 @@ def test_stdout_message_includes_issues_if_unit_set_to_issues(
         command = [
             "calculate",
             "deliverable_percent_complete",
-            "--sprint-file",
-            str(mock_files.sprint_file),
             "--issue-file",
-            str(mock_files.issue_file),
+            str(mock_files.delivery_file),
             "--unit",
             "issues",
             "--show-results",

From 79996310c092008ee74aa3c482581c1f6d1a6bb6 Mon Sep 17 00:00:00 2001
From: Billy Daly
Date: Tue, 5 Nov 2024 08:04:34 -0500
Subject: [PATCH 02/13] [Issue #2470] Removes deprecated analytics code (#2718)

- Removes `SprintBoard` and `DeliverableTasks` datasets
- Removes CLI entry points for the old method of exporting issue and
  project data:
  - `poetry run analytics export issue_data`
  - `poetry run analytics export project_data`
- Removes code in `analytics/integrations/github/main.py` for these old
  export patterns
- Updates `documentation/analytics/usage.md` and
  `documentation/analytics/development.md` to remove references to the
  old patterns
- Cleans up targets in `Makefile`
---
 analytics/Makefile                            |  43 +--
 analytics/src/analytics/cli.py                |  30 +-
 .../analytics/datasets/deliverable_tasks.py   | 228 ------------
 .../src/analytics/datasets/sprint_board.py    | 146 --------
 analytics/src/analytics/datasets/utils.py     |  52 ---
 .../analytics/integrations/github/__init__.py |   4 -
 .../src/analytics/integrations/github/main.py |  24 --
 .../tests/datasets/test_deliverable_tasks.py  | 330 ------------
 analytics/tests/datasets/test_sprint_board.py | 162 ---------
 analytics/tests/etl/test_github.py            | 279 +++++++++------
 documentation/analytics/development.md        |   8 +-
 documentation/analytics/usage.md              |  40 ++-
 12 files changed, 208 insertions(+), 1138 deletions(-)
 delete mode 100644 analytics/src/analytics/datasets/deliverable_tasks.py
 delete mode 100644 analytics/src/analytics/datasets/sprint_board.py
 delete mode 100644 analytics/tests/datasets/test_deliverable_tasks.py
 delete mode 100644 analytics/tests/datasets/test_sprint_board.py

diff --git a/analytics/Makefile b/analytics/Makefile
index 2cfb02512c..036fdfc6b9 100644
--- a/analytics/Makefile
+++ b/analytics/Makefile
@@ -9,10 +9,7 @@ ROADMAP_PROJECT ?= 12
 OUTPUT_DIR ?= data
 CONFIG_DIR ?= config
 PROJECT_CONFIG_FILE ?= $(CONFIG_DIR)/github-projects.json
-SPRINT_FILE ?= $(OUTPUT_DIR)/sprint-data.json
-ROADMAP_FILE ?= $(OUTPUT_DIR)/roadmap-data.json
-ISSUE_FILE ?= $(OUTPUT_DIR)/issue-data.json
-DELIVERY_FILE ?= $(OUTPUT_DIR)/delivery-data.json
+ISSUE_FILE ?= $(OUTPUT_DIR)/delivery-data.json
 SPRINT ?= @current

 # Names of the points and sprint fields in the GitHub project
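# (`?=` assigns a default only when the variable is not already set, so these
# and the file paths above can be overridden per invocation; a hypothetical
# example: `make percent-complete ISSUE_FILE=data/my-export.json`)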
POINTS_FIELD ?= Points @@ -146,50 +143,24 @@ lint: ## runs code quality checks # Data Commands # ################# -sprint-data-export: - @echo "=> Exporting project data from the sprint board" - @echo "=====================================================" - $(POETRY) analytics export gh_project_data \ - --owner $(ORG) \ - --project $(SPRINT_PROJECT) \ - --output-file $(SPRINT_FILE) - gh-db-data-import: @echo "=> Importing sprint data to the database" @echo "=====================================================" - $(POETRY) analytics import db_import --delivery-file $(DELIVERY_FILE) - -roadmap-data-export: - @echo "=> Exporting project data from the product roadmap" - @echo "=====================================================" - $(POETRY) analytics export gh_project_data \ - --owner $(ORG) \ - --project $(ROADMAP_PROJECT) \ - --output-file $(ROADMAP_FILE) + $(POETRY) analytics import db_import --delivery-file $(ISSUE_FILE) -delivery-data-export: +gh-data-export: @echo "=> Exporting GitHub issue and sprint data for delivery metrics" @echo "=====================================================" $(POETRY) analytics export gh_delivery_data \ --config-file $(PROJECT_CONFIG_FILE) \ - --output-file $(DELIVERY_FILE) \ + --output-file $(ISSUE_FILE) \ --temp-dir $(OUTPUT_DIR) -issue-data-export: - @echo "=> Exporting issue data from the repository" - @echo "=====================================================" - $(POETRY) analytics export gh_issue_data \ - --owner $(ORG) \ - --repo $(REPO) \ - --output-file $(ISSUE_FILE) - -gh-data-export: sprint-data-export issue-data-export roadmap-data-export delivery-data-export - sprint-burndown: @echo "=> Running sprint burndown report for HHS/13" @echo "=====================================================" $(POETRY) analytics calculate sprint_burndown \ - --issue-file $(DELIVERY_FILE) \ + --issue-file $(ISSUE_FILE) \ --output-dir $(OUTPUT_DIR) \ --sprint "$(SPRINT)" \ --project 13 \ @@ -199,7 +170,7 @@ sprint-burndown: @echo "=> Running sprint burndown report for HHS/17" @echo "=====================================================" $(POETRY) analytics calculate sprint_burndown \ - --issue-file $(DELIVERY_FILE) \ + --issue-file $(ISSUE_FILE) \ --output-dir $(OUTPUT_DIR) \ --sprint "$(SPRINT)" \ --project 17 \ @@ -210,7 +181,7 @@ percent-complete: @echo "=> Running percent complete deliverable" @echo "=====================================================" $(POETRY) analytics calculate deliverable_percent_complete \ - --issue-file $(DELIVERY_FILE) \ + --issue-file $(ISSUE_FILE) \ --output-dir $(OUTPUT_DIR) \ --include-status "In Progress" \ --include-status "Planning" \ diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 8ec11b82ee..37bea4334f 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -12,7 +12,7 @@ from analytics.datasets.issues import GitHubIssues from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL from analytics.etl.utils import load_config -from analytics.integrations import db, github, slack +from analytics.integrations import db, slack from analytics.metrics.base import BaseMetric, Unit from analytics.metrics.burndown import SprintBurndown from analytics.metrics.burnup import SprintBurnup @@ -24,18 +24,14 @@ # fmt: off # Instantiate typer options with help text for the commands below CONFIG_FILE_ARG = typer.Option(help="Path to JSON file with configurations for this entrypoint") -SPRINT_FILE_ARG = typer.Option(help="Path to file with exported sprint data") 
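# NOTE: with the sprint- and roadmap-file options removed, every metric
# entrypoint now reads the single delivery-data export. A sketch of the new
# invocation, using the flags exercised in tests/test_cli.py:
#
#   poetry run analytics calculate deliverable_percent_complete \
#     --issue-file data/delivery-data.json --unit issues --show-results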
ISSUE_FILE_ARG = typer.Option(help="Path to file with exported issue data") -ROADMAP_FILE_ARG = typer.Option(help="Path to file with exported roadmap data") OUTPUT_FILE_ARG = typer.Option(help="Path to file where exported data will be saved") OUTPUT_DIR_ARG = typer.Option(help="Path to directory where output files will be saved") TMP_DIR_ARG = typer.Option(help="Path to directory where intermediate files will be saved") -OWNER_ARG = typer.Option(help="GitHub handle of the repo or project owner") -REPO_ARG = typer.Option(help="Name of the GitHub repo") -PROJECT_ARG = typer.Option(help="Number of the GitHub project") -FIELD_ARG = typer.Option(help="Name of the GitHub project field") SPRINT_ARG = typer.Option(help="Name of the sprint for which we're calculating burndown") UNIT_ARG = typer.Option(help="Whether to calculate completion by 'points' or 'tickets'") +OWNER_ARG = typer.Option(help="Name of the GitHub project owner, e.g. HHS") +PROJECT_ARG = typer.Option(help="Number of the GitHub project, e.g. 13") SHOW_RESULTS_ARG = typer.Option(help="Display a chart of the results in a browser") POST_RESULTS_ARG = typer.Option(help="Post the results to slack") STATUS_ARG = typer.Option( @@ -65,26 +61,6 @@ def callback() -> None: # =========================================================== -@export_app.command(name="gh_project_data") -def export_github_project_data( - owner: Annotated[str, OWNER_ARG], - project: Annotated[int, PROJECT_ARG], - output_file: Annotated[str, OUTPUT_FILE_ARG], -) -> None: - """Export data about items in a GitHub project and write it to an output file.""" - github.export_project_data(owner, project, output_file) - - -@export_app.command(name="gh_issue_data") -def export_github_issue_data( - owner: Annotated[str, OWNER_ARG], - repo: Annotated[str, REPO_ARG], - output_file: Annotated[str, OUTPUT_FILE_ARG], -) -> None: - """Export data about issues a GitHub repo and write it to an output file.""" - github.export_issue_data(owner, repo, output_file) - - @export_app.command(name="gh_delivery_data") def export_github_data( config_file: Annotated[str, CONFIG_FILE_ARG], diff --git a/analytics/src/analytics/datasets/deliverable_tasks.py b/analytics/src/analytics/datasets/deliverable_tasks.py deleted file mode 100644 index 41c126e862..0000000000 --- a/analytics/src/analytics/datasets/deliverable_tasks.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Implements the DeliverableTasks dataset. 
- -This is a sub-class of BaseDataset that groups 30k ft deliverables with the -tasks needed to complete those deliverable -""" - -from typing import Self - -import pandas as pd - -from analytics.datasets.base import BaseDataset -from analytics.datasets.utils import load_json_data_as_df - -LABEL_30K = "deliverable: 30k ft" # label for 30,000 ft deliverable in our roadmap -LABEL_10K = "deliverable: 30k ft" # label for 10,000 ft deliverable in our roadmap - - -def pluck_label_name(labels: list | None) -> list[str]: - """Reformat the label dictionary to return a list of label names.""" - if labels and isinstance(labels, list): - return [label["name"] for label in labels] - return [] - - -class DeliverableTasks(BaseDataset): - """Stores 30k ft deliverables and the tasks needed to complete them.""" - - ISSUE_DATE_COLS = ["created_date", "closed_date"] - ISSUE_COLUMN_MAP = { - "number": "issue_number", - "title": "issue_title", - "labels": "labels", - "createdAt": "created_date", - "closedAt": "closed_date", - } - SPRINT_DATE_COLS = ["milestone_due_date"] - SPRINT_COLUMN_MAP = { - "content.number": "issue_number", - "content.type": "type", - "content.body": "issue_body", - "assignees": "assignees", - "content.url": "url", - "story Points": "points", - "deliverable": "deliverable", - "milestone.title": "milestone", - "milestone.dueOn": "milestone_due_date", - "milestone.description": "milestone_description", - } - ROADMAP_COLUMN_MAP = { - "content.number": "deliverable_number", - "content.title": "deliverable_title", - "labels": "deliverable_labels", - "status": "deliverable_status", - "deliverable": "deliverable", - } - FINAL_COLUMNS = [ - "deliverable_number", - "deliverable_title", - "deliverable_status", - "issue_title", - "issue_number", - "points", - "status", - ] - - @classmethod - def load_from_json_files( - cls, - deliverable_label: str = LABEL_30K, - sprint_file: str = "data/sprint-data.json", - issue_file: str = "data/issue-data.json", - ) -> Self: - """ - Load the input datasets and instantiate the DeliverableTasks class. - - Parameters - ---------- - deliverable_label: str - The GitHub label used to flag deliverable tickets - sprint_file: str - Path to the local copy of sprint data exported from GitHub - issue_file: str - Path to the local copy of issue data exported from GitHub - - Returns - ------- - Self: - An instance of the DeliverableTasks dataset class - """ - # load and merge input datasets - df_sprints = load_json_data_as_df( - file_path=sprint_file, - column_map=cls.SPRINT_COLUMN_MAP, - date_cols=cls.SPRINT_DATE_COLS, - key_for_nested_items="items", - ) - df_issues = load_json_data_as_df( - file_path=issue_file, - column_map=cls.ISSUE_COLUMN_MAP, - date_cols=cls.ISSUE_DATE_COLS, - ) - # join the issues and sprint data and apply transformations - df = df_issues.merge(df_sprints, on="issue_number", how="left") - df = cls._apply_transformations(df, deliverable_label) - return cls(df) - - @classmethod - def _apply_transformations( - cls, - df_all: pd.DataFrame, - deliverable_label: str, - ) -> pd.DataFrame: - """ - Apply column specific data transformations. 
-
-        Parameters
-        ----------
-        df_all: pd.DataFrame
-            A dataframe of all issues and their fields from the sprint board
-        deliverable_label: str
-            The GitHub label used to flag deliverable tickets
-        """
-        # extract parent issue number from the milestone description
-        deliverable_regex = r"(?: deliverable: \#)(?P<deliverable_number>\d+)"
-        df_all["deliverable_number"] = (
-            df_all["milestone_description"]
-            .str.extract(pat=deliverable_regex, expand=False)
-            .astype("Int64")
-        )
-        # calculate task status
-        df_all["status"] = "open"
-        # tasks are closed if they DO have a closed_date
-        is_closed = ~df_all["closed_date"].isna()  # ~ is negation
-        df_all.loc[is_closed, "status"] = "closed"
-        # isolate 30k deliverable issues and rename their cols
-        df_all["labels"] = df_all["labels"].apply(pluck_label_name)
-        deliverable_mask = df_all["labels"].apply(lambda x: deliverable_label in x)
-        deliverable_cols = {
-            "issue_number": "deliverable_number",
-            "issue_title": "deliverable_title",
-        }
-        df_deliverable = df_all.loc[deliverable_mask, list(deliverable_cols.keys())]
-        df_deliverable = df_deliverable.rename(columns=deliverable_cols)
-        # left join to df on "deliverable_number" to get the deliverable title
-        df = df_deliverable.merge(df_all, on="deliverable_number", how="left")
-        # add placeholder col to support filtering on deliverable status
-        df["deliverable_status"] = None
-        return df[cls.FINAL_COLUMNS]
-
-    @classmethod
-    def load_from_json_files_with_roadmap_data(
-        cls,
-        deliverable_label: str = LABEL_30K,
-        sprint_file: str = "data/sprint-data.json",
-        issue_file: str = "data/issue-data.json",
-        roadmap_file: str = "data/roadmap-data.json",
-    ) -> Self:
-        """
-        Load the data sources and instantiate the DeliverableTasks class.
-
-        Parameters
-        ----------
-        deliverable_label: str
-            The GitHub label used to flag deliverable tickets
-        sprint_file: str
-            Path to the local copy of sprint data exported from GitHub
-        issue_file: str
-            Path to the local copy of issue data exported from GitHub
-        roadmap_file: str
-            Path to the local copy of the roadmap data exported from GitHub
-
-        Returns
-        -------
-        Self:
-            An instance of the DeliverableTasks dataset class
-        """
-        # load input datasets
-        df_sprints = load_json_data_as_df(
-            file_path=sprint_file,
-            column_map=cls.SPRINT_COLUMN_MAP,
-            date_cols=cls.SPRINT_DATE_COLS,
-            key_for_nested_items="items",
-        )
-        df_issues = load_json_data_as_df(
-            file_path=issue_file,
-            column_map=cls.ISSUE_COLUMN_MAP,
-            date_cols=cls.ISSUE_DATE_COLS,
-        )
-        df_roadmap = load_json_data_as_df(
-            file_path=roadmap_file,
-            column_map=cls.ROADMAP_COLUMN_MAP,
-            key_for_nested_items="items",
-        )
-        # filter for 30k ft deliverables
-        df_roadmap = cls._isolate_deliverables(df_roadmap, deliverable_label)
-        # join the issues and sprint data and apply transformations
-        df = df_issues.merge(df_sprints, on="issue_number", how="left")
-        df = df_roadmap.merge(df, on="deliverable", how="left")
-        df = cls._calculate_issue_status(df)
-        # return the final list of columns in the correct order
-        return cls(df[cls.FINAL_COLUMNS])
-
-    @classmethod
-    def _calculate_issue_status(cls, df: pd.DataFrame) -> pd.DataFrame:
-        """Apply column specific data transformations with roadmap data."""
-        # calculate task status
-        df["status"] = "open"
-        # tasks are closed if they DO have a closed_date
-        is_closed = ~df["closed_date"].isna()  # ~ is negation
-        df.loc[is_closed, "status"] = "closed"
-        return df
-
-    @classmethod
-    def _isolate_deliverables(
-        cls,
-        df: pd.DataFrame,
-        deliverable_label: str,
-    ) -> pd.DataFrame:
-        """Apply
column specific data transformations with roadmap data.""" - # remove deliverables without labels or a deliverable column set - df = df[df["deliverable_labels"].notna()] - df = df[df["deliverable"].notna()] - # isolate deliverables with the correct label - is_deliverable = df["deliverable_labels"].apply( - lambda labels: deliverable_label in labels, - ) - return df[is_deliverable] diff --git a/analytics/src/analytics/datasets/sprint_board.py b/analytics/src/analytics/datasets/sprint_board.py deleted file mode 100644 index 94a4522d85..0000000000 --- a/analytics/src/analytics/datasets/sprint_board.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Implements the SprintBoard dataset. - -This is a sub-class of BaseDataset that stores the tickets and metadata -set for each ticket in the Sprint Planning Board -""" - -from __future__ import annotations - -from typing import Self - -import pandas as pd - -from analytics.datasets.base import BaseDataset -from analytics.datasets.utils import load_json_data_as_df - - -class SprintBoard(BaseDataset): - """Stores the GitHub project data for the Sprint Planning Board.""" - - ISSUE_DATE_COLS = ["created_date", "closed_date"] - ISSUE_COLUMN_MAP = { - "number": "issue_number", - "createdAt": "created_date", - "closedAt": "closed_date", - } - SPRINT_DATE_COLS = ["sprint_start_date", "milestone_due_date"] - SPRINT_COLUMN_MAP = { - "content.number": "issue_number", - "title": "issue_title", - "content.type": "type", - "content.body": "issue_body", - "status": "status", - "assignees": "assignees", - "labels": "labels", - "content.url": "url", - "story Points": "points", - "milestone.title": "milestone", - "milestone.dueOn": "milestone_due_date", - "milestone.description": "milestone_description", - "sprint.title": "sprint", - "sprint.startDate": "sprint_start_date", - "sprint.duration": "sprint_duration", - } - - def __init__(self, df: pd.DataFrame) -> None: - """Intializes the sprint board dataset.""" - # set named columns - self.opened_col = "created_date" - self.closed_col = "closed_date" - self.sprint_col = "sprint" - self.sprint_start_col = "sprint_start_date" - self.sprint_end_col = "sprint_end_date" - # initialize the parent class - super().__init__(df) - - def sprint_start(self, sprint: str) -> pd.Timestamp: - """Return the date on which a given sprint started.""" - sprint_mask = self.df[self.sprint_col] == sprint - sprint_start = self.df.loc[sprint_mask, self.sprint_start_col].min() - return sprint_start.tz_localize("UTC") - - def sprint_end(self, sprint: str) -> pd.Timestamp: - """Return the date on which a given sprint ended.""" - sprint_mask = self.df[self.sprint_col] == sprint - sprint_end = self.df.loc[sprint_mask, self.sprint_end_col].max() - return sprint_end.tz_localize("UTC") - - @property - def sprints(self) -> pd.DataFrame: - """Return the unique list of sprints with their start and end dates.""" - sprint_cols = [self.sprint_col, self.sprint_start_col, self.sprint_end_col] - return self.df[sprint_cols].drop_duplicates() - - @property - def current_sprint(self) -> str | None: - """Return the name of the current sprint, if a sprint is currently active.""" - return self.get_sprint_name_from_date(pd.Timestamp.today().floor("d")) - - def get_sprint_name_from_date(self, date: pd.Timestamp) -> str | None: - """Get the name of a sprint from a given date, if that date falls in a sprint.""" - # fmt: off - date_filter = ( - (self.sprints[self.sprint_start_col] < date) # after sprint start - & (self.sprints[self.sprint_end_col] >= date) # before sprint end - ) - 
# fmt: on
-        matching_sprints = self.sprints.loc[date_filter, self.sprint_col]
-        # if there aren't any sprints return None
-        if len(matching_sprints) == 0:
-            return None
-        # if there are, return the first value as a string
-        return str(matching_sprints.squeeze())
-
-    @classmethod
-    def load_from_json_files(
-        cls,
-        sprint_file: str = "data/sprint-data.json",
-        issue_file: str = "data/issue-data.json",
-    ) -> Self:
-        """
-        Load the input datasets and instantiate the SprintBoard class.
-
-        Parameters
-        ----------
-        sprint_file: str
-            Path to the local copy of sprint data exported from GitHub
-        issue_file: str
-            Path to the local copy of issue data exported from GitHub
-
-        Returns
-        -------
-        Self:
-            An instance of the SprintBoard dataset class
-
-        """
-        # load and merge input datasets
-        df_sprints = load_json_data_as_df(
-            file_path=sprint_file,
-            column_map=cls.SPRINT_COLUMN_MAP,
-            date_cols=cls.SPRINT_DATE_COLS,
-            key_for_nested_items="items",
-        )
-        df_issues = load_json_data_as_df(
-            file_path=issue_file,
-            column_map=cls.ISSUE_COLUMN_MAP,
-            date_cols=cls.ISSUE_DATE_COLS,
-        )
-        df = df_sprints.merge(df_issues, on="issue_number")
-        df = cls._apply_transformations(df)
-        return cls(df)
-
-    @classmethod
-    def _apply_transformations(cls, df: pd.DataFrame) -> pd.DataFrame:
-        """Apply column specific data transformations."""
-        # calculate sprint end date
-        df["sprint_duration"] = pd.to_timedelta(df["sprint_duration"], unit="day")
-        df["sprint_end_date"] = df["sprint_start_date"] + df["sprint_duration"]
-        # extract parent issue number from the milestone description
-        parent_issue_regex = r"(?: deliverable: \#)(?P<parent_issue_number>\d+)"
-        df["parent_issue_number"] = (
-            df["milestone_description"]
-            .str.extract(pat=parent_issue_regex, expand=False)
-            .astype("Int64")
-        )
-        return df
diff --git a/analytics/src/analytics/datasets/utils.py b/analytics/src/analytics/datasets/utils.py
index a7e641a083..e48f67541d 100644
--- a/analytics/src/analytics/datasets/utils.py
+++ b/analytics/src/analytics/datasets/utils.py
@@ -2,58 +2,6 @@
 
 import json
 
-import pandas as pd
-
-
-def load_json_data_as_df(
-    file_path: str,
-    column_map: dict,
-    date_cols: list[str] | None = None,
-    key_for_nested_items: str | None = None,
-) -> pd.DataFrame:
-    """
-    Load a file that contains JSON data and format is as a DataFrame.
-
-    Parameters
-    ----------
-    file_path: str
-        Path to the JSON file with the exported issue data
-    column_map: dict
-        Dictionary mapping of existing JSON keys to their new column names
-    date_cols: list[str]
-        List of columns that need to be converted to date types
-    key_for_nested_items: Optional[str]
-        Name of the key containing a list of objects to load as a dataframe.
- Only needed if the JSON loaded is an object instead of a list - - Returns - ------- - pd.DataFrame - Pandas dataframe with columns renamed to match the values of the column map - - Notes - ----- - TODO(@widal001): 2023-11-06 - Consider replacing column_map and date_cols with a - pydantic schema which would also allow us to do type validation and conversions - """ - # load json data from the local file - with open(file_path, encoding="utf-8") as f: - json_data = json.loads(f.read()) - # if the items we want to convert are nested under a key extract them - if key_for_nested_items: - json_data = json_data[key_for_nested_items] - # flatten the nested json into a dataframe - df = pd.json_normalize(json_data) - # reorder and rename the columns - df = df[column_map.keys()] - df = df.rename(columns=column_map) - # convert datetime columns to date - if date_cols: - for col in date_cols: - # strip off the timestamp portion of the date - df[col] = pd.to_datetime(df[col]).dt.floor("d") - return df - def load_json_file(path: str) -> list[dict]: """Load contents of a JSON file into a dictionary.""" diff --git a/analytics/src/analytics/integrations/github/__init__.py b/analytics/src/analytics/integrations/github/__init__.py index aa2f28b5a8..c34934cb01 100644 --- a/analytics/src/analytics/integrations/github/__init__.py +++ b/analytics/src/analytics/integrations/github/__init__.py @@ -1,15 +1,11 @@ """Export data from GitHub.""" __all__ = [ - "export_issue_data", - "export_project_data", "export_roadmap_data", "export_sprint_data", ] from analytics.integrations.github.main import ( - export_issue_data, - export_project_data, export_roadmap_data, export_sprint_data, ) diff --git a/analytics/src/analytics/integrations/github/main.py b/analytics/src/analytics/integrations/github/main.py index 7a5e4aa531..dc4633ace4 100644 --- a/analytics/src/analytics/integrations/github/main.py +++ b/analytics/src/analytics/integrations/github/main.py @@ -5,11 +5,6 @@ from pathlib import Path PARENT_DIR = Path(__file__).resolve().parent -# Set the max number of records to return with CLI commands to 10,000 -# NOTE: GitHub exports data in batches of 100 so exporting 10k issues could take over a minute -# TODO(@widal001): 2023-11-29 - Switch to incremental export pattern -# related issue: https://github.com/HHS/simpler-grants-gov/issues/775 -MAX_RECORDS = 10000 def pipe_command_output_to_file(command: str, output_file: str) -> None: @@ -22,25 +17,6 @@ def pipe_command_output_to_file(command: str, output_file: str) -> None: subprocess.call(shlex.split(command), stdout=f) # noqa: S603 -def export_project_data(owner: str, project: int, output_file: str) -> None: - """Export and write GitHub project data to a JSON file.""" - print(f"Exporting project data from {owner}/{project} to {output_file}") - command = ( - f"gh project item-list {project} --format json --owner {owner} -L {MAX_RECORDS}" - ) - pipe_command_output_to_file(command, output_file) - - -def export_issue_data(owner: str, repo: str, output_file: str) -> None: - """Export and write GitHub issue data to a JSON file.""" - print(f"Exporting issue data from {owner}/{repo} to {output_file}") - command = ( - f"gh issue list --json number,createdAt,closedAt,labels,title " - f"-R {owner}/{repo} -L {MAX_RECORDS} --state all" - ) - pipe_command_output_to_file(command, output_file) - - def export_sprint_data( owner: str, project: int, diff --git a/analytics/tests/datasets/test_deliverable_tasks.py b/analytics/tests/datasets/test_deliverable_tasks.py deleted file mode 100644 
index 0058a16046..0000000000 --- a/analytics/tests/datasets/test_deliverable_tasks.py +++ /dev/null @@ -1,330 +0,0 @@ -"""Tests for analytics/datasets/deliverable_tasks.py.""" - -import numpy as np # noqa: I001 - -from analytics.datasets.deliverable_tasks import DeliverableTasks -from tests.conftest import ( - DAY_1, - LABEL_10K, - LABEL_30K, - json_issue_row, - json_sprint_row, - json_roadmap_row, - write_test_data_to_file, -) - - -class TestLoadFromJsonFile: - """Tests the DeliverableTasks.load_from_json_file() class method.""" - - LABEL = LABEL_30K - ISSUE_FILE = "data/test-issue.json" - SPRINT_FILE = "data/test-sprint.json" - - def test_returns_correct_columns(self): - """The method should return a dataframe with the correct set of columns.""" - # setup - create test data for two different deliverables - sprint_data = [json_sprint_row(issue=1, parent_number=2)] - issue_data = [ - json_issue_row(issue=1, labels=["task"]), - json_issue_row(issue=2, labels=[self.LABEL]), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data into a sprint board - tasks = DeliverableTasks.load_from_json_files( - deliverable_label=self.LABEL, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - ) - # validation - check output columns - assert list(tasks.df.columns) == [ - "deliverable_number", - "deliverable_title", - "deliverable_status", - "issue_title", - "issue_number", - "points", - "status", - ] - - def test_join_correctly_on_deliverable_number(self): - """Tasks should be joined to 30k deliverables on deliverable number.""" - # setup - create test data for two different deliverables - sprint_data = [ - json_sprint_row(issue=1, parent_number=4), - json_sprint_row(issue=2, parent_number=4), - json_sprint_row(issue=3, parent_number=5), - ] - issue_data = [ - json_issue_row(issue=1, labels=["task"]), - json_issue_row(issue=2, labels=["task"]), - json_issue_row(issue=3, labels=["task"]), - json_issue_row(issue=4, labels=[self.LABEL]), - json_issue_row(issue=5, labels=[self.LABEL]), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data - df = DeliverableTasks.load_from_json_files( - deliverable_label=self.LABEL, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - ).df - df = df.set_index("issue_number") - # validation - check length of results - assert len(df) == 3 - # validation - check - assert df.loc[1, "deliverable_title"] == "Issue 4" - assert df.loc[2, "deliverable_title"] == "Issue 4" - assert df.loc[3, "deliverable_title"] == "Issue 5" - - def test_keep_30k_deliverables_without_tasks(self): - """30k deliverable tickets without tasks should still appear in the dataset.""" - # setup - create test data for two different deliverables - sprint_data = [json_sprint_row(issue=1, parent_number=2)] - issue_data = [ - json_issue_row(issue=1, labels=["task", "topic: frontend"]), - json_issue_row(issue=2, labels=[self.LABEL, "topic: data"]), - json_issue_row(issue=3, labels=[self.LABEL, "topic: data"]), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data - df = DeliverableTasks.load_from_json_files( - deliverable_label=self.LABEL, - sprint_file=self.SPRINT_FILE, - 
issue_file=self.ISSUE_FILE, - ).df - df = df.set_index("deliverable_number") - # validation - check length of results - assert len(df) == 2 - # validation - check - assert df.loc[2, "issue_title"] == "Issue 1" - assert df.loc[3, "issue_title"] is np.nan - - def test_status_is_closed_if_closed_date_is_none(self): - """The status should be 'closed' if closed_date field is None.""" - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, parent_number=3), - json_sprint_row(issue=2, parent_number=3), - ] - issue_data = [ - json_issue_row(issue=1, labels=["task"], closed_at=DAY_1), # closed - json_issue_row(issue=2, labels=["task"], closed_at=None), # open - json_issue_row(issue=3, labels=[self.LABEL], closed_at=None), # deliverable - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data into a sprint board - df = DeliverableTasks.load_from_json_files( - deliverable_label=self.LABEL, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - ).df - df = df.set_index("issue_number") - # validation - check length of results - assert len(df) == 2 - # validation - check - assert df.loc[1, "status"] == "closed" - assert df.loc[2, "status"] == "open" - - -class TestLoadFromJsonFilesWithRoadmapData: - """Test the load_from_json_files_with_roadmap_data() method.""" - - LABEL_30K = LABEL_30K - LABEL_10K = LABEL_10K - ISSUE_FILE = "data/test-issue.json" - SPRINT_FILE = "data/test-sprint.json" - ROADMAP_FILE = "data/test-roadmap.json" - - def test_returns_correct_columns(self): - """Return the correct columns in DeliverableTasks.""" - # setup - create test data for two different deliverables - sprint_data = [ - json_sprint_row(issue=1, deliverable=4), - json_sprint_row(issue=2, deliverable=4), - json_sprint_row(issue=3, deliverable=5), - ] - issue_data = [ - json_issue_row(issue=1), - json_issue_row(issue=2), - json_issue_row(issue=3), - ] - roadmap_data = [ - json_roadmap_row(issue=4, deliverable=4, status="Planning"), - json_roadmap_row(issue=5, deliverable=5, status="In progress"), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - write_test_data_to_file({"items": roadmap_data}, self.ROADMAP_FILE) - # execution - load data - df = DeliverableTasks.load_from_json_files_with_roadmap_data( - deliverable_label=self.LABEL_30K, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - roadmap_file=self.ROADMAP_FILE, - ).df - # validation - check length of results - assert len(df) == 3 - # validation - check output columns - assert list(df.columns) == [ - "deliverable_number", - "deliverable_title", - "deliverable_status", - "issue_title", - "issue_number", - "points", - "status", - ] - - def test_status_is_closed_if_closed_date_is_none(self): - """The status should be 'closed' if closed_date field is None.""" - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, deliverable=3), - json_sprint_row(issue=2, deliverable=3), - ] - issue_data = [ - json_issue_row(issue=1, closed_at=DAY_1), # closed - json_issue_row(issue=2, closed_at=None), # open - ] - roadmap_data = [ - json_roadmap_row(issue=3, deliverable=3, status="Planning"), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - 
write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - write_test_data_to_file({"items": roadmap_data}, self.ROADMAP_FILE) - # execution - load data into a sprint board - df = DeliverableTasks.load_from_json_files_with_roadmap_data( - deliverable_label=self.LABEL_30K, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - roadmap_file=self.ROADMAP_FILE, - ).df - df = df.set_index("issue_number") - # validation - check length of results - assert len(df) == 2 - # validation - check - assert df.loc[1, "status"] == "closed" - assert df.loc[2, "status"] == "open" - - def test_exclude_10k_deliverables_from_results(self): - """ - 10k deliverables should not be included in the final dataset. - - If we don't drop the 10k deliverables, then joining on "deliverable" - will result in a fan out that duplicates task-level issues - """ - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, deliverable=3), - json_sprint_row(issue=2, deliverable=3), - ] - issue_data = [ - json_issue_row(issue=1), - json_issue_row(issue=2), - ] - roadmap_data = [ # exclude the second item from final dataset - json_roadmap_row(issue=3, deliverable=3, labels=[self.LABEL_30K]), - json_roadmap_row(issue=4, deliverable=3, labels=[self.LABEL_10K]), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - write_test_data_to_file({"items": roadmap_data}, self.ROADMAP_FILE) - # execution - df = DeliverableTasks.load_from_json_files_with_roadmap_data( - deliverable_label=self.LABEL_30K, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - roadmap_file=self.ROADMAP_FILE, - ).df - df = df.set_index("issue_number") - # validation - confirm 10k was dropped and join didn't produce a fan out - assert len(df) == 2 - assert list(df.index) == [1, 2] - - def test_exclude_deliverables_without_labels(self): - """ - Deliverables that don't have labels should also be excluded. - - This test reproduces a bug that caused this function to break when row - didn't have labels. - """ - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, deliverable=3), - json_sprint_row(issue=2, deliverable=4), - ] - issue_data = [ - json_issue_row(issue=1), - json_issue_row(issue=2), - ] - roadmap_data = [ # exclude the second item from final dataset - json_roadmap_row(issue=3, deliverable=3, labels=[self.LABEL_30K]), - json_roadmap_row(issue=4, deliverable=4, labels=[self.LABEL_30K]), - ] - # remove the labels for the second deliverable to reproduce bug - roadmap_data[1]["labels"] = np.nan - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - write_test_data_to_file({"items": roadmap_data}, self.ROADMAP_FILE) - # execution - df = DeliverableTasks.load_from_json_files_with_roadmap_data( - deliverable_label=self.LABEL_30K, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - roadmap_file=self.ROADMAP_FILE, - ).df - df = df.set_index("issue_number") - # validation - assert the second deliverable was dropped - assert len(df) == 1 - assert df.loc[1, "deliverable_number"] == 3 - - def test_exclude_deliverables_without_deliverable_col_set(self): - """ - Deliverables that don't have the "deliverable" column set. - - This test reproduces a bug that incorrectly joins on null values. 
- """ - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, deliverable=3), - json_sprint_row(issue=2, deliverable=4), - ] - issue_data = [ - json_issue_row(issue=1), - json_issue_row(issue=2), - ] - roadmap_data = [ # exclude the second item from final dataset - json_roadmap_row(issue=3, deliverable=3), - json_roadmap_row(issue=4, deliverable=4), - ] - # set the deliverable column to None to reproduce bug - sprint_data[1]["deliverable"] = np.nan - roadmap_data[1]["deliverable"] = np.nan - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - write_test_data_to_file({"items": roadmap_data}, self.ROADMAP_FILE) - # execution - df = DeliverableTasks.load_from_json_files_with_roadmap_data( - deliverable_label=self.LABEL_30K, - sprint_file=self.SPRINT_FILE, - issue_file=self.ISSUE_FILE, - roadmap_file=self.ROADMAP_FILE, - ).df - df = df.set_index("issue_number") - # validation - assert the second deliverable was dropped - assert len(df) == 1 - assert df.loc[1, "deliverable_number"] == 3 diff --git a/analytics/tests/datasets/test_sprint_board.py b/analytics/tests/datasets/test_sprint_board.py deleted file mode 100644 index bab2bd69c3..0000000000 --- a/analytics/tests/datasets/test_sprint_board.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Tests for analytics/datasets/sprint_board.py.""" - -import pandas as pd -import pytest -from analytics.datasets.sprint_board import SprintBoard - -from tests.conftest import ( - DAY_0, - DAY_1, - DAY_2, - DAY_3, - DAY_4, - DAY_5, - json_issue_row, - json_sprint_row, - sprint_row, - write_test_data_to_file, -) - - -class TestSprintBoard: - """Tests the SprintBoard data class.""" - - ISSUE_FILE = "data/test-issue.json" - SPRINT_FILE = "data/test-sprint.json" - - def test_get_sprint_start_and_end_dates(self): - """Sprint start date should be returned correctly.""" - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=1, sprint_name="Sprint 1", sprint_date="2023-11-01"), - json_sprint_row(issue=2, sprint_name="Sprint 2", sprint_date="2023-11-16"), - ] - issue_data = [json_issue_row(issue=1), json_issue_row(issue=2)] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data into a sprint board - board = SprintBoard.load_from_json_files(self.SPRINT_FILE, self.ISSUE_FILE) - # validation - check sprint start dates - assert board.sprint_start("Sprint 1") == pd.Timestamp("2023-11-01", tz="UTC") - assert board.sprint_start("Sprint 2") == pd.Timestamp("2023-11-16", tz="UTC") - # validation - check sprint start dates - assert board.sprint_end("Sprint 1") == pd.Timestamp("2023-11-15", tz="UTC") - assert board.sprint_end("Sprint 2") == pd.Timestamp("2023-11-30", tz="UTC") - - def test_datasets_joined_on_issue_number_correctly(self): - """The datasets should be correctly joined on issue number.""" - # setup - create test data for two different sprints - sprint_data = [ - json_sprint_row(issue=111, sprint_name="Sprint 1"), - json_sprint_row(issue=222, sprint_name="Sprint 2"), - ] - issue_data = [ - json_issue_row(issue=111, created_at="2023-11-03"), - json_issue_row(issue=222, created_at="2023-11-16"), - ] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, 
self.SPRINT_FILE) - # execution - load data into a sprint board and extract the df - df = SprintBoard.load_from_json_files(self.SPRINT_FILE, self.ISSUE_FILE).df - df = df.set_index("issue_number") - # validation -- check that both rows are preserved - assert len(df) == 2 - # validation -- check that the sprints are matched to the right issue - assert df.loc[111]["sprint"] == "Sprint 1" - assert df.loc[222]["sprint"] == "Sprint 2" - # validation -- check that the correct created dates are preserved - assert df.loc[111]["created_date"] == pd.Timestamp("2023-11-03") - assert df.loc[222]["created_date"] == pd.Timestamp("2023-11-16") - - def test_drop_sprint_rows_that_are_not_found_in_issue_data(self): - """Sprint board items without a matching issue should be dropped.""" - # setup - create test data for two different sprints - sprint_data = [json_sprint_row(issue=111), json_sprint_row(issue=222)] - issue_data = [json_issue_row(issue=111)] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data into a sprint board and extract the df - df = SprintBoard.load_from_json_files(self.SPRINT_FILE, self.ISSUE_FILE).df - df = df.set_index("issue_number") - # validation -- check that issue 222 was dropped - assert len(df) == 1 - assert 222 not in list(df.index) - - @pytest.mark.parametrize( - "parent_number", - [222, 333, 444], # run this test against multiple inputs - ) - def test_extract_parent_issue_correctly(self, parent_number: int): - """The parent issue number should be extracted from the milestone description.""" - # setup - create test data for two different sprints - sprint_data = [json_sprint_row(issue=111, parent_number=parent_number)] - issue_data = [json_issue_row(issue=111)] - # setup - write test data to json files - write_test_data_to_file(issue_data, self.ISSUE_FILE) - write_test_data_to_file({"items": sprint_data}, self.SPRINT_FILE) - # execution - load data into a sprint board and extract the df - df = SprintBoard.load_from_json_files(self.SPRINT_FILE, self.ISSUE_FILE).df - df = df.set_index("issue_number") - # validation -- check that issue 111's parent_issue_number is 222 - assert df.loc[111]["parent_issue_number"] == parent_number - - -class TestGetSprintNameFromDate: - """Test the SprintBoard.get_sprint_name_from_date() method.""" - - @pytest.mark.parametrize( - ("date", "expected"), - [ - (DAY_1, "Sprint 1"), - (DAY_2, "Sprint 1"), - (DAY_4, "Sprint 2"), - (DAY_5, "Sprint 2"), - ], - ) - def test_return_name_if_matching_sprint_exists(self, date: str, expected: str): - """Test that correct sprint is returned if date exists in a sprint.""" - # setup - create sample dataset - board_data = [ - sprint_row(issue=1, sprint=1, sprint_start=DAY_0, sprint_length=3), - sprint_row(issue=2, sprint=1, sprint_start=DAY_0, sprint_length=3), - sprint_row(issue=3, sprint=2, sprint_start=DAY_3, sprint_length=3), - ] - board = SprintBoard.from_dict(board_data) - # validation - sprint_date = pd.Timestamp(date) - sprint_name = board.get_sprint_name_from_date(sprint_date) - assert sprint_name == expected - - def test_return_none_if_no_matching_sprint(self): - """The method should return None if no sprint contains the date.""" - # setup - create sample dataset - board_data = [ - sprint_row(issue=1, sprint=1, sprint_start=DAY_1), - sprint_row(issue=2, sprint=2, sprint_start=DAY_4), - ] - board = SprintBoard.from_dict(board_data) - # validation - bad_date = 
pd.Timestamp("1900-01-01") - sprint_name = board.get_sprint_name_from_date(bad_date) - assert sprint_name is None - - def test_return_previous_sprint_if_date_is_start_of_next_sprint(self): - """ - Test correct behavior for sprint end/start dates. - - If date provided is both the the end of one sprint and the beginning of - another, then return the name of the sprint that just ended. - """ - # setup - create sample dataset - board_data = [ - sprint_row(issue=1, sprint=1, sprint_start=DAY_1, sprint_length=2), - sprint_row(issue=2, sprint=2, sprint_start=DAY_3, sprint_length=2), - ] - board = SprintBoard.from_dict(board_data) - # execution - bad_date = pd.Timestamp(DAY_3) # end of sprint 1 and start of sprint 2 - sprint_name = board.get_sprint_name_from_date(bad_date) - assert sprint_name == "Sprint 1" diff --git a/analytics/tests/etl/test_github.py b/analytics/tests/etl/test_github.py index 44529f6f6c..7fca688475 100644 --- a/analytics/tests/etl/test_github.py +++ b/analytics/tests/etl/test_github.py @@ -15,10 +15,16 @@ RoadmapConfig, SprintBoardConfig, get_parent_with_type, + populate_issue_lookup_table, ) +from analytics.integrations import github from tests.conftest import issue +# =========================================================== +# Fixtures +# =========================================================== + @pytest.fixture(name="config") def mock_config(tmp_path: Path) -> GitHubProjectConfig: @@ -66,121 +72,161 @@ def mock_roadmap_data_file(config: GitHubProjectConfig) -> str: return roadmap_file -def test_extract(monkeypatch: pytest.MonkeyPatch, etl: GitHubProjectETL): - """Test the extract step by mocking export functions.""" - mock_export_roadmap_data = MagicMock() - mock_export_sprint_data = MagicMock() - monkeypatch.setattr(etl, "_export_roadmap_data", mock_export_roadmap_data) - monkeypatch.setattr(etl, "_export_sprint_data", mock_export_sprint_data) +# =========================================================== +# Test ETL class +# =========================================================== - # Run the extract method - etl.extract() - # Assert roadmap export was called with expected arguments - roadmap = etl.config.roadmap_project - mock_export_roadmap_data.assert_called_once_with( - roadmap=roadmap, - output_file=str(Path(etl.config.temp_dir) / "roadmap-data.json"), - ) +class TestGitHubProjectETL: + """Tests the GitHubProjectETL class.""" - # Assert sprint export was called with expected arguments - sprint_board = etl.config.sprint_projects[0] - mock_export_sprint_data.assert_called_once_with( - sprint_board=sprint_board, - output_file=str( - Path(etl.config.temp_dir) - / f"sprint-data-{sprint_board.project_number}.json", - ), - ) + def test_extract( + self, + monkeypatch: pytest.MonkeyPatch, + etl: GitHubProjectETL, + ): + """Test the extract step by mocking export functions.""" + mock_export_roadmap_data = MagicMock() + mock_export_sprint_data = MagicMock() + monkeypatch.setattr(etl, "_export_roadmap_data", mock_export_roadmap_data) + monkeypatch.setattr(etl, "_export_sprint_data", mock_export_sprint_data) - # Verify transient files were set correctly - assert len(etl._transient_files) == 1 - assert etl._transient_files[0].roadmap.endswith("roadmap-data.json") - assert etl._transient_files[0].sprint.endswith( - f"sprint-data-{sprint_board.project_number}.json", - ) + # Run the extract method + etl.extract() + # Assert roadmap export was called with expected arguments + roadmap = etl.config.roadmap_project + mock_export_roadmap_data.assert_called_once_with( + 
roadmap=roadmap, + output_file=str(Path(etl.config.temp_dir) / "roadmap-data.json"), + ) -def test_transform(etl: GitHubProjectETL, sprint_file: str, roadmap_file: str): - """Test the transform step by mocking GitHubIssues.load_from_json_files.""" - # Arrange - output_data = [ - issue( - issue=1, - points=2, - parent="Epic3", - deliverable="Deliverable5", - quad="quad1", - epic="Epic3", - ), - issue( - issue=2, - points=1, - parent="Epic4", - deliverable=None, - quad=None, - epic="Epic4", - ), - ] - wanted = [i.model_dump() for i in output_data] - etl._transient_files = [InputFiles(roadmap=roadmap_file, sprint=sprint_file)] - # Act - etl.transform() - # Assert - assert etl.dataset.to_dict() == wanted - - -def test_load(etl: GitHubProjectETL): - """Test the load step by mocking the to_json method.""" - mock_to_json = MagicMock() - etl.dataset = MagicMock() - etl.dataset.to_json = mock_to_json - - # Run the load method - etl.load() - - # Check if to_json was called with the correct output file - mock_to_json.assert_called_once_with(etl.config.output_file) - - -def test_run( - monkeypatch: pytest.MonkeyPatch, - etl: GitHubProjectETL, - sprint_file: str, - roadmap_file: str, -): - """Test the entire ETL pipeline by verifying method calls in run.""" - # Arrange - Mock the export private methods - mock_export_roadmap_data = MagicMock() - mock_export_sprint_data = MagicMock() - monkeypatch.setattr(etl, "_export_roadmap_data", mock_export_roadmap_data) - monkeypatch.setattr(etl, "_export_sprint_data", mock_export_sprint_data) - # Arrange - specify the output wanted - output_data = [ - issue( - issue=1, - points=2, - parent="Epic3", - deliverable="Deliverable5", - quad="quad1", - epic="Epic3", - ), - issue( - issue=2, - points=1, - parent="Epic4", - deliverable=None, - quad=None, - epic="Epic4", - ), - ] - dataset_wanted = [i.model_dump() for i in output_data] - files_wanted = [InputFiles(roadmap=roadmap_file, sprint=sprint_file)] - # Act - run the ETL - etl.run() - # Assert - assert etl._transient_files == files_wanted - assert etl.dataset.to_dict() == dataset_wanted + # Assert sprint export was called with expected arguments + sprint_board = etl.config.sprint_projects[0] + mock_export_sprint_data.assert_called_once_with( + sprint_board=sprint_board, + output_file=str( + Path(etl.config.temp_dir) + / f"sprint-data-{sprint_board.project_number}.json", + ), + ) + + # Verify transient files were set correctly + assert len(etl._transient_files) == 1 + assert etl._transient_files[0].roadmap.endswith("roadmap-data.json") + assert etl._transient_files[0].sprint.endswith( + f"sprint-data-{sprint_board.project_number}.json", + ) + + def test_transform( + self, + etl: GitHubProjectETL, + sprint_file: str, + roadmap_file: str, + ): + """Test the transform step by mocking GitHubIssues.load_from_json_files.""" + # Arrange + output_data = [ + issue( + issue=1, + points=2, + parent="Epic3", + deliverable="Deliverable5", + quad="quad1", + epic="Epic3", + ), + issue( + issue=2, + points=1, + parent="Epic4", + deliverable=None, + quad=None, + epic="Epic4", + ), + ] + wanted = [i.model_dump() for i in output_data] + etl._transient_files = [InputFiles(roadmap=roadmap_file, sprint=sprint_file)] + # Act + etl.transform() + # Assert + assert etl.dataset.to_dict() == wanted + + def test_load(self, etl: GitHubProjectETL): + """Test the load step by mocking the to_json method.""" + mock_to_json = MagicMock() + etl.dataset = MagicMock() + etl.dataset.to_json = mock_to_json + + # Run the load method + etl.load() + + # Check 
if to_json was called with the correct output file + mock_to_json.assert_called_once_with(etl.config.output_file) + + def test_run( + self, + monkeypatch: pytest.MonkeyPatch, + etl: GitHubProjectETL, + sprint_file: str, + roadmap_file: str, + ): + """Test the entire ETL pipeline by verifying method calls in run.""" + # Arrange - Mock the export private methods + monkeypatch.setattr(github, "export_roadmap_data", MagicMock()) + monkeypatch.setattr(github, "export_sprint_data", MagicMock()) + # Arrange - specify the output wanted + output_data = [ + issue( + issue=1, + points=2, + parent="Epic3", + deliverable="Deliverable5", + quad="quad1", + epic="Epic3", + ), + issue( + issue=2, + points=1, + parent="Epic4", + deliverable=None, + quad=None, + epic="Epic4", + ), + ] + dataset_wanted = [i.model_dump() for i in output_data] + files_wanted = [InputFiles(roadmap=roadmap_file, sprint=sprint_file)] + # Act - run the ETL + etl.run() + # Assert + assert etl._transient_files == files_wanted + assert etl.dataset.to_dict() == dataset_wanted + + +# =========================================================== +# Test ETL helper functions +# =========================================================== + + +class TestPopulateLookupTable: + """Test the populate_lookup_table() function.""" + + def test_drop_issues_with_validation_errors(self): + """Issues with validation errors should be excluded from the lookup table.""" + # Arrange + test_data = [ + issue(issue=1).model_dump(), + issue(issue=2).model_dump(), + { + "issue_url": "bad_issue", + "issue_points": "foo", + }, # missing required field and wrong type for points + ] + wanted = 2 + # Act + got = populate_issue_lookup_table(lookup={}, issues=test_data) + # Assert + assert len(got) == wanted + assert "bad_issue" not in got class TestGetParentWithType: @@ -282,3 +328,18 @@ def test_return_none_if_deliverable_is_not_found_in_parents(self): ) # Assert assert got == wanted + + def test_raise_value_error_if_child_url_not_in_lookup(self): + """Raise a value error if the child_url isn't found in lookup table.""" + # Arrange + task = "Task1" + lookup = { + task: issue(issue=1, kind=IssueType.TASK), + } + # Act + with pytest.raises(ValueError, match="Lookup doesn't contain"): + get_parent_with_type( + child_url="fake", + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) diff --git a/documentation/analytics/development.md b/documentation/analytics/development.md index c078d83d99..918e11c822 100644 --- a/documentation/analytics/development.md +++ b/documentation/analytics/development.md @@ -58,8 +58,8 @@ After choosing your approach, following the corresponding setup instructions: - Add `export GH_TOKEN=...` to your `zshrc` or similar 3. Set the slackbot token and the channel ID for Slack after following the instructions in [configuring secrets](#configuring-secrets). **Note:** replace the `...` with the value of these secrets: ``` - export SLACK_BOT_TOKEN=... - export REPORTING_CHANNEL_ID=... + export ANALYTICS_SLACK_BOT_TOKEN=... + export ANALYTICS_REPORTING_CHANNEL_ID=... ``` 4. Run `make test-audit` to confirm the application is running correctly. @@ -86,8 +86,8 @@ After choosing your approach, following the corresponding setup instructions: - Add `export GH_TOKEN=...` to your `zshrc` or similar 3. Set the slackbot token and the channel ID for Slack after following the instructions in [configuring secrets](#configuring-secrets). **Note:** replace the `...` with the value of these secrets: ``` - export SLACK_BOT_TOKEN=... 
- export REPORTING_CHANNEL_ID=...
+ export ANALYTICS_SLACK_BOT_TOKEN=...
+ export ANALYTICS_REPORTING_CHANNEL_ID=...
   ```
4. Run `make test-audit` to confirm the application is running correctly.

diff --git a/documentation/analytics/usage.md b/documentation/analytics/usage.md
index 9086aacc8f..c84a4f0210 100644
--- a/documentation/analytics/usage.md
+++ b/documentation/analytics/usage.md
@@ -144,6 +144,8 @@ Once you've exported the sprint and issue data from GitHub, you can start calcul
 poetry run analytics calculate sprint_burndown \
   --issue-file data/delivery-data.json \
   --sprint "@current" \
+  --owner HHS \
+  --project 13 \
   --unit points \
   --show-results
 ```
@@ -158,6 +160,7 @@ A couple of important notes about this command:
 
 - `--issue-file data/delivery-data.json` refers to the output of `poetry run export gh_delivery_data` which exports issue and sprint data from GitHub
 - `--sprint @current` In order to calculate burndown, you'll need to specify either `"@current"` for the current sprint or the name of another sprint, e.g. `"Sprint 10"`
+- `--owner HHS` and `--project 13` You can also specify which GitHub project owner and number you want to calculate burndown for; the defaults are `HHS` and `13`, respectively.
 - `--unit points` In order to calculate burndown based on story points, you pass `points` to the `--unit` option. The other option for unit is `issues`
 - `--show-results` In order to see the output in a browser you'll need to pass this flag.
 
@@ -165,14 +168,20 @@ A couple of important notes about this command:
 
 You can also post the results of this metric to a Slack channel:
 
+> [!NOTE] You must have the following environment variables set to post to Slack:
+> - `ANALYTICS_SLACK_BOT_TOKEN` the OAuth token for a slackbot installed in your workspace
+> - `ANALYTICS_REPORTING_CHANNEL_ID` the ID of the channel you want to post to in Slack.
+>
+> For more information about setting up these variables, see the [installation guide](development.md#configuring-secrets)
+
 ```bash
-poetry run analytics calculate sprint_burndown --sprint-file data/sprint-data.json --issue-file data/issue-data.json --sprint "Sprint 10" --unit points --post-results
+poetry run analytics calculate sprint_burndown \
+  --issue-file data/delivery-data.json \
+  --sprint "@current" \
+  --unit points \
+  --post-results
 ```
 
-> **NOTE:** This requires you to have the `.secrets.toml` configured according to the directions in step 5 of the [installation section](#installation)
-
-![Screenshot of burndown report in slack](../../analytics/static/screenshot-slack-burndown.png)
-
 ### Calculating deliverable percent complete
 
 Another key metric you can report is the percentage of issues or points completed per 30k deliverable.
@@ -181,14 +190,20 @@ You can specify the unit you want to use for percent complete (e.g. points or is
 For example, here we're calculating percentage completion based on the number of tickets under each deliverable.
 
 ```bash
-poetry run analytics calculate deliverable_percent_complete --sprint-file data/sprint-data.json --issue-file data/issue-data.json --show-results --unit issues
+poetry run analytics calculate deliverable_percent_complete \
+  --issue-file data/delivery-data.json \
+  --show-results \
+  --unit issues
 ```
 
 ![Screenshot of deliverable percent complete by issues](../../analytics/static/screenshot-deliverable-pct-complete-tasks.png)
 
 And here we're calculating it based on the total story point value of those tickets.
```bash
-poetry run analytics calculate deliverable_percent_complete --sprint-file data/sprint-data.json --issue-file data/issue-data.json --show-results --unit points
+poetry run analytics calculate deliverable_percent_complete \
+  --issue-file data/delivery-data.json \
+  --show-results \
+  --unit points
 ```
 
 ![Screenshot of deliverable percent complete by points](../../analytics/static/screenshot-deliverable-pct-complete-points.png)
@@ -196,20 +211,13 @@ poetry run analytics calculate deliverable_percent_complete --sprint-file data/s
 
 The `deliverable_percent_complete` sub-command also supports the `--post-results` flag if you want to post this data to Slack.
 
-### Experimental features
-
-We also have some flags that enable experimental features for the deliverables. The currently supported flags for `calculate deliverable_percent_complete` are:
-
-- `--roadmap-file` Accepts a path to a file that loads data exported from the Product roadmap GitHub project. This also uses a different join path to associate issues with their parent deliverables.
-- `--include-status` Accepts the name of a status to include in the report. Can be passed multiple times to include multiple statuses.
+You can also pass the `--include-status` flag to limit the percent complete report to deliverables with specific statuses. It can be passed multiple times to include multiple statuses.
 
 Here's an example of how to use this flag in practice:
 
 ```bash
 poetry run analytics calculate deliverable_percent_complete \
-  --sprint-file data/sprint-data.json \
-  --issue-file data/issue-data.json \
-  --roadmap-file data/roadmap-data.json \
+  --issue-file data/delivery-data.json \
   --include-status "In Progress" \
   --include-status "Planning" \
   --show-results \

From cfee98705ba41ff520e69d8b4ae5768cb89aa537 Mon Sep 17 00:00:00 2001
From: doug-s-nava <92806979+doug-s-nava@users.noreply.github.com>
Date: Tue, 5 Nov 2024 10:28:01 -0500
Subject: [PATCH 03/13] [Issue 1890] add GA beacon for filter use on search
 (#2626)

* Adds "search_attempt" event to track each load of the search results page
  and pass along a list of filters in place
---
 frontend/src/app/[locale]/search/page.tsx     | 21 +++------
 .../src/components/search/SearchAnalytics.tsx | 23 ++++++++++
 .../src/types/search/searchRequestTypes.ts    | 12 +++++
 .../search/SearchAnalytics.test.tsx           | 46 +++++++++++++++++++
 4 files changed, 87 insertions(+), 15 deletions(-)
 create mode 100644 frontend/src/components/search/SearchAnalytics.tsx
 create mode 100644 frontend/tests/components/search/SearchAnalytics.test.tsx

diff --git a/frontend/src/app/[locale]/search/page.tsx b/frontend/src/app/[locale]/search/page.tsx
index 453a70a820..2c0329d0c2 100644
--- a/frontend/src/app/[locale]/search/page.tsx
+++ b/frontend/src/app/[locale]/search/page.tsx
@@ -1,5 +1,7 @@
 import { Metadata } from "next";
+import QueryProvider from "src/app/[locale]/search/QueryProvider";
 import withFeatureFlag from "src/hoc/search/withFeatureFlag";
+import { SearchParamsTypes } from "src/types/search/searchRequestTypes";
 import { Breakpoints } from "src/types/uiTypes";
 import { convertSearchParamsToProperTypes } from "src/utils/search/convertSearchParamsToProperTypes";
 
@@ -7,10 +9,10 @@ import { useTranslations } from "next-intl";
 import { getTranslations, unstable_setRequestLocale } from "next-intl/server";
 
 import ContentDisplayToggle from "src/components/ContentDisplayToggle";
+import SearchAnalytics from "src/components/search/SearchAnalytics";
 import SearchBar from "src/components/search/SearchBar";
 import SearchFilters from
"src/components/search/SearchFilters"; import SearchResults from "src/components/search/SearchResults"; -import QueryProvider from "./QueryProvider"; export async function generateMetadata() { const t = await getTranslations({ locale: "en" }); @@ -20,22 +22,10 @@ export async function generateMetadata() { }; return meta; } - -interface searchParamsTypes { - agency?: string; - category?: string; - eligibility?: string; - fundingInstrument?: string; - page?: string; - query?: string; - sortby?: string; - status?: string; - [key: string]: string | undefined; -} - -function Search({ searchParams }: { searchParams: searchParamsTypes }) { +function Search({ searchParams }: { searchParams: SearchParamsTypes }) { unstable_setRequestLocale("en"); const t = useTranslations("Search"); + const convertedSearchParams = convertSearchParamsToProperTypes(searchParams); const { agency, category, eligibility, fundingInstrument, query, status } = convertedSearchParams; @@ -46,6 +36,7 @@ function Search({ searchParams }: { searchParams: searchParamsTypes }) { return ( <> +
diff --git a/frontend/src/components/search/SearchAnalytics.tsx b/frontend/src/components/search/SearchAnalytics.tsx new file mode 100644 index 0000000000..667ddb672f --- /dev/null +++ b/frontend/src/components/search/SearchAnalytics.tsx @@ -0,0 +1,23 @@ +"use client"; + +import { sendGAEvent } from "@next/third-parties/google"; +import { omit } from "lodash"; +import { SearchParamsTypes } from "src/types/search/searchRequestTypes"; + +import { useEffect } from "react"; + +const getCurrentFilters = (params: SearchParamsTypes): string => { + return JSON.stringify(omit(params, "query", "page")); +}; + +function SearchAnalytics({ params }: { params: SearchParamsTypes }) { + useEffect(() => { + // send list of filters defined in page query params on each page load + sendGAEvent("event", "search_attempt", { + search_filters: getCurrentFilters(params), + }); + }, [params]); + return <>; +} + +export default SearchAnalytics; diff --git a/frontend/src/types/search/searchRequestTypes.ts b/frontend/src/types/search/searchRequestTypes.ts index c77ebd16c2..40d40cedfd 100644 --- a/frontend/src/types/search/searchRequestTypes.ts +++ b/frontend/src/types/search/searchRequestTypes.ts @@ -64,3 +64,15 @@ export interface QueryParamData { actionType?: SearchFetcherActionType; fieldChanged?: string; } + +export interface SearchParamsTypes { + agency?: string; + category?: string; + eligibility?: string; + fundingInstrument?: string; + page?: string; + query?: string; + sortby?: string; + status?: string; + [key: string]: string | undefined; +} diff --git a/frontend/tests/components/search/SearchAnalytics.test.tsx b/frontend/tests/components/search/SearchAnalytics.test.tsx new file mode 100644 index 0000000000..134bf48064 --- /dev/null +++ b/frontend/tests/components/search/SearchAnalytics.test.tsx @@ -0,0 +1,46 @@ +import { render } from "@testing-library/react"; + +import SearchAnalytics from "src/components/search/SearchAnalytics"; + +const sendGAEventMock = jest.fn(); + +jest.mock("@next/third-parties/google", () => ({ + /* eslint-disable-next-line @typescript-eslint/no-unsafe-return */ + sendGAEvent: (...args: unknown[]) => sendGAEventMock(...args), +})); + +describe("SearchAnalytics", () => { + it("calls sendGAEvent with expected params on render", () => { + const { rerender } = render( + , + ); + expect(sendGAEventMock).toHaveBeenCalledWith("event", "search_attempt", { + search_filters: + '{"fundingInstrument":"cooperative_agreement","status":"posted, archived","agency":"AC,PAMS-SC"}', + }); + + rerender( + , + ); + expect(sendGAEventMock).toHaveBeenCalledWith("event", "search_attempt", { + search_filters: + '{"status":"posted, archived, closed","agency":"AC","category":"recovery_act"}', + }); + }); +}); From 3e7a90638b6ab968de076f60f27c67c8a26ba310 Mon Sep 17 00:00:00 2001 From: doug-s-nava <92806979+doug-s-nava@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:57:36 -0500 Subject: [PATCH 04/13] [Issue 2709] add opportunity number to breadcrumb (#2727) * Adds the opportunity number to the end of the breadcrumb text on the opportunity listing page * Fixes the casing on the search "sort by" label --- .../app/[locale]/opportunity/[id]/page.tsx | 4 +-- frontend/src/components/Breadcrumbs.tsx | 30 +++++++++---------- frontend/src/i18n/messages/en/index.ts | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/frontend/src/app/[locale]/opportunity/[id]/page.tsx b/frontend/src/app/[locale]/opportunity/[id]/page.tsx index 0c6af29de4..4431140a67 100644 --- 
a/frontend/src/app/[locale]/opportunity/[id]/page.tsx
+++ b/frontend/src/app/[locale]/opportunity/[id]/page.tsx
@@ -100,8 +100,8 @@ async function OpportunityListing({ params }: { params: { id: string } }) {
     : emptySummary();
 
   breadcrumbs.push({
-    title: opportunityData.opportunity_title,
-    path: `/opportunity/${opportunityData.opportunity_id}/`,
+    title: `${opportunityData.opportunity_title}: ${opportunityData.opportunity_number}`,
+    path: `/opportunity/${opportunityData.opportunity_id}/`, // unused but required in breadcrumb implementation
   });
 
   return (
diff --git a/frontend/src/components/Breadcrumbs.tsx b/frontend/src/components/Breadcrumbs.tsx
index fdd164a59d..88fb811ca4 100644
--- a/frontend/src/components/Breadcrumbs.tsx
+++ b/frontend/src/components/Breadcrumbs.tsx
@@ -16,22 +16,22 @@ type Props = {
   breadcrumbList: BreadcrumbList;
 };
 
-const Breadcrumbs = ({ breadcrumbList }: Props) => {
-  const rdfaMetadata = {
-    ol: {
-      vocab: "http://schema.org/",
-      typeof: "BreadcrumbList",
-    },
-    li: {
-      property: "itemListElement",
-      typeof: "ListItem",
-    },
-    a: {
-      property: "item",
-      typeof: "WebPage",
-    },
-  };
+const rdfaMetadata = {
+  ol: {
+    vocab: "http://schema.org/",
+    typeof: "BreadcrumbList",
+  },
+  li: {
+    property: "itemListElement",
+    typeof: "ListItem",
+  },
+  a: {
+    property: "item",
+    typeof: "WebPage",
+  },
+};
+const Breadcrumbs = ({ breadcrumbList }: Props) => {
   const breadcrumArray = breadcrumbList.map((breadcrumbInfo, i) => {
     return (
Date: Tue, 5 Nov 2024 12:57:00 -0500
Subject: [PATCH 05/13] [Issue #2510] Update search logic for /search path
 (#2704)

## Summary

Fixes #2510

### Time to review: __5 mins__

## Changes proposed

Change the logic to the following:

### 1. user visits `/search`

forecasted,posted selected while no changes are made to the path or query params:

![Image](https://github.com/user-attachments/assets/2b3cafee-f399-45d3-9bcd-a4df497fba73)

### 2. user deselects all statuses

statuses stay unselected; `status=none` is set in the URL when no status is selected

![Image](https://github.com/user-attachments/assets/3738990b-fb10-458d-bb87-4b1d0879b3a1)

### 3. user selects one status when none are selected

`status=none` is replaced by the selected status:

![Image](https://github.com/user-attachments/assets/71fe82d2-1e26-4ab8-a2c1-4893dbacecf1)
### 4. all other user selections and actions stay the same

---------

Co-authored-by: doug-s-nava <92806979+doug-s-nava@users.noreply.github.com>
---
 frontend/src/components/Header.tsx            |  2 +-
 .../search/SearchOpportunityStatus.tsx        |  5 +++
 frontend/src/constants/breadcrumbs.ts         |  2 +-
 frontend/src/constants/search.ts              |  2 +
 .../src/types/search/searchRequestTypes.ts    |  2 +
 .../convertSearchParamsToProperTypes.ts       | 16 +++++--
 .../tests/e2e/search/search-loading.spec.ts   |  2 +-
 .../tests/e2e/search/search-navigate.spec.ts  | 22 +--------
 .../e2e/search/search-no-results.spec.ts      |  7 +--
 frontend/tests/e2e/search/search.spec.ts      | 20 +++++----
 frontend/tests/e2e/search/searchSpecUtil.ts   |  3 +-
 frontend/tests/pages/search/page.test.tsx     | 45 +++++++++++++++++++
 12 files changed, 88 insertions(+), 40 deletions(-)
 create mode 100644 frontend/src/constants/search.ts

diff --git a/frontend/src/components/Header.tsx b/frontend/src/components/Header.tsx
index 9ffa1750c5..7292075de9 100644
--- a/frontend/src/components/Header.tsx
+++ b/frontend/src/components/Header.tsx
@@ -43,7 +43,7 @@ const Header = ({ logoPath, locale }: Props) => {
   ];
   const searchNavLink = {
     i18nKey: t("nav_link_search"),
-    href: "/search?status=forecasted,posted",
+    href: "/search",
   };
   if (featureFlagsManager.isFeatureEnabled("showSearchV0")) {
     primaryLinksRef.current.splice(1, 0, searchNavLink);
diff --git a/frontend/src/components/search/SearchOpportunityStatus.tsx b/frontend/src/components/search/SearchOpportunityStatus.tsx
index 373a0413f2..6fd4ea72a6 100644
--- a/frontend/src/components/search/SearchOpportunityStatus.tsx
+++ b/frontend/src/components/search/SearchOpportunityStatus.tsx
@@ -1,6 +1,7 @@
 "use client";
 
 import { QueryContext } from "src/app/[locale]/search/QueryProvider";
+import { SEARCH_NO_STATUS_VALUE } from "src/constants/search";
 import { useSearchParamUpdater } from "src/hooks/useSearchParamUpdater";
 
 import { useTranslations } from "next-intl";
@@ -26,6 +27,10 @@ export default function SearchOpportunityStatus({
   const handleCheck = (value: string, isChecked: boolean) => {
     const updated = new Set(query);
     isChecked ? updated.add(value) : updated.delete(value);
+    // Add "status=none" (SEARCH_NO_STATUS_VALUE) if no values are selected.
+    if (updated.size === 0) {
+      updated.add(SEARCH_NO_STATUS_VALUE);
+    }
     updateQueryParams(updated, "status", queryTerm);
   };
 
diff --git a/frontend/src/constants/breadcrumbs.ts b/frontend/src/constants/breadcrumbs.ts
index 5b9025dcb9..c747c5b1d1 100644
--- a/frontend/src/constants/breadcrumbs.ts
+++ b/frontend/src/constants/breadcrumbs.ts
@@ -6,7 +6,7 @@ const PROCESS: Breadcrumb = { title: "Process", path: "/process/" };
 const SUBSCRIBE: Breadcrumb = { title: "Subscribe", path: "/subscribe/" };
 const SEARCH: Breadcrumb = {
   title: "Search",
-  path: "/search?status=forecasted,posted",
+  path: "/search",
 };
 export const SUBSCRIBE_CONFIRMATION: Breadcrumb = {
   title: "Confirmation",
diff --git a/frontend/src/constants/search.ts b/frontend/src/constants/search.ts
new file mode 100644
index 0000000000..c7e9ea3acf
--- /dev/null
+++ b/frontend/src/constants/search.ts
@@ -0,0 +1,2 @@
+// Status value indicating that no status filter is selected ("show all").
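+// Example: "/search?status=none" keeps every status checkbox unchecked instead of
+// falling back to the default forecasted/posted statuses.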
+export const SEARCH_NO_STATUS_VALUE = "none";
diff --git a/frontend/src/types/search/searchRequestTypes.ts b/frontend/src/types/search/searchRequestTypes.ts
index 40d40cedfd..aa8c083793 100644
--- a/frontend/src/types/search/searchRequestTypes.ts
+++ b/frontend/src/types/search/searchRequestTypes.ts
@@ -34,6 +34,8 @@ export enum SearchFetcherActionType {
   Update = "update",
 }
 
+export type QuerySetParam = string | string[] | undefined;
+
 export type SortOptions =
   | "relevancy"
   | "postedDateDesc"
diff --git a/frontend/src/utils/search/convertSearchParamsToProperTypes.ts b/frontend/src/utils/search/convertSearchParamsToProperTypes.ts
index 98498e267d..1ece9b231a 100644
--- a/frontend/src/utils/search/convertSearchParamsToProperTypes.ts
+++ b/frontend/src/utils/search/convertSearchParamsToProperTypes.ts
@@ -1,5 +1,7 @@
+import { SEARCH_NO_STATUS_VALUE } from "src/constants/search";
 import {
   QueryParamData,
+  QuerySetParam,
   SearchFetcherActionType,
   SortOptions,
 } from "src/types/search/searchRequestTypes";
@@ -16,7 +18,7 @@ export function convertSearchParamsToProperTypes(
   return {
     ...params,
     query: params.query || "", // Convert empty string to null if needed
-    status: paramToSet(params.status),
+    status: paramToSet(params.status, "status"),
     fundingInstrument: paramToSet(params.fundingInstrument),
     eligibility: paramToSet(params.eligibility),
     agency: paramToSet(params.agency),
@@ -30,8 +32,16 @@ export function convertSearchParamsToProperTypes(
 }
 
 // Helper function to convert query parameters to set
-function paramToSet(param: string | string[] | undefined): Set<string> {
-  if (!param) return new Set();
+// and to handle the status param: defaults when it's missing, an empty set when status=none
+function paramToSet(param: QuerySetParam, type?: string): Set<string> {
+  if (!param && type === "status") {
+    return new Set(["forecasted", "posted"]);
+  }
+
+  if (!param || (type === "status" && param === SEARCH_NO_STATUS_VALUE)) {
+    return new Set();
+  }
+
   if (Array.isArray(param)) {
     return new Set(param);
   }
diff --git a/frontend/tests/e2e/search/search-loading.spec.ts b/frontend/tests/e2e/search/search-loading.spec.ts
index bef38bc942..e5d608c7a9 100644
--- a/frontend/tests/e2e/search/search-loading.spec.ts
+++ b/frontend/tests/e2e/search/search-loading.spec.ts
@@ -13,7 +13,7 @@ test.describe("Search page tests", () => {
     const browser = await chromium.launch({ slowMo: 100 });
     const page = await browser.newPage();
 
-    await page.goto("/search?_ff=showSearchV0:true");
+    await page.goto("/search");
 
     const loadingIndicator = page.getByTestId("loading-message");
 
     await fillSearchInputAndSubmit(searchTerm, page);
diff --git a/frontend/tests/e2e/search/search-navigate.spec.ts b/frontend/tests/e2e/search/search-navigate.spec.ts
index 127bd55c40..b911ca504f 100644
--- a/frontend/tests/e2e/search/search-navigate.spec.ts
+++ b/frontend/tests/e2e/search/search-navigate.spec.ts
@@ -1,13 +1,7 @@
 import { expect, Page, test } from "@playwright/test";
 import { BrowserContextOptions } from "playwright-core";
 
-import {
-  clickMobileNavMenu,
-  expectCheckboxIDIsChecked,
-  expectURLContainsQueryParam,
-  getMobileMenuButton,
-  hasMobileMenu,
-} from "./searchSpecUtil";
+import { expectCheckboxIDIsChecked } from "./searchSpecUtil";
 
 interface PageProps {
   page: Page;
@@ -18,25 +12,13 @@
 test("should navigate from index to search page", async ({
   page,
 }: PageProps) => {
-  // Start from the index page with feature flag set
-  await page.goto("/?_ff=showSearchV0:true");
-
-  // Mobile chrome must first click the menu button
-  if (await
hasMobileMenu(page)) { - const menuButton = getMobileMenuButton(page); - await clickMobileNavMenu(menuButton); - } - - await page.click("nav >> text=Search"); + await page.goto("/search"); // Verify the presence of "Search" content on the page await expect(page.locator("h1")).toContainText( "Search funding opportunities", ); - // Verify that the new URL is correct - expectURLContainsQueryParam(page, "status", "forecasted,posted"); - // Verify that the 'forecasted' and 'posted' are checked await expectCheckboxIDIsChecked(page, "#status-forecasted"); await expectCheckboxIDIsChecked(page, "#status-posted"); diff --git a/frontend/tests/e2e/search/search-no-results.spec.ts b/frontend/tests/e2e/search/search-no-results.spec.ts index 9137975104..7c63ec5b6b 100644 --- a/frontend/tests/e2e/search/search-no-results.spec.ts +++ b/frontend/tests/e2e/search/search-no-results.spec.ts @@ -14,14 +14,11 @@ interface PageProps { } test.describe("Search page tests", () => { - test.beforeEach(async ({ page }: PageProps) => { - // Navigate to the search page with the feature flag set - await page.goto("/search?_ff=showSearchV0:true"); - }); - test("should return 0 results when searching for obscure term", async ({ page, }: PageProps) => { + await page.goto("/search"); + const searchTerm = generateRandomString([10]); await fillSearchInputAndSubmit(searchTerm, page); diff --git a/frontend/tests/e2e/search/search.spec.ts b/frontend/tests/e2e/search/search.spec.ts index 8e60b837b7..a1c0b53a62 100644 --- a/frontend/tests/e2e/search/search.spec.ts +++ b/frontend/tests/e2e/search/search.spec.ts @@ -27,20 +27,16 @@ interface PageProps { } test.describe("Search page tests", () => { - test.beforeEach(async ({ page }: PageProps) => { - // Navigate to the search page with the feature flag set - await page.goto("/search?_ff=showSearchV0:true"); - }); - test("should refresh and retain filters in a new tab", async ({ page }, { project, }) => { + await page.goto("/search"); + // Set all inputs, then refresh the page. Those same inputs should be // set from query params. 
    const searchTerm = "education";
     const statusCheckboxes = {
-      "status-forecasted": "forecasted",
-      "status-posted": "posted",
+      "status-closed": "closed",
     };
     const fundingInstrumentCheckboxes = {
       "funding-instrument-cooperative_agreement": "cooperative_agreement",
@@ -70,7 +66,12 @@ test.describe("Search page tests", () => {
 
     await fillSearchInputAndSubmit(searchTerm, page);
 
-    await toggleCheckboxes(page, statusCheckboxes, "status");
+    await toggleCheckboxes(
+      page,
+      statusCheckboxes,
+      "status",
+      "forecasted,posted",
+    );
 
     await clickAccordionWithTitle(page, "Funding instrument");
     await toggleCheckboxes(
@@ -120,6 +121,7 @@ test.describe("Search page tests", () => {
   test("resets page back to 1 when choosing a filter", async ({ page }, {
     project,
   }) => {
+    await page.goto("/search?status=none");
     await clickPaginationPageNumber(page, 2);
 
     // Verify that page 1 is highlighted
@@ -155,6 +157,7 @@ test.describe("Search page tests", () => {
   test("last result becomes first result when flipping sort order", async ({
     page,
   }: PageProps) => {
+    await page.goto("/search");
     await selectSortBy(page, "opportunityTitleDesc");
 
     await clickLastPaginationPage(page);
@@ -173,6 +176,7 @@ test.describe("Search page tests", () => {
   test("number of results is the same with none or all opportunity status checked", async ({
     page,
   }, { project }) => {
+    await page.goto("/search?status=none");
     const initialNumberOfOpportunityResults =
       await getNumberOfOpportunitySearchResults(page);
diff --git a/frontend/tests/e2e/search/searchSpecUtil.ts b/frontend/tests/e2e/search/searchSpecUtil.ts
index f401bc3e40..d44dcdd2c4 100644
--- a/frontend/tests/e2e/search/searchSpecUtil.ts
+++ b/frontend/tests/e2e/search/searchSpecUtil.ts
@@ -100,8 +100,9 @@ export async function toggleCheckboxes(
   page: Page,
   checkboxObject: Record<string, string>,
   queryParamName: string,
+  startingQueryParams?: string,
 ) {
-  let runningQueryParams = "";
+  let runningQueryParams = startingQueryParams ?? "";
   for (const [checkboxID, queryParamValue] of Object.entries(checkboxObject)) {
     await toggleCheckbox(page, checkboxID);
     runningQueryParams += runningQueryParams
diff --git a/frontend/tests/pages/search/page.test.tsx b/frontend/tests/pages/search/page.test.tsx
index 1a2aff9516..fd9d127df1 100644
--- a/frontend/tests/pages/search/page.test.tsx
+++ b/frontend/tests/pages/search/page.test.tsx
@@ -1,6 +1,7 @@
 import { render, screen } from "@testing-library/react";
 import { identity } from "lodash";
 import Search from "src/app/[locale]/search/page";
+import { SEARCH_NO_STATUS_VALUE } from "src/constants/search";
 import { useTranslationsMock } from "src/utils/testing/intlMocks";
 
 // test without feature flag functionality
@@ -79,4 +80,48 @@ describe("Search Route", () => {
     expect(archivedCheckbox).toBeInTheDocument();
     expect(archivedCheckbox).not.toBeChecked();
   });
+
+  it("renders the search page with all opportunities if no status selected", async () => {
+    const mockSearchParams = {
+      status: SEARCH_NO_STATUS_VALUE,
+    };
+    render();
+
+    // None should be checked if the "no status checked" value is present.
+    const statuses = ["forecasted", "posted", "closed", "archived"];
+    for (const status of statuses) {
+      const checkbox = await screen.findByLabelText(
+        `opportunityStatus.label.${status}`,
+      );
+      expect(checkbox).toBeInTheDocument();
+      expect(checkbox).not.toBeChecked();
+    }
+  });
+
+  it("renders the search page with two statuses checked by default", async () => {
+    const mockSearchParams = {
+      status: undefined,
+    };
+    render();
+
+    // These should be checked if no status is present.
+    const clicked = ["forecasted", "posted"];
+    for (const status of clicked) {
+      const checkbox = await screen.findByLabelText(
+        `opportunityStatus.label.${status}`,
+      );
+      expect(checkbox).toBeInTheDocument();
+      expect(checkbox).toBeChecked();
+    }
+
+    // These should not be checked if no status is present.
+    const noClicked = ["closed", "archived"];
+    for (const status of noClicked) {
+      const checkbox = await screen.findByLabelText(
+        `opportunityStatus.label.${status}`,
+      );
+      expect(checkbox).toBeInTheDocument();
+      expect(checkbox).not.toBeChecked();
+    }
+  });
 });

From b64f4190cc9a47a72084a3450b50e173aec3204d Mon Sep 17 00:00:00 2001
From: David Dudas
Date: Tue, 5 Nov 2024 12:46:05 -0800
Subject: [PATCH 06/13] [Issue 2482] Migrate delivery metrics transform and
 load from simpler-grants-sandbox (#2617)

## Summary

Adds new CLI capabilities to (1) initialize the ETL database and (2) transform and load data into the ETL database

Fixes #2482

### Time to review: __10 mins__

## Changes proposed

> What was added, updated, or removed in this PR.

- Creates new dataset `etl_dataset` that can be hydrated from json
- Adds new entry point to CLI: `poetry run analytics etl`
- Exposes new commands `initialize_database` and `transform_and_load`
- Creates new subpackage `integrations/etldb` to encapsulate transform and load logic
- Ported `create table` sql from sandbox repo, updated to be Postgres-friendly

TODO

- [x] DB integration - connect to Postgres
- [x] Finish `initialize_database`
- [x] Port insert/update/select sql from sandbox repo, update it to be Postgres-friendly
- [x] Finish `transform_and_load`
- [x] Fix linter issues
- [x] Write documentation
- [x] Write tests

## Context for reviewers

> Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified.

1. To initialize the ETL database: `poetry run analytics etl initialize_database`
2. To transform and load into the ETL database: `poetry run analytics etl transform_and_load --deliverable-file ./data/test-etl-01.json --effective-date 2024-10-21`

## Additional information

> Screenshots, GIF demos, code examples or output to help show the changes working as expected.
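A rough sketch of the expected console output, assuming a reachable Postgres instance and the sample file from the testing instructions above (the messages mirror the `print` calls in the new CLI entry points; any additional output from `etldb.sync_db` is omitted here):

```bash
$ poetry run analytics etl initialize_database
initializing database
done

$ poetry run analytics etl transform_and_load \
    --deliverable-file ./data/test-etl-01.json \
    --effective-date 2024-10-21
running transform and load with effective date 2024-10-21
transform and load is done
```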
--------- Co-authored-by: widal001 --- analytics/Makefile | 17 +- analytics/src/analytics/cli.py | 51 +- .../src/analytics/datasets/etl_dataset.py | 145 +++++ analytics/src/analytics/datasets/utils.py | 47 ++ analytics/src/analytics/integrations/db.py | 1 - .../analytics/integrations/etldb/__init__.py | 11 + .../integrations/etldb/create_etl_db.sql | 100 +++ .../integrations/etldb/deliverable_model.py | 149 +++++ .../integrations/etldb/epic_model.py | 137 +++++ .../src/analytics/integrations/etldb/etldb.py | 44 ++ .../integrations/etldb/issue_model.py | 202 ++++++ .../src/analytics/integrations/etldb/main.py | 134 ++++ .../integrations/etldb/quad_model.py | 129 ++++ .../integrations/etldb/sprint_model.py | 132 ++++ analytics/tests/datasets/test_etldb.py | 83 +++ analytics/tests/etldb_test_01.json | 574 ++++++++++++++++++ analytics/tests/test_cli.py | 67 ++ documentation/analytics/usage.md | 14 + 18 files changed, 2034 insertions(+), 3 deletions(-) create mode 100644 analytics/src/analytics/datasets/etl_dataset.py create mode 100644 analytics/src/analytics/integrations/etldb/__init__.py create mode 100644 analytics/src/analytics/integrations/etldb/create_etl_db.sql create mode 100644 analytics/src/analytics/integrations/etldb/deliverable_model.py create mode 100644 analytics/src/analytics/integrations/etldb/epic_model.py create mode 100644 analytics/src/analytics/integrations/etldb/etldb.py create mode 100644 analytics/src/analytics/integrations/etldb/issue_model.py create mode 100644 analytics/src/analytics/integrations/etldb/main.py create mode 100644 analytics/src/analytics/integrations/etldb/quad_model.py create mode 100644 analytics/src/analytics/integrations/etldb/sprint_model.py create mode 100644 analytics/tests/datasets/test_etldb.py create mode 100644 analytics/tests/etldb_test_01.json diff --git a/analytics/Makefile b/analytics/Makefile index 036fdfc6b9..a9b1bd07d4 100644 --- a/analytics/Makefile +++ b/analytics/Makefile @@ -12,12 +12,13 @@ PROJECT_CONFIG_FILE ?= $(CONFIG_DIR)/github-projects.json ISSUE_FILE ?= $(OUTPUT_DIR)/delivery-data.json SPRINT ?= @current # Names of the points and sprint fields in the GitHub project -POINTS_FIELD ?= Points +POINTS_FIELD ?= Story Points SPRINT_FIELD ?= Sprint UNIT ?= points ACTION ?= show-results MIN_TEST_COVERAGE ?= 80 APP_NAME ?= grants-analytics +EFFECTIVE_DATE ?= $(shell date +"%Y-%m-%d") # Required for CI to work properly SHELL = /bin/bash -o pipefail @@ -143,6 +144,20 @@ lint: ## runs code quality checks # Data Commands # ################# +init-db: + @echo "=> Initializing the database schema" + @echo "=====================================================" + $(POETRY) analytics etl initialize_database + @echo "=====================================================" + +gh-transform-and-load: + @echo "=> Transforming and loading GitHub data into the database" + @echo "=====================================================" + $(POETRY) analytics etl transform_and_load \ + --deliverable-file $(DELIVERY_FILE) \ + --effective-date $(EFFECTIVE_DATE) + @echo "=====================================================" + gh-db-data-import: @echo "=> Importing sprint data to the database" @echo "=====================================================" diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 37bea4334f..1d9bde629f 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -1,7 +1,9 @@ # pylint: disable=C0415 """Expose a series of CLI entrypoints for the analytics package.""" + import 
logging import logging.config +from datetime import datetime from pathlib import Path from typing import Annotated, Optional @@ -9,10 +11,11 @@ from slack_sdk import WebClient from sqlalchemy import text +from analytics.datasets.etl_dataset import EtlDataset from analytics.datasets.issues import GitHubIssues from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL from analytics.etl.utils import load_config -from analytics.integrations import db, slack +from analytics.integrations import db, etldb, slack from analytics.metrics.base import BaseMetric, Unit from analytics.metrics.burndown import SprintBurndown from analytics.metrics.burnup import SprintBurnup @@ -37,6 +40,8 @@ STATUS_ARG = typer.Option( help="Deliverable status to include in report, can be passed multiple times", ) +DELIVERABLE_FILE_ARG = typer.Option(help="Path to file with exported deliverable data") +EFFECTIVE_DATE_ARG = typer.Option(help="YYYY-MM-DD effective date to apply to each imported row") # fmt: on # instantiate the main CLI entrypoint @@ -45,10 +50,12 @@ export_app = typer.Typer() metrics_app = typer.Typer() import_app = typer.Typer() +etl_app = typer.Typer() # add sub-commands to main entrypoint app.add_typer(export_app, name="export", help="Export data needed to calculate metrics") app.add_typer(metrics_app, name="calculate", help="Calculate key project metrics") app.add_typer(import_app, name="import", help="Import data into the database") +app.add_typer(etl_app, name="etl", help="Transform and load local file") @app.callback() @@ -240,3 +247,45 @@ def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> No ) rows = len(issues.to_dict()) logger.info("Number of rows in table: %s", rows) + + +# =========================================================== +# Etl commands +# =========================================================== + + +@etl_app.command(name="initialize_database") +def initialize_database() -> None: + """Initialize etl database.""" + print("initializing database") + etldb.init_db() + print("done") + + +@etl_app.command(name="transform_and_load") +def transform_and_load( + deliverable_file: Annotated[str, DELIVERABLE_FILE_ARG], + effective_date: Annotated[str, EFFECTIVE_DATE_ARG], +) -> None: + """Transform and load etl data.""" + # validate effective date arg + try: + dateformat = "%Y-%m-%d" + datestamp = ( + datetime.strptime(effective_date, dateformat) + .astimezone() + .strftime(dateformat) + ) + print(f"running transform and load with effective date {datestamp}") + except ValueError: + print("FATAL ERROR: malformed effective date, expected YYYY-MM-DD format") + return + + # hydrate a dataset instance from the input data + dataset = EtlDataset.load_from_json_file(file_path=deliverable_file) + + # sync data to db + etldb.sync_db(dataset, datestamp) + + # finish + print("transform and load is done") diff --git a/analytics/src/analytics/datasets/etl_dataset.py b/analytics/src/analytics/datasets/etl_dataset.py new file mode 100644 index 0000000000..8469ab6e42 --- /dev/null +++ b/analytics/src/analytics/datasets/etl_dataset.py @@ -0,0 +1,145 @@ +""" +Implement the EtlDataset class. + +This is a sub-class of BaseDataset that models +quad, deliverable, epic, issue, and sprint data. 
+""" + +from enum import Enum +from typing import Any, Self + +import pandas as pd +from numpy.typing import NDArray + +from analytics.datasets.base import BaseDataset +from analytics.datasets.utils import load_json_data_as_df + + +class EtlEntityType(Enum): + """Define entity types in the db schema.""" + + DELIVERABLE = "deliverable" + EPIC = "epic" + ISSUE = "issue" + SPRINT = "sprint" + QUAD = "quad" + + +class EtlDataset(BaseDataset): + """Encapsulate data exported from github.""" + + COLUMN_MAP = { + "deliverable_url": "deliverable_ghid", + "deliverable_title": "deliverable_title", + "deliverable_pillar": "deliverable_pillar", + "epic_url": "epic_ghid", + "epic_title": "epic_title", + "issue_url": "issue_ghid", + "issue_title": "issue_title", + "issue_parent": "issue_parent", + "issue_type": "issue_type", + "issue_is_closed": "issue_is_closed", + "issue_opened_at": "issue_opened_at", + "issue_closed_at": "issue_closed_at", + "issue_points": "issue_points", + "issue_status": "issue_status", + "sprint_id": "sprint_ghid", + "sprint_name": "sprint_name", + "sprint_start": "sprint_start", + "sprint_length": "sprint_length", + "sprint_end": "sprint_end", + "quad_id": "quad_ghid", + "quad_name": "quad_name", + "quad_start": "quad_start", + "quad_length": "quad_length", + "quad_end": "quad_end", + } + + @classmethod + def load_from_json_file(cls, file_path: str) -> Self: + """ + Load the input json file and instantiates an instance of EtlDataset. + + Parameters + ---------- + file_path: str + Path to the local json file containing data exported from GitHub + + Returns + ------- + Self: + An instance of the EtlDataset dataset class + """ + # load input datasets + df = load_json_data_as_df( + file_path=file_path, + column_map=cls.COLUMN_MAP, + date_cols=None, + ) + + # transform entity id columns + prefix = "https://github.com/" + for col in ("deliverable_ghid", "epic_ghid", "issue_ghid", "issue_parent"): + df[col] = df[col].str.replace(prefix, "") + + return cls(df) + + # QUAD getters + + def get_quad(self, quad_ghid: str) -> pd.Series: + """Fetch data about a given quad.""" + query_string = f"quad_ghid == '{quad_ghid}'" + return self.df.query(query_string).iloc[0] + + def get_quad_ghids(self) -> NDArray[Any]: + """Fetch an array of unique non-null quad ghids.""" + df = self.df[self.df.quad_ghid.notna()] + return df.quad_ghid.unique() + + # DELIVERABLE getters + + def get_deliverable(self, deliverable_ghid: str) -> pd.Series: + """Fetch data about a given deliverable.""" + query_string = f"deliverable_ghid == '{deliverable_ghid}'" + return self.df.query(query_string).iloc[0] + + def get_deliverable_ghids(self) -> NDArray[Any]: + """Fetch an array of unique non-null deliverable ghids.""" + df = self.df[self.df.deliverable_ghid.notna()] + return df.deliverable_ghid.unique() + + # SPRINT getters + + def get_sprint(self, sprint_ghid: str) -> pd.Series: + """Fetch data about a given sprint.""" + query_string = f"sprint_ghid == '{sprint_ghid}'" + return self.df.query(query_string).iloc[0] + + def get_sprint_ghids(self) -> NDArray[Any]: + """Fetch an array of unique non-null sprint ghids.""" + df = self.df[self.df.sprint_ghid.notna()] + return df.sprint_ghid.unique() + + # EPIC getters + + def get_epic(self, epic_ghid: str) -> pd.Series: + """Fetch data about a given epic.""" + query_string = f"epic_ghid == '{epic_ghid}'" + return self.df.query(query_string).iloc[0] + + def get_epic_ghids(self) -> NDArray[Any]: + """Fetch an array of unique non-null epic ghids.""" + df = 
+        return df.epic_ghid.unique()
+
+    # ISSUE getters
+
+    def get_issue(self, issue_ghid: str) -> pd.Series:
+        """Fetch data about a given issue."""
+        query_string = f"issue_ghid == '{issue_ghid}'"
+        return self.df.query(query_string).iloc[0]
+
+    def get_issue_ghids(self) -> NDArray[Any]:
+        """Fetch an array of unique non-null issue ghids."""
+        df = self.df[self.df.issue_ghid.notna()]
+        return df.issue_ghid.unique()
diff --git a/analytics/src/analytics/datasets/utils.py b/analytics/src/analytics/datasets/utils.py
index e48f67541d..e6efcae545 100644
--- a/analytics/src/analytics/datasets/utils.py
+++ b/analytics/src/analytics/datasets/utils.py
@@ -2,6 +2,53 @@
 
 import json
 
+import pandas as pd
+
+
+def load_json_data_as_df(
+    file_path: str,
+    column_map: dict,
+    date_cols: list[str] | None = None,
+    key_for_nested_items: str | None = None,
+) -> pd.DataFrame:
+    """
+    Load a file that contains JSON data and format it as a DataFrame.
+
+    Parameters
+    ----------
+    file_path: str
+        Path to the JSON file with the exported issue data
+    column_map: dict
+        Dictionary mapping of existing JSON keys to their new column names
+    date_cols: list[str] | None
+        List of columns that need to be converted to date types
+    key_for_nested_items: str | None
+        Name of the key containing a list of objects to load as a dataframe.
+        Only needed if the JSON loaded is an object instead of a list
+
+    Returns
+    -------
+    pd.DataFrame
+        Pandas dataframe with columns renamed to match the values of the column map
+    """
+    # load json data from the local file
+    with open(file_path, encoding="utf-8") as f:
+        json_data = json.loads(f.read())
+
+    # if the items we want to convert are nested under a key, extract them
+    if key_for_nested_items:
+        json_data = json_data[key_for_nested_items]
+
+    # flatten the nested json into a dataframe
+    df = pd.json_normalize(json_data)
+
+    # reorder and rename the columns
+    df = df[column_map.keys()]
+    df = df.rename(columns=column_map)
+
+    # convert datetime columns to date
+    if date_cols:
+        for col in date_cols:
+            # strip off the timestamp portion of the date
+            df[col] = pd.to_datetime(df[col]).dt.floor("d")
+
+    return df
+
 
 def load_json_file(path: str) -> list[dict]:
     """Load contents of a JSON file into a dictionary."""
diff --git a/analytics/src/analytics/integrations/db.py b/analytics/src/analytics/integrations/db.py
index 89bdeaa09e..e3314ec0bd 100644
--- a/analytics/src/analytics/integrations/db.py
+++ b/analytics/src/analytics/integrations/db.py
@@ -22,7 +22,6 @@ def get_db() -> Engine:
         A SQLAlchemy engine object representing the connection to the database.
""" db = get_db_settings() - print(f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}") return create_engine( f"postgresql+psycopg://{db.user}:{db.password}@{db.db_host}:{db.port}", pool_pre_ping=True, diff --git a/analytics/src/analytics/integrations/etldb/__init__.py b/analytics/src/analytics/integrations/etldb/__init__.py new file mode 100644 index 0000000000..c1afd09469 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/__init__.py @@ -0,0 +1,11 @@ +"""Read and write data from/to delivery metrics database.""" + +__all__ = [ + "init_db", + "sync_db", +] + +from analytics.integrations.etldb.main import ( + init_db, + sync_db, +) diff --git a/analytics/src/analytics/integrations/etldb/create_etl_db.sql b/analytics/src/analytics/integrations/etldb/create_etl_db.sql new file mode 100644 index 0000000000..304c4c95b0 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/create_etl_db.sql @@ -0,0 +1,100 @@ +CREATE TABLE IF NOT EXISTS gh_deliverable ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + pillar TEXT, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_deliverable_quad_map ( + id SERIAL PRIMARY KEY, + deliverable_id INTEGER NOT NULL, + quad_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(deliverable_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_dqm_i1 on gh_deliverable_quad_map(quad_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_epic ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_epic_deliverable_map ( + id SERIAL PRIMARY KEY, + epic_id INTEGER NOT NULL, + deliverable_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(epic_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_edm_i1 on gh_epic_deliverable_map(deliverable_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_issue ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + type TEXT NOT NULL, + opened_date DATE, + closed_date DATE, + parent_issue_ghid TEXT, + epic_id INTEGER, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); +CREATE INDEX IF NOT EXISTS gh_issue_i1 on gh_issue(epic_id); + +CREATE TABLE IF NOT EXISTS gh_issue_history ( + id SERIAL PRIMARY KEY, + issue_id INTEGER NOT NULL, + status TEXT, + is_closed INTEGER NOT NULL, + points INTEGER NOT NULL DEFAULT 0, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(issue_id, d_effective) +); +CREATE INDEX IF NOT EXISTS gh_ih_i1 on gh_issue_history(issue_id, d_effective); + +CREATE TABLE IF NOT EXISTS gh_issue_sprint_map ( + id SERIAL PRIMARY KEY, + issue_id INTEGER NOT NULL, + sprint_id INTEGER, + d_effective DATE NOT NULL, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP, + UNIQUE(issue_id, d_effective) +); + +CREATE TABLE IF NOT EXISTS gh_sprint ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, + start_date DATE, + end_date DATE, + duration INTEGER, + quad_id INTEGER, + t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + t_modified TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS gh_quad ( + id SERIAL PRIMARY KEY, + ghid TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, + start_date DATE, + end_date DATE, + 
duration INTEGER,
+    t_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    t_modified TIMESTAMP
+);
+CREATE INDEX IF NOT EXISTS gh_quad_i1 on gh_quad(start_date);
+
diff --git a/analytics/src/analytics/integrations/etldb/deliverable_model.py b/analytics/src/analytics/integrations/etldb/deliverable_model.py
new file mode 100644
index 0000000000..0f0d8cd35f
--- /dev/null
+++ b/analytics/src/analytics/integrations/etldb/deliverable_model.py
@@ -0,0 +1,149 @@
+"""Define EtlDeliverableModel class to encapsulate db CRUD operations."""
+
+from pandas import Series
+from sqlalchemy import text
+
+from analytics.datasets.etl_dataset import EtlEntityType
+from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb
+
+
+class EtlDeliverableModel:
+    """Encapsulate CRUD operations for deliverable entity."""
+
+    def __init__(self, dbh: EtlDb) -> None:
+        """Instantiate a class instance."""
+        self.dbh = dbh
+
+    def sync_deliverable(
+        self,
+        deliverable_df: Series,
+        ghid_map: dict,
+    ) -> tuple[int | None, EtlChangeType]:
+        """Write deliverable data to etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # insert dimensions
+        deliverable_id = self._insert_dimensions(deliverable_df)
+        if deliverable_id is not None:
+            change_type = EtlChangeType.INSERT
+
+        # if insert failed, select and update
+        if deliverable_id is None:
+            deliverable_id, change_type = self._update_dimensions(deliverable_df)
+
+        # insert facts
+        if deliverable_id is not None:
+            self._insert_facts(deliverable_id, deliverable_df, ghid_map)
+
+        return deliverable_id, change_type
+
+    def _insert_dimensions(self, deliverable_df: Series) -> int | None:
+        """Write deliverable dimension data to etl database."""
+        # insert into dimension table: deliverable
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_deliverable(ghid, title, pillar) "
+                "values (:ghid, :title, :pillar) "
+                "on conflict(ghid) do nothing returning id",
+            ),
+            {
+                "ghid": deliverable_df["deliverable_ghid"],
+                "title": deliverable_df["deliverable_title"],
+                "pillar": deliverable_df["deliverable_pillar"],
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _insert_facts(
+        self,
+        deliverable_id: int,
+        deliverable_df: Series,
+        ghid_map: dict,
+    ) -> int | None:
+        """Write deliverable fact data to etl database."""
+        # insert into fact table: deliverable_quad_map
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_deliverable_quad_map(deliverable_id, quad_id, d_effective) "
+                "values (:deliverable_id, :quad_id, :effective) "
+                "on conflict(deliverable_id, d_effective) do update "
+                "set (quad_id, t_modified) = (:quad_id, current_timestamp) returning id",
+            ),
+            {
+                "deliverable_id": deliverable_id,
+                "quad_id": ghid_map[EtlEntityType.QUAD].get(
+                    deliverable_df["quad_ghid"],
+                ),
+                "effective": self.dbh.effective_date,
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _update_dimensions(
+        self,
+        deliverable_df: Series,
+    ) -> tuple[int | None, EtlChangeType]:
+        """Update deliverable dimension data in etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # get new values
+        new_title = deliverable_df["deliverable_title"]
+        new_pillar = deliverable_df["deliverable_pillar"]
+        new_values = (new_title, new_pillar)
+
+        # select old values
+
deliverable_id, old_title, old_pillar = self._select( + deliverable_df["deliverable_ghid"], + ) + old_values = (old_title, old_pillar) + + # compare + if deliverable_id is not None and new_values != old_values: + change_type = EtlChangeType.UPDATE + cursor = self.dbh.connection() + update_sql = text( + "update gh_deliverable set title = :new_title, pillar = :new_pillar, " + "t_modified = current_timestamp where id = :deliverable_id", + ) + update_values = { + "new_title": new_title, + "new_pillar": new_pillar, + "deliverable_id": deliverable_id, + } + cursor.execute(update_sql, update_values) + self.dbh.commit(cursor) + + return deliverable_id, change_type + + def _select(self, ghid: str) -> tuple[int | None, str | None, str | None]: + """Select deliverable data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text("select id, title, pillar from gh_deliverable where ghid = :ghid"), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1], row[2] + + return None, None, None diff --git a/analytics/src/analytics/integrations/etldb/epic_model.py b/analytics/src/analytics/integrations/etldb/epic_model.py new file mode 100644 index 0000000000..af0fdf45d8 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/epic_model.py @@ -0,0 +1,137 @@ +"""Defines EtlEpicModel class to encapsulate db CRUD operations.""" + +from pandas import Series +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlEntityType +from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb + + +class EtlEpicModel: + """Encapsulate CRUD operations for epic entity.""" + + def __init__(self, dbh: EtlDb) -> None: + """Instantiate a class instance.""" + self.dbh = dbh + + def sync_epic( + self, + epic_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Write epic data to etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # insert dimensions + epic_id = self._insert_dimensions(epic_df) + if epic_id is not None: + change_type = EtlChangeType.INSERT + + # if insert failed, select and update + if epic_id is None: + epic_id, change_type = self._update_dimensions(epic_df) + + # insert facts + if epic_id is not None: + self._insert_facts(epic_id, epic_df, ghid_map) + + return epic_id, change_type + + def _insert_dimensions(self, epic_df: Series) -> int | None: + """Write epic dimension data to etl database.""" + # insert into dimension table: epic + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_epic(ghid, title) values (:ghid, :title) " + "on conflict(ghid) do nothing returning id", + ), + { + "ghid": epic_df["epic_ghid"], + "title": epic_df["epic_title"], + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _insert_facts( + self, + epic_id: int, + epic_df: Series, + ghid_map: dict, + ) -> int | None: + """Write epic fact data to etl database.""" + # insert into fact table: epic_deliverable_map + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_epic_deliverable_map(epic_id, deliverable_id, d_effective) " + "values (:epic_id, :deliverable_id, :effective) " + "on conflict(epic_id, d_effective) do update " + "set (deliverable_id, t_modified) = (:deliverable_id, current_timestamp) " + "returning id", + ), + { + "deliverable_id": ghid_map[EtlEntityType.DELIVERABLE].get( + 
epic_df["deliverable_ghid"], + ), + "epic_id": epic_id, + "effective": self.dbh.effective_date, + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _update_dimensions(self, epic_df: Series) -> tuple[int | None, EtlChangeType]: + """Update epic dimension data in etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # get new values + new_title = epic_df["epic_title"] + + # select old values + epic_id, old_title = self._select(epic_df["epic_ghid"]) + + # compare + if epic_id is not None and (new_title,) != (old_title,): + change_type = EtlChangeType.UPDATE + cursor = self.dbh.connection() + cursor.execute( + text( + "update gh_epic set title = :new_title, t_modified = current_timestamp " + "where id = :epic_id", + ), + {"new_title": new_title, "epic_id": epic_id}, + ) + self.dbh.commit(cursor) + + return epic_id, change_type + + def _select(self, ghid: str) -> tuple[int | None, str | None]: + """Select epic data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text("select id, title from gh_epic where ghid = :ghid"), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1] + + return None, None diff --git a/analytics/src/analytics/integrations/etldb/etldb.py b/analytics/src/analytics/integrations/etldb/etldb.py new file mode 100644 index 0000000000..7a25faed39 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/etldb.py @@ -0,0 +1,44 @@ +"""Define EtlDb as an abstraction layer for database connections.""" + +from enum import Enum + +from sqlalchemy import Connection + +from analytics.integrations import db + + +class EtlDb: + """Encapsulate etl database connections.""" + + def __init__(self, effective: str | None = None) -> None: + """Construct instance.""" + self._db_engine = db.get_db() + self._connection: Connection | None = None + self.effective_date = effective + self.dateformat = "%Y-%m-%d" + + def __del__(self) -> None: + """Destroy instance.""" + self.disconnect() + + def connection(self) -> Connection: + """Get a connection object from the db engine.""" + if self._connection is None: + self._connection = self._db_engine.connect() + return self._connection + + def commit(self, connection: Connection) -> None: + """Commit an open transaction.""" + connection.commit() + + def disconnect(self) -> None: + """Dispose of db connection.""" + self._db_engine.dispose() + + +class EtlChangeType(Enum): + """An enum to describe ETL change types.""" + + NONE = 0 + INSERT = 1 + UPDATE = 2 diff --git a/analytics/src/analytics/integrations/etldb/issue_model.py b/analytics/src/analytics/integrations/etldb/issue_model.py new file mode 100644 index 0000000000..36740438d5 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/issue_model.py @@ -0,0 +1,202 @@ +"""Define EtlIssueModel class to encapsulate db CRUD operations.""" + +from datetime import datetime + +from pandas import Series +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlEntityType +from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb + + +class EtlIssueModel: + """Encapsulate CRUD operations for issue entity.""" + + def __init__(self, dbh: EtlDb) -> None: + """Instantiate a class instance.""" + self.dbh = dbh + + def sync_issue( + self, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Write issue data to etl database.""" + # initialize return value + change_type = 
EtlChangeType.NONE + + # insert dimensions + issue_id = self._insert_dimensions(issue_df, ghid_map) + if issue_id is not None: + change_type = EtlChangeType.INSERT + + # if insert failed, select and update + if issue_id is None: + issue_id, change_type = self._update_dimensions(issue_df, ghid_map) + + # insert facts + if issue_id is not None: + self._insert_facts(issue_id, issue_df, ghid_map) + + return issue_id, change_type + + def _insert_dimensions(self, issue_df: Series, ghid_map: dict) -> int | None: + """Write issue dimension data to etl database.""" + # insert into dimension table: issue + new_row_id = None + cursor = self.dbh.connection() + result = cursor.execute( + text( + "insert into gh_issue " + "(ghid, title, type, opened_date, closed_date, parent_issue_ghid, epic_id) " + "values (:ghid, :title, :type, :opened_date, :closed_date, :parent_ghid, :epic_id) " + "on conflict(ghid) do nothing returning id", + ), + { + "ghid": issue_df["issue_ghid"], + "title": issue_df["issue_title"], + "type": issue_df["issue_type"] or "None", + "opened_date": issue_df["issue_opened_at"], + "closed_date": issue_df["issue_closed_at"], + "parent_ghid": issue_df["issue_parent"], + "epic_id": ghid_map[EtlEntityType.EPIC].get(issue_df["epic_ghid"]), + }, + ) + row = result.fetchone() + if row: + new_row_id = row[0] + + # commit + self.dbh.commit(cursor) + + return new_row_id + + def _insert_facts( + self, + issue_id: int, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, int | None]: + """Write issue fact data to etl database.""" + # get values needed for sql statement + issue_df = issue_df.fillna(0) + insert_values = { + "issue_id": issue_id, + "status": issue_df["issue_status"], + "is_closed": int(issue_df["issue_is_closed"]), + "points": issue_df["issue_points"], + "sprint_id": ghid_map[EtlEntityType.SPRINT].get(issue_df["sprint_ghid"]), + "effective": self.dbh.effective_date, + } + history_id = None + map_id = None + + # insert into fact table: issue_history + cursor = self.dbh.connection() + insert_sql1 = text( + "insert into gh_issue_history (issue_id, status, is_closed, points, d_effective) " + "values (:issue_id, :status, :is_closed, :points, :effective) " + "on conflict (issue_id, d_effective) " + "do update set (status, is_closed, points, t_modified) = " + "(:status, :is_closed, :points, current_timestamp) " + "returning id", + ) + result1 = cursor.execute(insert_sql1, insert_values) + row1 = result1.fetchone() + if row1: + history_id = row1[0] + + # insert into fact table: issue_sprint_map + insert_sql2 = text( + "insert into gh_issue_sprint_map (issue_id, sprint_id, d_effective) " + "values (:issue_id, :sprint_id, :effective) " + "on conflict (issue_id, d_effective) " + "do update set (sprint_id, t_modified) = " + "(:sprint_id, current_timestamp) returning id", + ) + result2 = cursor.execute(insert_sql2, insert_values) + row2 = result2.fetchone() + if row2: + map_id = row2[0] + + # commit + self.dbh.commit(cursor) + + return history_id, map_id + + def _update_dimensions( + self, + issue_df: Series, + ghid_map: dict, + ) -> tuple[int | None, EtlChangeType]: + """Update issue dimension data in etl database.""" + # initialize return value + change_type = EtlChangeType.NONE + + # get new values + new_values = ( + issue_df["issue_title"], + issue_df["issue_type"] or "None", + issue_df["issue_opened_at"], + issue_df["issue_closed_at"], + issue_df["issue_parent"], + ghid_map[EtlEntityType.EPIC].get(issue_df["epic_ghid"]), + ) + + # select old values + issue_id, o_title, o_type, 
o_opened, o_closed, o_parent, o_epic_id = ( + self._select(issue_df["issue_ghid"]) + ) + old_values = (o_title, o_type, o_opened, o_closed, o_parent, o_epic_id) + + # compare + if issue_id is not None and new_values != old_values: + change_type = EtlChangeType.UPDATE + cursor = self.dbh.connection() + cursor.execute( + text( + "update gh_issue set " + "title = :new_title, type = :new_type, opened_date = :new_opened, " + "closed_date = :new_closed, parent_issue_ghid = :new_parent, " + "epic_id = :new_epic_id, t_modified = current_timestamp " + "where id = :issue_id", + ), + { + "new_title": issue_df["issue_title"], + "new_type": issue_df["issue_type"] or "None", + "new_opened": issue_df["issue_opened_at"], + "new_closed": issue_df["issue_closed_at"], + "new_parent": issue_df["issue_parent"], + "new_epic_id": ghid_map[EtlEntityType.EPIC].get( + issue_df["epic_ghid"], + ), + "issue_id": issue_id, + }, + ) + self.dbh.commit(cursor) + + return issue_id, change_type + + def _select(self, ghid: str) -> tuple[ + int | None, + str | None, + str | None, + datetime | None, + datetime | None, + str | None, + int | None, + ]: + """Select issue data from etl database.""" + cursor = self.dbh.connection() + result = cursor.execute( + text( + "select id, title, type, opened_date, closed_date, parent_issue_ghid, epic_id " + "from gh_issue where ghid = :ghid", + ), + {"ghid": ghid}, + ) + row = result.fetchone() + if row: + return row[0], row[1], row[2], row[3], row[4], row[5], row[6] + + return None, None, None, None, None, None, None diff --git a/analytics/src/analytics/integrations/etldb/main.py b/analytics/src/analytics/integrations/etldb/main.py new file mode 100644 index 0000000000..11f790bda8 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/main.py @@ -0,0 +1,134 @@ +"""Integrate with database to read and write etl data.""" + +from pathlib import Path + +from sqlalchemy import text + +from analytics.datasets.etl_dataset import EtlDataset, EtlEntityType +from analytics.integrations.etldb.deliverable_model import EtlDeliverableModel +from analytics.integrations.etldb.epic_model import EtlEpicModel +from analytics.integrations.etldb.etldb import EtlDb +from analytics.integrations.etldb.issue_model import EtlIssueModel +from analytics.integrations.etldb.quad_model import EtlQuadModel +from analytics.integrations.etldb.sprint_model import EtlSprintModel + +VERBOSE = False + + +def init_db() -> None: + """Initialize etl database.""" + # define the path to the sql file + parent_path = Path(__file__).resolve().parent + sql_path = f"{parent_path}/create_etl_db.sql" + + # read sql file + with open(sql_path) as f: + sql = f.read() + + # execute sql + db = EtlDb() + cursor = db.connection() + cursor.execute( + text(sql), + ) + db.commit(cursor) + + +def sync_db(dataset: EtlDataset, effective: str) -> None: + """Write github data to etl database.""" + # initialize a map of github id to db row id + ghid_map: dict[EtlEntityType, dict[str, int]] = { + EtlEntityType.DELIVERABLE: {}, + EtlEntityType.EPIC: {}, + EtlEntityType.SPRINT: {}, + EtlEntityType.QUAD: {}, + } + + # initialize db connection + db = EtlDb(effective) + + # sync quad data to db resulting in row id for each quad + ghid_map[EtlEntityType.QUAD] = sync_quads(db, dataset) + print(f"quad row(s) processed: {len(ghid_map[EtlEntityType.QUAD])}") + + # sync deliverable data to db resulting in row id for each deliverable + ghid_map[EtlEntityType.DELIVERABLE] = sync_deliverables( + db, + dataset, + ghid_map, + ) + print(f"deliverable row(s) 
processed: {len(ghid_map[EtlEntityType.DELIVERABLE])}") + + # sync sprint data to db resulting in row id for each sprint + ghid_map[EtlEntityType.SPRINT] = sync_sprints(db, dataset, ghid_map) + print(f"sprint row(s) processed: {len(ghid_map[EtlEntityType.SPRINT])}") + + # sync epic data to db resulting in row id for each epic + ghid_map[EtlEntityType.EPIC] = sync_epics(db, dataset, ghid_map) + print(f"epic row(s) processed: {len(ghid_map[EtlEntityType.EPIC])}") + + # sync issue data to db resulting in row id for each issue + issue_map = sync_issues(db, dataset, ghid_map) + print(f"issue row(s) processed: {len(issue_map)}") + + +def sync_deliverables(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict: + """Insert or update (if necessary) a row for each deliverable and return a map of row ids.""" + result = {} + model = EtlDeliverableModel(db) + for ghid in dataset.get_deliverable_ghids(): + deliverable_df = dataset.get_deliverable(ghid) + result[ghid], _ = model.sync_deliverable(deliverable_df, ghid_map) + if VERBOSE: + print(f"DELIVERABLE '{ghid}' row_id = {result[ghid]}") + return result + + +def sync_epics(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict: + """Insert or update (if necessary) a row for each epic and return a map of row ids.""" + result = {} + model = EtlEpicModel(db) + for ghid in dataset.get_epic_ghids(): + epic_df = dataset.get_epic(ghid) + result[ghid], _ = model.sync_epic(epic_df, ghid_map) + if VERBOSE: + print(f"EPIC '{ghid}' row_id = {result[ghid]}") + return result + + +def sync_issues(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict: + """Insert or update (if necessary) a row for each issue and return a map of row ids.""" + result = {} + model = EtlIssueModel(db) + for ghid in dataset.get_issue_ghids(): + issue_df = dataset.get_issue(ghid) + result[ghid], _ = model.sync_issue(issue_df, ghid_map) + if VERBOSE: + print(f"ISSUE '{ghid}' issue_id = {result[ghid]}") + return result + + +def sync_sprints(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict: + """Insert or update (if necessary) a row for each sprint and return a map of row ids.""" + result = {} + model = EtlSprintModel(db) + for ghid in dataset.get_sprint_ghids(): + sprint_df = dataset.get_sprint(ghid) + result[ghid], _ = model.sync_sprint(sprint_df, ghid_map) + if VERBOSE: + print(f"SPRINT '{ghid}' row_id = {result[ghid]}") + return result + + +def sync_quads(db: EtlDb, dataset: EtlDataset) -> dict: + """Insert or update (if necessary) a row for each quad and return a map of row ids.""" + result = {} + model = EtlQuadModel(db) + for ghid in dataset.get_quad_ghids(): + quad_df = dataset.get_quad(ghid) + result[ghid], _ = model.sync_quad(quad_df) + if VERBOSE: + print( + f"QUAD '{ghid}' title = '{quad_df['quad_name']}', row_id = {result[ghid]}", + ) + return result diff --git a/analytics/src/analytics/integrations/etldb/quad_model.py b/analytics/src/analytics/integrations/etldb/quad_model.py new file mode 100644 index 0000000000..6324710ec9 --- /dev/null +++ b/analytics/src/analytics/integrations/etldb/quad_model.py @@ -0,0 +1,129 @@ +"""Defines EtlQuadModel class to encapsulate db CRUD operations.""" + +from datetime import datetime + +from pandas import Series +from sqlalchemy import text + +from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb + + +class EtlQuadModel: + """Encapsulates CRUD operations for quad entity.""" + + def __init__(self, dbh: EtlDb) -> None: + """Instantiate a class instance.""" + self.dbh = dbh + + def sync_quad(self, quad_df: Series) -> 
tuple[int | None, EtlChangeType]:
+        """Write quad data to etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # insert dimensions
+        quad_id = self._insert_dimensions(quad_df)
+        if quad_id is not None:
+            change_type = EtlChangeType.INSERT
+
+        # if insert failed, then select and update
+        if quad_id is None:
+            quad_id, change_type = self._update_dimensions(quad_df)
+
+        return quad_id, change_type
+
+    def _insert_dimensions(self, quad_df: Series) -> int | None:
+        """Write quad dimension data to etl database."""
+        # insert into dimension table: quad
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_quad(ghid, name, start_date, end_date, duration) "
+                "values (:ghid, :name, :start_date, :end_date, :duration) "
+                "on conflict(ghid) do nothing returning id",
+            ),
+            {
+                "ghid": quad_df["quad_ghid"],
+                "name": quad_df["quad_name"],
+                "start_date": quad_df["quad_start"],
+                "end_date": quad_df["quad_end"],
+                "duration": quad_df["quad_length"],
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _update_dimensions(self, quad_df: Series) -> tuple[int | None, EtlChangeType]:
+        """Update quad dimension data in etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # get new values
+        new_values = (
+            quad_df["quad_name"],
+            quad_df["quad_start"],
+            quad_df["quad_end"],
+            int(quad_df["quad_length"]),
+        )
+
+        # select old values
+        quad_id, old_name, old_start, old_end, old_duration = self._select(
+            quad_df["quad_ghid"],
+        )
+        old_values = (
+            old_name,
+            old_start.strftime(self.dbh.dateformat) if old_start is not None else None,
+            old_end.strftime(self.dbh.dateformat) if old_end is not None else None,
+            old_duration,
+        )
+
+        # compare
+        if quad_id is not None and new_values != old_values:
+            change_type = EtlChangeType.UPDATE
+            cursor = self.dbh.connection()
+            cursor.execute(
+                text(
+                    "update gh_quad set name = :new_name, "
+                    "start_date = :new_start, end_date = :new_end, "
+                    "duration = :new_duration, t_modified = current_timestamp "
+                    "where id = :quad_id",
+                ),
+                {
+                    "new_name": new_values[0],
+                    "new_start": new_values[1],
+                    "new_end": new_values[2],
+                    "new_duration": new_values[3],
+                    "quad_id": quad_id,
+                },
+            )
+            self.dbh.commit(cursor)
+
+        return quad_id, change_type
+
+    def _select(self, ghid: str) -> tuple[
+        int | None,
+        str | None,
+        datetime | None,
+        datetime | None,
+        int | None,
+    ]:
+        """Select quad data from etl database."""
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "select id, name, start_date, end_date, duration "
+                "from gh_quad where ghid = :ghid",
+            ),
+            {"ghid": ghid},
+        )
+        row = result.fetchone()
+        if row:
+            return row[0], row[1], row[2], row[3], row[4]
+
+        return None, None, None, None, None
diff --git a/analytics/src/analytics/integrations/etldb/sprint_model.py b/analytics/src/analytics/integrations/etldb/sprint_model.py
new file mode 100644
index 0000000000..f14dc14cbc
--- /dev/null
+++ b/analytics/src/analytics/integrations/etldb/sprint_model.py
@@ -0,0 +1,132 @@
+"""Define EtlSprintModel class to encapsulate db CRUD operations."""
+
+from pandas import Series
+from sqlalchemy import text
+
+from analytics.datasets.etl_dataset import EtlEntityType
+from analytics.integrations.etldb.etldb import EtlChangeType, EtlDb
+
+
+class EtlSprintModel:
+    """Encapsulate CRUD operations for sprint entity."""
+
+    def __init__(self, dbh: EtlDb) -> None:
+        """Instantiate a class instance."""
+        self.dbh = dbh
+
+    def sync_sprint(self, sprint_df: Series, ghid_map: dict) -> tuple[
+        int | None,
+        EtlChangeType,
+    ]:
+        """Write sprint data to etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # insert dimensions
+        sprint_id = self._insert_dimensions(sprint_df, ghid_map)
+        if sprint_id is not None:
+            change_type = EtlChangeType.INSERT
+
+        # if insert failed, select and update
+        if sprint_id is None:
+            sprint_id, change_type = self._update_dimensions(sprint_df, ghid_map)
+
+        return sprint_id, change_type
+
+    def _insert_dimensions(self, sprint_df: Series, ghid_map: dict) -> int | None:
+        """Write sprint dimension data to etl database."""
+        # insert into dimension table: sprint
+        new_row_id = None
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "insert into gh_sprint(ghid, name, start_date, end_date, duration, quad_id) "
+                "values (:ghid, :name, :start, :end, :duration, :quad_id) "
+                "on conflict(ghid) do nothing returning id",
+            ),
+            {
+                "ghid": sprint_df["sprint_ghid"],
+                "name": sprint_df["sprint_name"],
+                "start": sprint_df["sprint_start"],
+                "end": sprint_df["sprint_end"],
+                "duration": sprint_df["sprint_length"],
+                "quad_id": ghid_map[EtlEntityType.QUAD].get(sprint_df["quad_ghid"]),
+            },
+        )
+        row = result.fetchone()
+        if row:
+            new_row_id = row[0]
+
+        # commit
+        self.dbh.commit(cursor)
+
+        return new_row_id
+
+    def _update_dimensions(self, sprint_df: Series, ghid_map: dict) -> tuple[
+        int | None,
+        EtlChangeType,
+    ]:
+        """Update sprint dimension data in etl database."""
+        # initialize return value
+        change_type = EtlChangeType.NONE
+
+        # get new values
+        new_values = (
+            sprint_df["sprint_name"],
+            sprint_df["sprint_start"],
+            sprint_df["sprint_end"],
+            sprint_df["sprint_length"],
+            ghid_map[EtlEntityType.QUAD].get(sprint_df["quad_ghid"]),
+        )
+
+        # select old values
+        sprint_id, old_name, old_start, old_end, old_duration, old_quad_id = (
+            self._select(sprint_df["sprint_ghid"])
+        )
+        old_values = (old_name, old_start, old_end, old_duration, old_quad_id)
+
+        # compare
+        if sprint_id is not None and new_values != old_values:
+            change_type = EtlChangeType.UPDATE
+            cursor = self.dbh.connection()
+            cursor.execute(
+                text(
+                    "update gh_sprint set name = :new_name, start_date = :new_start, "
+                    "end_date = :new_end, duration = :new_duration, quad_id = :quad_id, "
+                    "t_modified = current_timestamp where id = :sprint_id",
+                ),
+                {
+                    "new_name": new_values[0],
+                    "new_start": new_values[1],
+                    "new_end": new_values[2],
+                    "new_duration": new_values[3],
+                    "quad_id": new_values[4],
+                    "sprint_id": sprint_id,
+                },
+            )
+            self.dbh.commit(cursor)
+
+        return sprint_id, change_type
+
+    def _select(self, ghid: str) -> tuple[
+        int | None,
+        str | None,
+        str | None,
+        str | None,
+        int | None,
+        int | None,
+    ]:
+        """Select sprint data from etl database."""
+        cursor = self.dbh.connection()
+        result = cursor.execute(
+            text(
+                "select id, name, start_date, end_date, duration, quad_id "
+                "from gh_sprint where ghid = :ghid",
+            ),
+            {"ghid": ghid},
+        )
+        row = result.fetchone()
+        if row:
+            return row[0], row[1], row[2], row[3], row[4], row[5]
+
+        return None, None, None, None, None, None
diff --git a/analytics/tests/datasets/test_etldb.py b/analytics/tests/datasets/test_etldb.py
new file mode 100644
index 0000000000..042022a896
--- /dev/null
+++ b/analytics/tests/datasets/test_etldb.py
@@ -0,0 +1,83 @@
+"""Tests the code in datasets/etl_dataset.py."""
+
+from analytics.datasets.etl_dataset import EtlDataset
+
+
+class TestEtlDataset:
+    """Test EtlDataset methods."""
+
+    TEST_FILE_1 = "./tests/etldb_test_01.json"
+
+    def test_load_from_json_files(self):
+        """Class method should return the correctly transformed data."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        row_count = dataset.df.shape[0]
+        col_count = dataset.df.shape[1]
+        assert row_count == 22
+        assert col_count == 24
+
+    def test_deliverable_fetchers(self):
+        """Deliverable fetchers should return expected values."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        unique_ghids = dataset.get_deliverable_ghids()
+        assert len(unique_ghids) == 2
+
+        ghid = unique_ghids[0]
+        assert ghid == "agilesix/simpler-grants-sandbox/issues/2"
+
+        deliverable = dataset.get_deliverable(ghid)
+        assert deliverable["deliverable_title"] == "Opportunity listing page"
+
+    def test_epic_fetchers(self):
+        """Epic fetchers should return expected values."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        unique_ghids = dataset.get_epic_ghids()
+        assert len(unique_ghids) == 4
+
+        ghid = unique_ghids[0]
+        assert ghid == "agilesix/simpler-grants-sandbox/issues/8"
+
+        epic = dataset.get_epic(ghid)
+        assert epic["epic_title"] == "Deploy opportunity listing behind a feature flag"
+
+    def test_issue_fetchers(self):
+        """Issue fetchers should return expected values."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        unique_ghids = dataset.get_issue_ghids()
+        assert len(unique_ghids) == 22
+
+        ghid = unique_ghids[0]
+        assert ghid == "agilesix/simpler-grants-sandbox/issues/46"
+
+        issue = dataset.get_issue(ghid)
+        assert issue["issue_opened_at"] == "2024-09-27T15:29:37Z"
+
+    def test_sprint_fetchers(self):
+        """Sprint fetchers should return expected values."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        unique_ghids = dataset.get_sprint_ghids()
+        assert len(unique_ghids) == 5
+
+        ghid = unique_ghids[0]
+        assert ghid == "74402b12"
+
+        sprint = dataset.get_sprint(ghid)
+        assert sprint["sprint_name"] == "Sprint 2"
+
+    def test_quad_fetchers(self):
+        """Quad fetchers should return expected values."""
+        dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
+
+        unique_ghids = dataset.get_quad_ghids()
+        assert len(unique_ghids) == 1
+
+        ghid = unique_ghids[0]
+        assert ghid == "de5f962b"
+
+        quad = dataset.get_quad(ghid)
+        assert quad["quad_name"] == "BY1 Quad 1"
diff --git a/analytics/tests/etldb_test_01.json b/analytics/tests/etldb_test_01.json
new file mode 100644
index 0000000000..2c3801b444
--- /dev/null
+++ b/analytics/tests/etldb_test_01.json
@@ -0,0 +1,574 @@
+[
+  {
+    "issue_title": "exampel that doesn't ask type when created",
+    "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/46",
+    "issue_parent": null,
+    "issue_type": null,
+    "issue_is_closed": false,
+    "issue_opened_at": "2024-09-27T15:29:37Z",
+    "issue_closed_at": null,
+    "issue_points": null,
+    "issue_status": "In Progress",
+    "sprint_id": "74402b12",
+    "sprint_name": "Sprint 2",
+    "sprint_start": "2024-09-23",
+    "sprint_length": 14,
+    "sprint_end": "2024-10-07",
+    "quad_id": null,
+    "quad_name": null,
+    "quad_start": null,
+    "quad_length": null,
+    "quad_end": null,
+    "deliverable_pillar": null,
+    "deliverable_url": null,
+    "deliverable_title": null,
+    "epic_url": null,
+    "epic_title": null
+  },
+  {
+    "issue_title": "Implement opportunity listing UI",
+    "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/11",
+    "issue_parent":
"https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:17Z", + "issue_closed_at": null, + "issue_points": 5, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Implement opportunity listing API", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:03Z", + "issue_closed_at": null, + "issue_points": 5, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "exampel creating from project interface", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/45", + "issue_parent": null, + "issue_type": null, + "issue_is_closed": false, + "issue_opened_at": "2024-09-26T23:23:31Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "In Progress", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "Implement search API", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/5", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": true, + "issue_opened_at": "2024-09-18T15:41:49Z", + "issue_closed_at": "2024-09-18T19:40:40Z", + "issue_points": 3, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": 
"Enable feature flag for 1000 users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/15", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:41Z", + "issue_closed_at": null, + "issue_points": 3, + "issue_status": "Todo", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Load test for 10k active users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/14", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:20Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "8a6d26a4", + "sprint_name": "Sprint 4", + "sprint_start": "2024-10-21", + "sprint_length": 14, + "sprint_end": "2024-11-04", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Enable feature flag for first 100 users", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/13", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:50:02Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "11878b69", + "sprint_name": "Sprint 5", + "sprint_start": "2024-11-04", + "sprint_length": 14, + "sprint_end": "2024-11-18", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/9", + "epic_title": "Release to opportunity listing to 10k users" + }, + { + "issue_title": "Conduct first usability test for opportunity listing", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/12", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:49:32Z", + "issue_closed_at": null, + "issue_points": 3, + "issue_status": "Todo", + "sprint_id": "0a9ff409", + "sprint_name": "Sprint 3", + "sprint_start": "2024-10-07", + "sprint_length": 14, + "sprint_end": "2024-10-21", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + 
"quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Implement Search UI", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/6", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:41:58Z", + "issue_closed_at": null, + "issue_points": 8, + "issue_status": "Todo", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": "Host first usability test for search", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/7", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-18T15:42:24Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": "8a6d26a4", + "sprint_name": "Sprint 4", + "sprint_start": "2024-10-21", + "sprint_length": 14, + "sprint_end": "2024-11-04", + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "\ud83d\udd0e SimplerFind", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/1", + "deliverable_title": "Search", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/3", + "epic_title": "Deploy search behind a feature flag" + }, + { + "issue_title": "[Bug] DD test 01", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/25", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:09:08Z", + "issue_closed_at": "2024-09-21T01:26:21Z", + "issue_points": 2, + "issue_status": "Done", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Bug] DD test 02", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/26", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:14:39Z", + "issue_closed_at": "2024-09-21T01:26:39Z", + "issue_points": 1, + "issue_status": "Done", + "sprint_id": "74402b12", + "sprint_name": "Sprint 2", + "sprint_start": "2024-09-23", + "sprint_length": 14, + "sprint_end": "2024-10-07", + "quad_id": null, + "quad_name": null, + 
"quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[BUG] DD test 03", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/27", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:22:52Z", + "issue_closed_at": "2024-09-21T01:26:45Z", + "issue_points": 5, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[BUG] DD test 04 with screenshot", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/28", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": true, + "issue_opened_at": "2024-09-21T01:24:39Z", + "issue_closed_at": "2024-09-21T01:26:52Z", + "issue_points": 2, + "issue_status": "Done", + "sprint_id": "26a4c39d", + "sprint_name": "Sprint 1", + "sprint_start": "2024-09-09", + "sprint_length": 14, + "sprint_end": "2024-09-23", + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "Sub-issue 1", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/32", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-24T17:06:03Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "Sub issue 2", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/33", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/10", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-24T17:06:18Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": "de5f962b", + "quad_name": "BY1 Quad 1", + "quad_start": "2024-09-09", + "quad_length": 122, + "quad_end": "2025-01-09", + "deliverable_pillar": "SimplerApply", + "deliverable_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/2", + "deliverable_title": "Opportunity listing page", + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/8", + "epic_title": "Deploy opportunity listing behind a feature flag" + }, + { + "issue_title": "[Bug] Sample 
bug created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/35", + "issue_parent": null, + "issue_type": "Bug", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T17:58:41Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Feature] Sample feature request created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/36", + "issue_parent": null, + "issue_type": "Enhancement", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T17:59:29Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[ADR] Sample decision created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/37", + "issue_parent": null, + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T18:00:34Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Task] Sample task created with issue template", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/40", + "issue_parent": null, + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T18:02:47Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": null, + "epic_title": null + }, + { + "issue_title": "[Task] Bar 1", + "issue_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/42", + "issue_parent": "https://github.com/agilesix/simpler-grants-sandbox/issues/39", + "issue_type": "Task", + "issue_is_closed": false, + "issue_opened_at": "2024-09-25T19:14:49Z", + "issue_closed_at": null, + "issue_points": null, + "issue_status": "Todo", + "sprint_id": null, + "sprint_name": null, + "sprint_start": null, + "sprint_length": null, + "sprint_end": null, + "quad_id": null, + "quad_name": null, + "quad_start": null, + "quad_length": null, + "quad_end": null, + "deliverable_pillar": null, + "deliverable_url": null, + "deliverable_title": null, + "epic_url": "https://github.com/agilesix/simpler-grants-sandbox/issues/39", + 
"epic_title": "[Epic] Sample epic created with issue template" + } +] \ No newline at end of file diff --git a/analytics/tests/test_cli.py b/analytics/tests/test_cli.py index 8c230aaaec..da7dbffc86 100644 --- a/analytics/tests/test_cli.py +++ b/analytics/tests/test_cli.py @@ -272,3 +272,70 @@ def test_stdout_message_includes_issues_if_unit_set_to_issues( # validation - check that slack message is printed and includes 'points' assert "Slack message" in result.stdout assert "issues" in result.stdout + + +class TestEtlEntryPoint: + """Test the etl entry point.""" + + TEST_FILE_1 = "./tests/etldb_test_01.json" + EFFECTIVE_DATE = "2024-10-07" + + def test_init_db(self): + """Test the db initialization command.""" + # setup - create command + command = [ + "etl", + "initialize_database", + ] + # execution + result = runner.invoke(app, command) + print(result.stdout) + # validation - check there wasn't an error + assert result.exit_code == 0 + assert "initializing database" in result.stdout + assert "done" in result.stdout + + def test_transform_and_load_with_valid_parameters(self): + """Test the transform and load command.""" + # setup - create command + command = [ + "etl", + "transform_and_load", + "--deliverable-file", + self.TEST_FILE_1, + "--effective-date", + str(self.EFFECTIVE_DATE), + ] + # execution + result = runner.invoke(app, command) + print(result.stdout) + # validation - check there wasn't an error + assert result.exit_code == 0 + assert ( + f"running transform and load with effective date {self.EFFECTIVE_DATE}" + in result.stdout + ) + assert "quad row(s) processed: 1" in result.stdout + assert "deliverable row(s) processed: 2" in result.stdout + assert "sprint row(s) processed: 5" in result.stdout + assert "epic row(s) processed: 4" in result.stdout + assert "issue row(s) processed: 22" in result.stdout + assert "transform and load is done" in result.stdout + + def test_transform_and_load_with_malformed_effective_date_parameter(self): + """Test the transform and load command.""" + # setup - create command + command = [ + "etl", + "transform_and_load", + "--deliverable-file", + self.TEST_FILE_1, + "--effective-date", + "2024-Oct-07", + ] + # execution + result = runner.invoke(app, command) + print(result.stdout) + # validation - check there wasn't an error + assert result.exit_code == 0 + assert "FATAL ERROR: malformed effective date" in result.stdout diff --git a/documentation/analytics/usage.md b/documentation/analytics/usage.md index c84a4f0210..801b8cbb38 100644 --- a/documentation/analytics/usage.md +++ b/documentation/analytics/usage.md @@ -223,3 +223,17 @@ poetry run analytics calculate deliverable_percent_complete \ --show-results \ --unit points ``` + +### Extract and Load + +Development is underway on new as-is/as-was reporting capabilities, the foundation of which is an extract-and-load workflow that writes to an ETL DB. 
+
+Initialize the ETL DB:
+```bash
+poetry run analytics etl initialize_database
+```
+
+Transform and load a JSON file into the ETL DB:
+```bash
+poetry run analytics etl transform_and_load --deliverable-file ./data/test-etl-01.json --effective-date 2024-10-28
+```

From c93c1cfc736359d5ef89765000cdf3301c0d734e Mon Sep 17 00:00:00 2001
From: doug-s-nava <92806979+doug-s-nava@users.noreply.github.com>
Date: Tue, 5 Nov 2024 16:03:42 -0500
Subject: [PATCH 07/13] [Issue 2616] show correct date opportunity data with
 collapsable text (#2683)

* replaces the hardcoded close date warning message in the opportunity status widget with the close date description from the opportunity summary data
* handles long summary descriptions and long close date descriptions by collapsing the description after a certain number of characters
* updates the ContentDisplayToggle component to be useful in this use case with flexibility around horizontal alignment and the ability to display hidden content above the button when expanded
* creates utility functions to handle splitting a string, such as the summary description, that may contain markup that we don't want to break by splitting at the wrong spot
* updates local seed data with two records containing long summary and close date descriptions to ease local testing
---
 api/Makefile                                  |  2 +-
 api/tests/lib/seed_local_db.py                |  9 ++
 api/tests/src/db/models/factories.py          | 13 +++
 frontend/package-lock.json                    |  1 -
 frontend/src/app/[locale]/search/page.tsx     |  1 +
 .../src/components/CollapsableContent.tsx     |  0
 .../src/components/ContentDisplayToggle.tsx   | 42 ++++++---
 .../opportunity/OpportunityDescription.tsx    | 53 +++++++++++-
 .../opportunity/OpportunityHistory.tsx        | 58 +++++++------
 .../opportunity/OpportunityStatusWidget.tsx   | 48 ++++++++++-
 frontend/src/i18n/messages/en/index.ts        |  6 +-
 frontend/src/utils/generalUtils.ts            | 78 +++++++++++++++++
 .../opportunity/OpportunityHistory.test.tsx   | 42 +++------
 .../OpportunityStatusWidget.test.tsx          |  3 +-
 frontend/tests/utils/generalUtils.test.ts     | 86 +++++++++++++++++++
 15 files changed, 359 insertions(+), 83 deletions(-)
 create mode 100644 frontend/src/components/CollapsableContent.tsx
 create mode 100644 frontend/src/utils/generalUtils.ts
 create mode 100644 frontend/tests/utils/generalUtils.test.ts

diff --git a/api/Makefile b/api/Makefile
index c0da34cb1a..67079cafc2 100644
--- a/api/Makefile
+++ b/api/Makefile
@@ -192,7 +192,7 @@ start-opensearch:
 	./bin/wait-for-local-opensearch.sh
 
 ##################################################
-# Opensearch
+# Localstack
 ##################################################
 
 init-localstack: start-localstack setup-localstack ## Start localstack (local s3) and setup buckets
diff --git a/api/tests/lib/seed_local_db.py b/api/tests/lib/seed_local_db.py
index c634f280e5..f299ce6847 100644
--- a/api/tests/lib/seed_local_db.py
+++ b/api/tests/lib/seed_local_db.py
@@ -98,6 +98,9 @@ def _build_opportunities(db_session: db.Session, iterations: int, include_histor
     no_current_summary_opps = factories.OpportunityFactory.create_batch(
         size=5, no_current_summary=True
     )
+    long_description_opps = factories.OpportunityFactory.create_batch(
+        size=2, is_posted_summary=True, has_long_descriptions=True
+    )
 
     if include_history:
         _add_history(forecasted_opps, add_forecast_hist=True)
@@ -115,6 +118,12 @@ def _build_opportunities(db_session: db.Session, iterations: int, include_histor
         )
         _add_history(archived_forecast_opps, add_forecast_hist=True)
         _add_history(no_current_summary_opps, is_history_deleted=True)
+
_add_history( + long_description_opps, + add_non_forecast_hist=True, + add_forecast=True, + add_forecast_hist=True, + ) # generate a few opportunities with mostly null values all_null_opportunities = factories.OpportunityFactory.create_batch( diff --git a/api/tests/src/db/models/factories.py b/api/tests/src/db/models/factories.py index a4b17411a3..728752777a 100644 --- a/api/tests/src/db/models/factories.py +++ b/api/tests/src/db/models/factories.py @@ -338,6 +338,10 @@ class Params: current_opportunity_summary__is_archived_forecast_summary=True ) + has_long_descriptions = factory.Trait( + current_opportunity_summary__has_long_descriptions=True + ) + # Set all nullable fields to null all_fields_null = factory.Trait( agency=None, @@ -571,6 +575,11 @@ class Params: link_applicant_types=[], ) + has_long_descriptions = factory.Trait( + summary_description=factory.Faker("paragraph", nb_sentences=60), + close_date_description=factory.Faker("paragraph", nb_sentences=30), + ) + class CurrentOpportunitySummaryFactory(BaseFactory): class Meta: @@ -608,6 +617,10 @@ class Params: opportunity_summary__is_archived_forecast_summary=True, ) + has_long_descriptions = factory.Trait( + opportunity_summary__has_long_descriptions=True, + ) + class OpportunityAssistanceListingFactory(BaseFactory): class Meta: diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 854aa98cbe..1eab857890 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -38,7 +38,6 @@ "@testing-library/jest-dom": "^5.16.5", "@testing-library/react": "^16.0.0", "@testing-library/user-event": "^14.4.3", - "@types/jest": "^29.5.13", "@types/jest-axe": "^3.5.5", "@types/js-cookie": "^3.0.6", "@types/node": "^20.8.2", diff --git a/frontend/src/app/[locale]/search/page.tsx b/frontend/src/app/[locale]/search/page.tsx index 2c0329d0c2..8023f6465c 100644 --- a/frontend/src/app/[locale]/search/page.tsx +++ b/frontend/src/app/[locale]/search/page.tsx @@ -48,6 +48,7 @@ function Search({ searchParams }: { searchParams: SearchParamsTypes }) { showCallToAction={t("filterDisplayToggle.showFilters")} hideCallToAction={t("filterDisplayToggle.hideFilters")} breakpoint={Breakpoints.TABLET} + type="centered" > + {children} +
+ ); + return ( <> + {!positionButtonBelowContent && toggledContent}
@@ -47,20 +71,12 @@ export default function ContentDisplayToggle({ className="usa-button usa-button--unstyled text-no-underline" > - + {toggledContentVisible ? hideCallToAction : showCallToAction}
-
- {children} -
+ {positionButtonBelowContent && toggledContent} ); } diff --git a/frontend/src/components/opportunity/OpportunityDescription.tsx b/frontend/src/components/opportunity/OpportunityDescription.tsx index 95917237eb..ba9b685f8b 100644 --- a/frontend/src/components/opportunity/OpportunityDescription.tsx +++ b/frontend/src/components/opportunity/OpportunityDescription.tsx @@ -1,8 +1,11 @@ import DOMPurify from "isomorphic-dompurify"; import { Summary } from "src/types/opportunity/opportunityResponseTypes"; +import { splitMarkup } from "src/utils/generalUtils"; import { useTranslations } from "next-intl"; +import ContentDisplayToggle from "src/components/ContentDisplayToggle"; + type Props = { summary: Summary; }; @@ -45,6 +48,49 @@ const eligibleApplicantsFormatter = (applicantTypes: string[]) => { }); }; +const SummaryDescriptionDisplay = ({ + summaryDescription = "", +}: { + summaryDescription: string; +}) => { + const t = useTranslations("OpportunityListing.description"); + if (summaryDescription?.length < 750) { + return ( +
+ ); + } + + const purifiedSummary = DOMPurify.sanitize(summaryDescription); + + const { preSplit, postSplit } = splitMarkup(purifiedSummary, 600); + return ( + <> +
+ +
+ + + ); +}; + const OpportunityDescription = ({ summary }: Props) => { const t = useTranslations("OpportunityListing.description"); const agency_phone_number_stripped = summary?.agency_phone_number @@ -68,15 +114,14 @@ const OpportunityDescription = ({ summary }: Props) => { ) : ( "--" ); + return ( <>

{t("title")}

{t("summary")}

-

{t("eligibility")}

{t("eligible_applicants")}

diff --git a/frontend/src/components/opportunity/OpportunityHistory.tsx b/frontend/src/components/opportunity/OpportunityHistory.tsx index b999a06924..be39094b1d 100644 --- a/frontend/src/components/opportunity/OpportunityHistory.tsx +++ b/frontend/src/components/opportunity/OpportunityHistory.tsx @@ -7,39 +7,47 @@ type Props = { summary: Summary; }; -type TranslationKeys = - | "version" - | "posted_date" - | "closing_date" - | "archive_date"; - const formatHistoryDate = (date: string | null) => { return date === null ? "--" : formatDate(date); }; +const OpportunityHistoryItem = ({ + title, + content, +}: { + title: string; + content: string; +}) => { + return ( +
+

+ {title} + {":"} +

+

{content}

+
+ ); +}; + const OpportunityHistory = ({ summary }: Props) => { const t = useTranslations("OpportunityListing.history"); - const opportunityDates = { - posted_date: summary.post_date, - closing_date: summary.close_date, - archive_date: summary.archive_date, - }; return (
-

History

-
-

{t("version")}:

-

{summary.version_number || "--"}

-
- {Object.entries(opportunityDates).map(([title, date], index) => ( -
-

- {t(`${title as TranslationKeys}`)} - {":"} -

-

{formatHistoryDate(date)}

-
- ))} +

{t("history")}

+ + +
); }; diff --git a/frontend/src/components/opportunity/OpportunityStatusWidget.tsx b/frontend/src/components/opportunity/OpportunityStatusWidget.tsx index f6ad2fcad9..8319a6c928 100644 --- a/frontend/src/components/opportunity/OpportunityStatusWidget.tsx +++ b/frontend/src/components/opportunity/OpportunityStatusWidget.tsx @@ -1,12 +1,52 @@ import { Opportunity } from "src/types/opportunity/opportunityResponseTypes"; import { formatDate } from "src/utils/dateUtil"; +import { findFirstWhitespace } from "src/utils/generalUtils"; import { useTranslations } from "next-intl"; +import ContentDisplayToggle from "src/components/ContentDisplayToggle"; + type Props = { opportunityData: Opportunity; }; +const CloseDateDescriptionDisplay = ({ + closeDateDescription = "", +}: { + closeDateDescription: string; +}) => { + const t = useTranslations("OpportunityListing.description"); + if (!closeDateDescription) { + return; + } + + if (closeDateDescription?.length < 150) { + return ( +
+

{closeDateDescription}

+
+ ); + } + + // close date description should not contain markup so no need to use splitMarkup + const splitAt = findFirstWhitespace(closeDateDescription, 120); + const preSplit = closeDateDescription.substring(0, splitAt); + const postSplit = closeDateDescription.substring(splitAt + 1); + + return ( +
+

{preSplit}...

+ +

{postSplit}

+
+
+ ); +}; + const OpportunityStatusWidget = ({ opportunityData }: Props) => { const t = useTranslations("OpportunityListing.status_widget"); @@ -47,9 +87,11 @@ const OpportunityStatusWidget = ({ opportunityData }: Props) => { {formatDate(closeDate) || "--"}

-
-

{t("closing_warn")}

-
+ ); case "forecasted": diff --git a/frontend/src/i18n/messages/en/index.ts b/frontend/src/i18n/messages/en/index.ts index 7ee3275e0a..f39338fa1e 100644 --- a/frontend/src/i18n/messages/en/index.ts +++ b/frontend/src/i18n/messages/en/index.ts @@ -23,6 +23,9 @@ export const messages = { description: "Description", email: "Email", telephone: "Phone", + show_summary: "Show full summary", + show_description: "Show full description", + hide_summary_description: "Hide full description", }, award_info: { yes: "Yes", @@ -40,6 +43,7 @@ export const messages = { category_explanation: "Category Explanation", }, history: { + history: "History", posted_date: "Posted date", closing_date: "Original closing date for applications", archive_date: "Archive date", @@ -53,8 +57,6 @@ export const messages = { closed: "Closed: ", closing: "Closing: ", forecasted: "Forecasted", - closing_warn: - "Electronically submitted applications must be submitted no later than 5:00 p.m., ET, on the listed application due date.", }, cta: { apply_title: "Application process", diff --git a/frontend/src/utils/generalUtils.ts b/frontend/src/utils/generalUtils.ts new file mode 100644 index 0000000000..d0a2cd18e5 --- /dev/null +++ b/frontend/src/utils/generalUtils.ts @@ -0,0 +1,78 @@ +// splits a string containing markup at a specified character length +// tracks open tags to ensure that split does not occur until all open tags are closed. +// will throw on malformed markup, so any callers will need to gracefully handle that error. +// Note that: +// * the character count is zero indexed +// * the split will happen on the first whitespace AFTER the supplied split point +// Refer to tests to see how this works in practice +export const splitMarkup = ( + markupString: string, + splitAt: number, +): { + preSplit: string; + postSplit: string; +} => { + if (splitAt > markupString.length) { + return { preSplit: markupString, postSplit: "" }; + } + const { preSplit, postSplit } = Array.from(markupString).reduce( + (tracker, character, index) => { + if ( + !tracker.splitComplete && + !tracker.tagOpen && + index > splitAt && + character.match(/\s/) + ) { + tracker.splitComplete = true; + } + if (tracker.splitComplete) { + tracker.postSplit += character; + return tracker; + } + if (character === "<") { + if (tracker.openTagIndicator) { + throw new Error("Malformed markup: unclosed tag"); + } + tracker.openTagIndicator = true; + } + if (tracker.openTagIndicator && character === "/") { + if (tracker.closeTagIndicator) { + throw new Error("Malformed markup: improperly closed tag"); + } + tracker.closeTagIndicator = true; + } + if (tracker.openTagIndicator && character === ">") { + if (tracker.closeTagIndicator) { + tracker.tagOpen--; + tracker.closeTagIndicator = false; + tracker.openTagIndicator = false; + if (tracker.tagOpen < 0) { + throw new Error("Malformed markup: tag open close mismatch"); + } + } else { + tracker.tagOpen++; + tracker.openTagIndicator = false; + } + } + tracker.preSplit += character; + return tracker; + }, + { + preSplit: "", + postSplit: "", + tagOpen: 0, + openTagIndicator: false, + closeTagIndicator: false, + splitComplete: false, + }, + ); + return { + preSplit, + postSplit, + }; +}; + +// for a given string, find the first whitespace character following a given index +// useful for splitting strings of text at word breaks +export const findFirstWhitespace = (content: string, startAt: number): number => + content.substring(startAt).search(/\s/) + startAt; diff --git 
a/frontend/tests/components/opportunity/OpportunityHistory.test.tsx b/frontend/tests/components/opportunity/OpportunityHistory.test.tsx index 6eee3fd4c6..4ed21ae51d 100644 --- a/frontend/tests/components/opportunity/OpportunityHistory.test.tsx +++ b/frontend/tests/components/opportunity/OpportunityHistory.test.tsx @@ -3,6 +3,7 @@ import { render, screen } from "@testing-library/react"; import { Summary } from "src/types/opportunity/opportunityResponseTypes"; import { formatDate } from "src/utils/dateUtil"; +import { useTranslationsMock } from "src/utils/testing/intlMocks"; import OpportunityHistory from "src/components/opportunity/OpportunityHistory"; @@ -13,20 +14,11 @@ jest.mock("src/utils/dateUtil", () => ({ // Mock `useTranslations` jest.mock("next-intl", () => ({ - useTranslations: jest.fn().mockReturnValue((key: string) => { - const translations: { [key: string]: string } = { - posted_date: "Posted date", - closing_date: "Original closing date for applications", - archive_date: "Archive date", - version: "Version", - }; - return translations[key] || key; - }), + useTranslations: () => useTranslationsMock(), })); const mockSummary = { post_date: "2024-01-15", - close_date: "2024-06-30", archive_date: "2024-12-31", version_number: 1, } as Summary; @@ -35,23 +27,15 @@ describe("OpportunityHistory", () => { it("renders history section with dates formatted correctly", () => { render(); - // Check for section heading - expect(screen.getByText("History")).toBeInTheDocument(); + expect(screen.getByText("history")).toBeInTheDocument(); - // Check version label - expect(screen.getByText("Version:")).toBeInTheDocument(); + expect(screen.getByText("version:")).toBeInTheDocument(); expect(screen.getByText("1")).toBeInTheDocument(); - // Check Posted Date - expect(screen.getByText("Posted date:")).toBeInTheDocument(); + expect(screen.getByText("posted_date:")).toBeInTheDocument(); expect(screen.getByText("2024-01-15")).toBeInTheDocument(); - // Check Original Closing Date - expect( - screen.getByText("Original closing date for applications:"), - ).toBeInTheDocument(); - expect(screen.getByText("2024-06-30")).toBeInTheDocument(); - expect(screen.getByText("Archive date:")).toBeInTheDocument(); + expect(screen.getByText("archive_date:")).toBeInTheDocument(); expect(screen.getByText("2024-12-31")).toBeInTheDocument(); }); @@ -60,7 +44,6 @@ describe("OpportunityHistory", () => { // Check that formatDate is called with the right dates expect(formatDate).toHaveBeenCalledWith("2024-01-15"); - expect(formatDate).toHaveBeenCalledWith("2024-06-30"); expect(formatDate).toHaveBeenCalledWith("2024-12-31"); }); @@ -70,7 +53,6 @@ describe("OpportunityHistory", () => { summary={ { post_date: null, - close_date: null, archive_date: null, version_number: null, } as Summary @@ -78,18 +60,14 @@ describe("OpportunityHistory", () => { />, ); - const firstHeading = screen.getByText("History"); + const firstHeading = screen.getByText("history"); expect(firstHeading.nextSibling).toHaveTextContent("--"); - const secondHeading = screen.getByText("Version:"); + const secondHeading = screen.getByText("version:"); expect(secondHeading.nextSibling).toHaveTextContent("--"); - const thirdHeading = screen.getByText("Posted date:"); + const thirdHeading = screen.getByText("posted_date:"); expect(thirdHeading.nextSibling).toHaveTextContent("--"); - const fourthHeading = screen.getByText( - "Original closing date for applications:", - ); - expect(fourthHeading.nextSibling).toHaveTextContent("--"); - const fifthHeading = 
screen.getByText("Archive date:"); + const fifthHeading = screen.getByText("archive_date:"); expect(fifthHeading.nextSibling).toHaveTextContent("--"); }); }); diff --git a/frontend/tests/components/opportunity/OpportunityStatusWidget.test.tsx b/frontend/tests/components/opportunity/OpportunityStatusWidget.test.tsx index dacd91c0b0..9b654c85a9 100644 --- a/frontend/tests/components/opportunity/OpportunityStatusWidget.test.tsx +++ b/frontend/tests/components/opportunity/OpportunityStatusWidget.test.tsx @@ -18,8 +18,6 @@ jest.mock("next-intl", () => ({ closed: "Closed: ", closing: "Closing: ", forecasted: "Forecasted", - closing_warn: - "Electronically submitted applications must be submitted no later than 5:00 p.m., ET, on the listed application due date.", }; return translations[key] || key; }), @@ -37,6 +35,7 @@ const mockOpportunityData: Opportunity = { summary: { close_date: "2024-12-01", archive_date: "2025-01-01", + close_date_description: "Electronically submitted applications", } as Summary, } as Opportunity; diff --git a/frontend/tests/utils/generalUtils.test.ts b/frontend/tests/utils/generalUtils.test.ts new file mode 100644 index 0000000000..555b611aa4 --- /dev/null +++ b/frontend/tests/utils/generalUtils.test.ts @@ -0,0 +1,86 @@ +import { findFirstWhitespace, splitMarkup } from "src/utils/generalUtils"; + +describe("splitMarkup", () => { + it("handles case where markdown string is shorter than split point", () => { + const exampleMarkup = "Hi!"; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 10); + expect(preSplit).toEqual(exampleMarkup); + expect(postSplit).toEqual(""); + }); + + it("waits until whitespace to perform split (space)", () => { + const exampleMarkup = "Hi! Exceptionally long word."; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 6); + expect(preSplit).toEqual("Hi! Exceptionally"); + expect(postSplit).toEqual(" long word."); + }); + + it("waits until whitespace to perform split (line break)", () => { + const exampleMarkup = `Hi! Exceptionally + long word.`; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 6); + expect(preSplit).toEqual("Hi! Exceptionally"); + expect(postSplit).toEqual(` + long word.`); + }); + + it("splits correctly with no tags", () => { + const exampleMarkup = + "In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since."; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 73); + expect(preSplit).toEqual( + "In my younger and more vulnerable years my father gave me some advice that", + ); + expect(postSplit).toEqual(" I've been turning over in my mind ever since."); + }); + + it("splits correctly with simple tags", () => { + const exampleMarkup = + "
In my younger and more vulnerable years my father gave me some advice
that I've been turning over in my mind ever since.
"; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 55); + expect(preSplit).toEqual( + "
In my younger and more vulnerable years my father gave me some advice
", + ); + expect(postSplit).toEqual( + "
that I've been turning over in my mind ever since.
", + ); + }); + + it("splits correctly on tag names", () => { + const exampleMarkup = + "
In my younger and more vulnerable years my father gave me some advice
that I've been turning over in my mind ever since.
"; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 78); + expect(preSplit).toEqual( + "
In my younger and more vulnerable years my father gave me some advice
", + ); + expect(postSplit).toEqual( + "
that I've been turning over in my mind ever since.
", + ); + }); + + it("splits correctly with nested tags", () => { + const exampleMarkup = + "
In

my younger

and more

vulnerable years my

  • father
  • gave
me some

advice
that I've been turning over in my mind ever since.
"; + const { preSplit, postSplit } = splitMarkup(exampleMarkup, 75); + expect(preSplit).toEqual( + "
In

my younger

and more

vulnerable years my

  • father
  • gave
me some

advice
", + ); + expect(postSplit).toEqual( + "
that I've been turning over in my mind ever since.
", + ); + }); +}); + +describe("findFirstWhitespace", () => { + it("gives you the index of the first whitespace character in a string after a given index", () => { + expect(findFirstWhitespace("hi there", 0)).toEqual(2); + expect(findFirstWhitespace("hi there dude", 3)).toEqual(8); + expect( + findFirstWhitespace( + `hi there + dude`, + 3, + ), + ).toEqual(8); + }); +}); From e513579e0818439fd93282a3cb53dd8c8fe717c2 Mon Sep 17 00:00:00 2001 From: David Dudas Date: Tue, 5 Nov 2024 15:28:38 -0800 Subject: [PATCH 08/13] [Issue 2665] Minor fix to Makefile and CLI to integrate GitHub export and import capabilities (#2744) ## Summary Partially Fixes #2665 ### Time to review: __1 min__ ## Changes proposed > What was added, updated, or removed in this PR. Minor change to Makefile and CLI to integrate the export work done in #2481 with the import work done in #2482. ## Context for reviewers > Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified. Issue #2481 added capability to export data from GitHub to a flat file that can be ingested by the new import capabilities added by issue #2482. However, in testing, I just noticed a filename mismatch in `Makefile` that precludes e2e integration. This PR fixes the filename mismatch. ## Additional information > Screenshots, GIF demos, code examples or output to help show the changes working as expected. --- analytics/Makefile | 2 +- analytics/src/analytics/cli.py | 5 ++--- analytics/tests/test_cli.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/analytics/Makefile b/analytics/Makefile index a9b1bd07d4..1e7fbc646e 100644 --- a/analytics/Makefile +++ b/analytics/Makefile @@ -154,7 +154,7 @@ gh-transform-and-load: @echo "=> Transforming and loading GitHub data into the database" @echo "=====================================================" $(POETRY) analytics etl transform_and_load \ - --deliverable-file $(DELIVERY_FILE) \ + --issue-file $(ISSUE_FILE) \ --effective-date $(EFFECTIVE_DATE) @echo "=====================================================" diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index 1d9bde629f..84c3c379bd 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -40,7 +40,6 @@ STATUS_ARG = typer.Option( help="Deliverable status to include in report, can be passed multiple times", ) -DELIVERABLE_FILE_ARG = typer.Option(help="Path to file with exported deliverable data") EFFECTIVE_DATE_ARG = typer.Option(help="YYYY-MM-DD effective date to apply to each imported row") # fmt: on @@ -264,7 +263,7 @@ def initialize_database() -> None: @etl_app.command(name="transform_and_load") def transform_and_load( - deliverable_file: Annotated[str, DELIVERABLE_FILE_ARG], + issue_file: Annotated[str, ISSUE_FILE_ARG], effective_date: Annotated[str, EFFECTIVE_DATE_ARG], ) -> None: """Transform and load etl data.""" @@ -282,7 +281,7 @@ def transform_and_load( return # hydrate a dataset instance from the input data - dataset = EtlDataset.load_from_json_file(file_path=deliverable_file) + dataset = EtlDataset.load_from_json_file(file_path=issue_file) # sync data to db etldb.sync_db(dataset, datestamp) diff --git a/analytics/tests/test_cli.py b/analytics/tests/test_cli.py index da7dbffc86..dec9ed29d1 100644 --- a/analytics/tests/test_cli.py +++ b/analytics/tests/test_cli.py @@ -301,7 +301,7 @@ def test_transform_and_load_with_valid_parameters(self): command = [ 
"etl", "transform_and_load", - "--deliverable-file", + "--issue-file", self.TEST_FILE_1, "--effective-date", str(self.EFFECTIVE_DATE), @@ -328,7 +328,7 @@ def test_transform_and_load_with_malformed_effective_date_parameter(self): command = [ "etl", "transform_and_load", - "--deliverable-file", + "--issue-file", self.TEST_FILE_1, "--effective-date", "2024-Oct-07", From fe2e37d83a8640fded6bdb96c61ffd1ab71dd1d9 Mon Sep 17 00:00:00 2001 From: Michael Chouinard <46358556+chouinar@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:00:12 -0500 Subject: [PATCH 09/13] [Issue #2729] OpenSearch should return an accurate count of total results (#2730) ## Summary Fixes #2729 ### Time to review: __3 mins__ ## Changes proposed Set `track_total_hits` to True when calling OpenSearch ## Context for reviewers While this field says it has possible performance cost due to needing to count all the records, we also request a count for various facet counts anyways, so I imagine this won't matter at all. ## Additional information https://opensearch.org/docs/latest/api-reference/search/ I loaded ~16k records into my local search index. Querying it with no filters returns this pagination info now: ```json { "order_by": "opportunity_id", "page_offset": 1, "page_size": 25, "sort_direction": "ascending", "total_pages": 676, "total_records": 16884 } ``` --- api/src/adapters/search/opensearch_query_builder.py | 12 ++++++++++++ .../opportunities_v1/search_opportunities.py | 3 +++ .../adapters/search/test_opensearch_query_builder.py | 12 +++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/api/src/adapters/search/opensearch_query_builder.py b/api/src/adapters/search/opensearch_query_builder.py index a7e900c4d1..7e2cebe4df 100644 --- a/api/src/adapters/search/opensearch_query_builder.py +++ b/api/src/adapters/search/opensearch_query_builder.py @@ -94,6 +94,8 @@ def __init__(self) -> None: self.sort_values: list[dict[str, dict[str, str]]] = [] + self._track_total_hits: bool = True + self.must: list[dict] = [] self.filters: list[dict] = [] @@ -133,6 +135,15 @@ def sort_by(self, sort_values: list[typing.Tuple[str, SortDirection]]) -> typing return self + def track_total_hits(self, track_total_hits: bool) -> typing.Self: + """ + Whether or not to track the total number of hits in the response accurately. + + By default OpenSearch will stop counting after 10k records are counted. + """ + self._track_total_hits = track_total_hits + return self + def simple_query(self, query: str, fields: list[str]) -> typing.Self: """ Adds a simple_query_string which queries against the provided fields. 
@@ -238,6 +249,7 @@ def build(self) -> dict: # Always include the scores in the response objects # even if we're sorting by non-relevancy "track_scores": True, + "track_total_hits": self._track_total_hits, } # Add sorting if any was provided diff --git a/api/src/services/opportunities_v1/search_opportunities.py b/api/src/services/opportunities_v1/search_opportunities.py index aff12321d6..9f232cc79c 100644 --- a/api/src/services/opportunities_v1/search_opportunities.py +++ b/api/src/services/opportunities_v1/search_opportunities.py @@ -143,6 +143,9 @@ def _add_aggregations(builder: search.SearchQueryBuilder) -> None: def _get_search_request(params: SearchOpportunityParams) -> dict: builder = search.SearchQueryBuilder() + # Make sure total hit count gets counted for more than 10k records + builder.track_total_hits(True) + # Pagination builder.pagination( page_size=params.pagination.page_size, page_number=params.pagination.page_offset diff --git a/api/tests/src/adapters/search/test_opensearch_query_builder.py b/api/tests/src/adapters/search/test_opensearch_query_builder.py index 33f9b2b199..9ce82f1098 100644 --- a/api/tests/src/adapters/search/test_opensearch_query_builder.py +++ b/api/tests/src/adapters/search/test_opensearch_query_builder.py @@ -161,7 +161,12 @@ def seed_data(self, search_client, search_index): def test_query_builder_empty(self, search_client, search_index): builder = SearchQueryBuilder() - assert builder.build() == {"size": 25, "from": 0, "track_scores": True} + assert builder.build() == { + "size": 25, + "from": 0, + "track_scores": True, + "track_total_hits": True, + } validate_valid_request(search_client, search_index, builder, FULL_DATA) @@ -265,6 +270,7 @@ def test_query_builder_pagination_and_sorting( "size": page_size, "from": page_size * (page_number - 1), "track_scores": True, + "track_total_hits": True, "sort": expected_sort, } @@ -369,6 +375,7 @@ def test_query_builder_filter_terms( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": expected_terms}}, } @@ -429,6 +436,7 @@ def test_query_builder_filter_date_range( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": [{"range": {"publication_date": expected_ranges}}]}}, } @@ -474,6 +482,7 @@ def test_query_builder_filter_int_range( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": {"bool": {"filter": [{"range": {"page_count": expected_ranges}}]}}, } @@ -633,6 +642,7 @@ def test_query_builder_simple_query_and_aggregations( "size": 25, "from": 0, "track_scores": True, + "track_total_hits": True, "query": { "bool": { "must": [ From 9cc181eea1b1d7877fb6084b4ac5f6a0055fce67 Mon Sep 17 00:00:00 2001 From: "kai [they]" Date: Thu, 7 Nov 2024 10:13:00 -0800 Subject: [PATCH 10/13] [no ticket] update labeler action version (#2769) ## Context This is currently failing a lot of CI builds --- .github/workflows/labeler.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index a17c354ba8..38fd452776 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -20,8 +20,8 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 # Uploads repository content to the runner - with: - sparse-checkout: | - .github - - uses: actions/labeler@v4 + - uses: actions/checkout@v4 # Uploads repository content to the runner + with: + sparse-checkout: | + .github + - uses: 
actions/labeler@v5

From b7863d4d0ee30144e5d14093ea10119222f62e00 Mon Sep 17 00:00:00 2001
From: David Dudas
Date: Thu, 7 Nov 2024 10:30:54 -0800
Subject: [PATCH 11/13] [Issue 2665] Add gh-transform-and-load command to scheduled jobs (#2759)

## Summary
Fixes #2665

### Time to review: __1 min__

## Changes proposed
> What was added, updated, or removed in this PR.

Added the `gh-transform-and-load` command to the existing `make gh-data-export` scheduled job. I'm not sure if this is sufficient or correct, but I'm taking a guess based on what I see in https://github.com/HHS/simpler-grants-gov/pull/2546 and https://github.com/HHS/simpler-grants-gov/pull/2506.

## Context for reviewers
> Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified.

In the analytics work stream, we have a new command, `make gh-transform-and-load`, for transforming and loading (some) GitHub data. Per issue #2665, that command should be run daily, after the existing `gh-data-export` command, which exports data from GitHub. I see that `scheduled_jobs.tf` seems to be the mechanism by which `make gh-data-export` runs daily. In this PR I'm taking an educated guess and attempting to add `gh-transform-and-load` to the existing job, and requesting feedback from @coilysiren as to whether this is the correct approach.

## Additional information
> Screenshots, GIF demos, code examples or output to help show the changes working as expected.

Co-authored-by: kai [they]
---
 infra/analytics/app-config/env-config/scheduled_jobs.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infra/analytics/app-config/env-config/scheduled_jobs.tf b/infra/analytics/app-config/env-config/scheduled_jobs.tf
index 3d0774888c..b6d098ffac 100644
--- a/infra/analytics/app-config/env-config/scheduled_jobs.tf
+++ b/infra/analytics/app-config/env-config/scheduled_jobs.tf
@@ -7,7 +7,7 @@ locals {
   scheduled_jobs = {
     sprint-reports = {
-      task_command = ["make", "gh-data-export", "sprint-reports"]
+      task_command = ["make", "gh-data-export", "sprint-reports", "gh-transform-and-load"]
       schedule_expression = "rate(1 days)"
       state = "ENABLED"
     }

From f8c2fbe0729c453f5eed0b02678e52a7245174a5 Mon Sep 17 00:00:00 2001
From: David Dudas
Date: Thu, 7 Nov 2024 11:42:11 -0800
Subject: [PATCH 12/13] [Issue 2665] Add new job to initialize EtlDb (#2778)

## Summary
Fixes #2665

### Time to review: __1 min__

## Changes proposed
> What was added, updated, or removed in this PR.

Added a scheduled job to run `make init-db`.

## Context for reviewers
> Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified.

The GitHub data export, transform, and load job (see https://github.com/HHS/simpler-grants-gov/pull/2759) depends on a certain schema existing in Postgres. This PR creates a job to ensure the schema exists.

## Additional information
> Screenshots, GIF demos, code examples or output to help show the changes working as expected.
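For reviewers tracing what this scheduled job actually executes, here is a rough sketch of the chain, assuming the `init-db` Makefile target wraps the `etl initialize_database` entry point shown earlier in this series (the exact Makefile recipe may differ):

```bash
# The scheduled job invokes the Makefile target...
make init-db
# ...which is assumed to boil down to the CLI entry point from this series:
poetry run analytics etl initialize_database
# Because the job runs daily against the same database, initialization needs to
# be idempotent -- effectively a no-op when the etl schema already exists.
```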
---
 infra/analytics/app-config/env-config/scheduled_jobs.tf | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/infra/analytics/app-config/env-config/scheduled_jobs.tf b/infra/analytics/app-config/env-config/scheduled_jobs.tf
index b6d098ffac..80ac5a7a99 100644
--- a/infra/analytics/app-config/env-config/scheduled_jobs.tf
+++ b/infra/analytics/app-config/env-config/scheduled_jobs.tf
@@ -11,5 +11,10 @@ locals {
       schedule_expression = "rate(1 days)"
       state = "ENABLED"
     }
+    init-etldb = {
+      task_command = ["make", "init-db"]
+      schedule_expression = "rate(1 days)"
+      state = "ENABLED"
+    }
   }
 }

From 4b5f330cf6c2c2a39bee2f1a9918e5280ace5645 Mon Sep 17 00:00:00 2001
From: "kai [they]"
Date: Thu, 7 Nov 2024 12:23:13 -0800
Subject: [PATCH 13/13] [no ticket] Fix infinite state locks (#2779)

### Time to review: __1 min__

## Context for reviewers

Platform's assertion is this: whenever a deploy job fails for any reason, the deploy is canceled, which locks the other 3 jobs. Those 3 jobs remain locked indefinitely. On the next deploy, every job but one is locked; those 3 jobs fail because they were locked previously, which causes the first job to be canceled, and thus all 4 jobs are locked. It's an avalanche effect: whenever 1 deploy fails, all 4 fail from that point onwards.
---
 .github/workflows/cd-analytics-infra.yml | 1 +
 .github/workflows/cd-analytics.yml | 1 +
 .github/workflows/cd-api-infra.yml | 1 +
 .github/workflows/cd-api.yml | 1 +
 .github/workflows/cd-frontend-infra.yml | 1 +
 .github/workflows/cd-frontend.yml | 1 +
 6 files changed, 6 insertions(+)

diff --git a/.github/workflows/cd-analytics-infra.yml b/.github/workflows/cd-analytics-infra.yml
index 14e4f28c8c..209eb5171a 100644
--- a/.github/workflows/cd-analytics-infra.yml
+++ b/.github/workflows/cd-analytics-infra.yml
@@ -39,6 +39,7 @@ jobs:
     name: Deploy Infrastructure
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         directory: ["database", "service"]
         envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || fromJSON('["dev", "staging"]') }} # deploy prod on releases, otherwise deploy staging and dev

diff --git a/.github/workflows/cd-analytics.yml b/.github/workflows/cd-analytics.yml
index a9a2bc36f1..6b23c9eaa0 100644
--- a/.github/workflows/cd-analytics.yml
+++ b/.github/workflows/cd-analytics.yml
@@ -33,6 +33,7 @@ jobs:
     uses: ./.github/workflows/deploy.yml
     strategy:
       max-parallel: 1
+      fail-fast: false
       matrix:
         envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || github.ref_name == 'main' && fromJSON('["dev", "staging"]') || fromJSON('["dev"]') }}
     with:

diff --git a/.github/workflows/cd-api-infra.yml b/.github/workflows/cd-api-infra.yml
index 2b1b83c0ea..c02677a2cc 100644
--- a/.github/workflows/cd-api-infra.yml
+++ b/.github/workflows/cd-api-infra.yml
@@ -38,6 +38,7 @@ jobs:
     name: Deploy Infrastructure
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         directory: ["database", "service"]
         envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || fromJSON('["dev", "staging"]') }} # deploy prod on releases, otherwise deploy staging and dev

diff --git a/.github/workflows/cd-api.yml b/.github/workflows/cd-api.yml
index e61f5421c2..a67839e326 100644
--- a/.github/workflows/cd-api.yml
+++ b/.github/workflows/cd-api.yml
@@ -32,6 +32,7 @@ jobs:
     uses: ./.github/workflows/deploy.yml
     strategy:
       max-parallel: 1
+      fail-fast: false
       matrix:
         envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || github.ref_name == 'main' && fromJSON('["dev", "staging"]') || fromJSON('["dev"]') }}
     with:

diff --git
a/.github/workflows/cd-frontend-infra.yml b/.github/workflows/cd-frontend-infra.yml index a0dc4c9b7d..312e76424f 100644 --- a/.github/workflows/cd-frontend-infra.yml +++ b/.github/workflows/cd-frontend-infra.yml @@ -38,6 +38,7 @@ jobs: name: Deploy Infrastructure runs-on: ubuntu-latest strategy: + fail-fast: false matrix: directory: ["service"] envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || fromJSON('["dev", "staging"]') }} # deploy prod on releases, otherwise deploy staging and dev diff --git a/.github/workflows/cd-frontend.yml b/.github/workflows/cd-frontend.yml index 101b80c160..9bc3a7ecc6 100644 --- a/.github/workflows/cd-frontend.yml +++ b/.github/workflows/cd-frontend.yml @@ -32,6 +32,7 @@ jobs: uses: ./.github/workflows/deploy.yml strategy: max-parallel: 1 + fail-fast: false matrix: envs: ${{ github.event_name == 'release' && fromJSON('["prod"]') || github.ref_name == 'main' && fromJSON('["dev", "staging"]') || fromJSON('["dev"]') }} with:
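
A note on the mechanism, for readers less familiar with GitHub Actions: `strategy.fail-fast` defaults to `true`, meaning one failed matrix job cancels its in-progress siblings, which is what left their Terraform state locks held. A minimal sketch of the opt-out these patches apply (the job, matrix values, and step below are illustrative, not from this repo's workflows):

```yaml
# Illustrative workflow, not part of this patch series.
jobs:
  deploy:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # let sibling matrix jobs run to completion even if one fails,
                       # so a failure can't cancel a sibling mid-deploy and strand its lock
      matrix:
        envs: ["dev", "staging"]
    steps:
      - run: echo "deploying to ${{ matrix.envs }}"
```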