diff --git a/analytics/Dockerfile b/analytics/Dockerfile index 2bbd90838..6874120af 100644 --- a/analytics/Dockerfile +++ b/analytics/Dockerfile @@ -19,6 +19,7 @@ RUN apt-get update \ libpq-dev \ postgresql \ wget \ + jq \ # Reduce the image size by clear apt cached lists # Complies with https://github.com/codacy/codacy-hadolint/blob/master/codacy-hadolint/docs/description/DL3009.md && rm -fr /var/lib/apt/lists/* \ diff --git a/analytics/Makefile b/analytics/Makefile index 1b22a9452..040713929 100644 --- a/analytics/Makefile +++ b/analytics/Makefile @@ -10,7 +10,11 @@ OUTPUT_DIR ?= data SPRINT_FILE ?= $(OUTPUT_DIR)/sprint-data.json ROADMAP_FILE ?= $(OUTPUT_DIR)/roadmap-data.json ISSUE_FILE ?= $(OUTPUT_DIR)/issue-data.json +DELIVERY_FILE ?= $(OUTPUT_DIR)/delivery-data.json SPRINT ?= @current +# Names of the points and sprint fields in the GitHub project +POINTS_FIELD ?= Points +SPRINT_FIELD ?= Sprint UNIT ?= points ACTION ?= show-results MIN_TEST_COVERAGE ?= 80 @@ -151,9 +155,7 @@ sprint-data-export: gh-db-data-import: @echo "=> Importing sprint data to the database" @echo "=====================================================" - $(POETRY) analytics import db_import \ - --sprint-file $(SPRINT_FILE) \ - --issue-file $(ISSUE_FILE) + $(POETRY) analytics import db_import --delivery-file $(DELIVERY_FILE) roadmap-data-export: @echo "=> Exporting project data from the product roadmap" @@ -163,6 +165,17 @@ roadmap-data-export: --project $(ROADMAP_PROJECT) \ --output-file $(ROADMAP_FILE) +delivery-data-export: + @echo "=> Exporting GitHub issue and sprint data for delivery metrics" + @echo "=====================================================" + $(POETRY) analytics export gh_delivery_data \ + --owner $(ORG) \ + --sprint-project $(SPRINT_PROJECT) \ + --roadmap-project $(ROADMAP_PROJECT) \ + --output-file $(DELIVERY_FILE) \ + --points-field "$(POINTS_FIELD)" \ + --sprint-field "$(SPRINT_FIELD)" + issue-data-export: @echo "=> Exporting issue data from the repository" @echo "=====================================================" @@ -171,7 +184,7 @@ issue-data-export: --repo $(REPO) \ --output-file $(ISSUE_FILE) -gh-data-export: sprint-data-export issue-data-export roadmap-data-export +gh-data-export: sprint-data-export issue-data-export roadmap-data-export delivery-data-export sprint-burndown: @echo "=> Running sprint burndown report" @@ -200,4 +213,3 @@ percent-complete: sprint-reports: sprint-burndown percent-complete sprint-reports-with-latest-data: gh-data-export sprint-reports - diff --git a/analytics/pyproject.toml b/analytics/pyproject.toml index e87256222..cb53199be 100644 --- a/analytics/pyproject.toml +++ b/analytics/pyproject.toml @@ -51,6 +51,7 @@ disable = [ "R0913", # too-many-arguments "R0902", # too-many-instance-attributes "R0903", # too-few-public-methods + "W1514", # unspecified-encoding ] [tool.ruff] diff --git a/analytics/src/analytics/cli.py b/analytics/src/analytics/cli.py index a4a0a2935..e019a7e88 100644 --- a/analytics/src/analytics/cli.py +++ b/analytics/src/analytics/cli.py @@ -1,6 +1,7 @@ # pylint: disable=C0415 """Expose a series of CLI entrypoints for the analytics package.""" import logging +import logging.config from pathlib import Path from typing import Annotated, Optional @@ -9,6 +10,7 @@ from sqlalchemy import text from analytics.datasets.deliverable_tasks import DeliverableTasks +from analytics.datasets.issues import GitHubIssues from analytics.datasets.sprint_board import SprintBoard from analytics.integrations import db, github, slack from analytics.metrics.base 
import BaseMetric, Unit @@ -26,9 +28,11 @@ ROADMAP_FILE_ARG = typer.Option(help="Path to file with exported roadmap data") OUTPUT_FILE_ARG = typer.Option(help="Path to file where exported data will be saved") OUTPUT_DIR_ARG = typer.Option(help="Path to directory where output files will be saved") +TMP_DIR_ARG = typer.Option(help="Path to directory where intermediate files will be saved") OWNER_ARG = typer.Option(help="GitHub handle of the repo or project owner") REPO_ARG = typer.Option(help="Name of the GitHub repo") PROJECT_ARG = typer.Option(help="Number of the GitHub project") +FIELD_ARG = typer.Option(help="Name of the GitHub project field") SPRINT_ARG = typer.Option(help="Name of the sprint for which we're calculating burndown") UNIT_ARG = typer.Option(help="Whether to calculate completion by 'points' or 'tickets'") SHOW_RESULTS_ARG = typer.Option(help="Display a chart of the results in a browser") @@ -55,6 +59,11 @@ def callback() -> None: """Analyze data about the Simpler.Grants.gov project.""" +# =========================================================== +# Export commands +# =========================================================== + + @export_app.command(name="gh_project_data") def export_github_project_data( owner: Annotated[str, OWNER_ARG], @@ -75,6 +84,53 @@ def export_github_issue_data( github.export_issue_data(owner, repo, output_file) +@export_app.command(name="gh_delivery_data") +def export_github_data( + owner: Annotated[str, OWNER_ARG], + sprint_project: Annotated[int, PROJECT_ARG], + roadmap_project: Annotated[int, PROJECT_ARG], + output_file: Annotated[str, OUTPUT_FILE_ARG], + sprint_field: Annotated[str, FIELD_ARG] = "Sprint", + points_field: Annotated[str, FIELD_ARG] = "Points", + tmp_dir: Annotated[str, TMP_DIR_ARG] = "data", +) -> None: + """Export and flatten metadata about GitHub issues used for delivery metrics.""" + # Specify path to intermediate files + sprint_file = Path(tmp_dir) / "sprint-data.json" + roadmap_file = Path(tmp_dir) / "roadmap-data.json" + + # Export sprint and roadmap data + logger.info("Exporting roadmap data") + github.export_roadmap_data( + owner=owner, + project=roadmap_project, + quad_field="Quad", + pillar_field="Pillar", + output_file=str(roadmap_file), + ) + logger.info("Exporting sprint data") + github.export_sprint_data( + owner=owner, + project=sprint_project, + sprint_field=sprint_field, + points_field=points_field, + output_file=str(sprint_file), + ) + + # Load and flatten the exported data into the GitHubIssues dataset + logger.info("Transforming exported data") + issues = GitHubIssues.load_from_json_files( + sprint_file=str(sprint_file), + roadmap_file=str(roadmap_file), + ) + issues.to_json(output_file) + + +# =========================================================== +# Calculate commands +# =========================================================== + + @metrics_app.command(name="sprint_burndown") def calculate_sprint_burndown( sprint_file: Annotated[str, SPRINT_FILE_ARG], @@ -129,55 +185,6 @@ def calculate_sprint_burnup( ) -@import_app.command(name="test_connection") -def test_connection() -> None: - """Test function that ensures the DB connection works.""" - engine = db.get_db() - # connection method from sqlalchemy - connection = engine.connect() - - # Test INSERT INTO action - result = connection.execute( - text( - "INSERT INTO audit_log (topic,timestamp, end_timestamp, user_id, details)" - "VALUES('test','2024-06-11 10:41:15','2024-06-11 10:54:15',87654,'test from command');", - ), - ) - # Test SELECT action - result =
connection.execute(text("SELECT * FROM audit_log WHERE user_id=87654;")) - for row in result: - print(row) - # commits the transaction to the db - connection.commit() - result.close() - - -@import_app.command(name="db_import") -def export_json_to_database( - sprint_file: Annotated[str, SPRINT_FILE_ARG], - issue_file: Annotated[str, ISSUE_FILE_ARG], -) -> None: - """Import JSON data to the database.""" - logger.info("Beginning import") - - # Get the database engine and establish a connection - engine = db.get_db() - - # Load data from the sprint board - sprint_data = SprintBoard.load_from_json_files( - sprint_file=sprint_file, - issue_file=issue_file, - ) - - sprint_data.to_sql( - output_table="github_project_data", - engine=engine, - replace_table=True, - ) - rows = len(sprint_data.to_dict()) - logger.info("Number of rows in table: %s", rows) - - @metrics_app.command(name="deliverable_percent_complete") def calculate_deliverable_percent_complete( sprint_file: Annotated[str, SPRINT_FILE_ARG], @@ -246,3 +253,51 @@ def show_and_or_post_results( channel_id=settings.reporting_channel_id, output_dir=Path(output_dir), ) + + +# =========================================================== +# Import commands +# =========================================================== + + +@import_app.command(name="test_connection") +def test_connection() -> None: + """Test function that ensures the DB connection works.""" + engine = db.get_db() + # connection method from sqlalchemy + connection = engine.connect() + + # Test INSERT INTO action + result = connection.execute( + text( + "INSERT INTO audit_log (topic,timestamp, end_timestamp, user_id, details)" + "VALUES('test','2024-06-11 10:41:15','2024-06-11 10:54:15',87654,'test from command');", + ), + ) + # Test SELECT action + result = connection.execute(text("SELECT * FROM audit_log WHERE user_id=87654;")) + for row in result: + print(row) + # commits the transaction to the db + connection.commit() + result.close() + + +@import_app.command(name="db_import") +def export_json_to_database(delivery_file: Annotated[str, ISSUE_FILE_ARG]) -> None: + """Import JSON data to the database.""" + logger.info("Beginning import") + + # Get the database engine and establish a connection + engine = db.get_db() + + # Load data from the sprint board + issues = GitHubIssues.from_json(delivery_file) + + issues.to_sql( + output_table="github_project_data", + engine=engine, + replace_table=True, + ) + rows = len(issues.to_dict()) + logger.info("Number of rows in table: %s", rows) diff --git a/analytics/src/analytics/datasets/base.py b/analytics/src/analytics/datasets/base.py index bd9bf7d6b..f115a4d1e 100644 --- a/analytics/src/analytics/datasets/base.py +++ b/analytics/src/analytics/datasets/base.py @@ -4,9 +4,12 @@ from pathlib import Path from typing import Self +import numpy as np import pandas as pd from sqlalchemy import Engine +from analytics.datasets.utils import dump_to_json, load_json_file + class BaseDataset: """Base class for all datasets.""" @@ -25,6 +28,12 @@ def from_dict(cls, data: list[dict]) -> Self: """Load the dataset from a list of python dictionaries representing records.""" return cls(df=pd.DataFrame(data)) + @classmethod + def from_json(cls, file_path: str | Path) -> Self: + """Load the dataset from a JSON file.""" + data = load_json_file(str(file_path)) + return cls(df=pd.DataFrame(data)) + def to_sql( self, output_table: str, @@ -112,4 +121,8 @@ def to_csv( def to_dict(self) -> list[dict]: """Export the dataset to a list of python dictionaries representing 
records.""" - return self.df.to_dict(orient="records") + return self.df.replace([np.nan], [None], regex=False).to_dict(orient="records") + + def to_json(self, output_file: str) -> None: + """Dump dataset to JSON.""" + return dump_to_json(output_file, self.to_dict()) diff --git a/analytics/src/analytics/datasets/issues.py b/analytics/src/analytics/datasets/issues.py new file mode 100644 index 000000000..b726fed58 --- /dev/null +++ b/analytics/src/analytics/datasets/issues.py @@ -0,0 +1,204 @@ +"""Transform exported issue data into a flattened list.""" + +import logging +from enum import Enum +from typing import Self + +from pandas import DataFrame +from pydantic import BaseModel, Field, ValidationError + +from analytics.datasets.base import BaseDataset +from analytics.datasets.utils import load_json_file + +logger = logging.getLogger(__name__) + +# =============================================================== +# Dataset schema and enums +# =============================================================== + + +class IssueType(Enum): + """Supported issue types.""" + + BUG = "Bug" + TASK = "Task" + EPIC = "Epic" + ENHANCEMENT = "Enhancement" + DELIVERABLE = "Deliverable" + NONE = None + + +class IssueMetadata(BaseModel): + """Stores information about issue type and parent (if applicable).""" + + # Common metadata -- attributes about the issue common to both projects + issue_title: str + issue_url: str + issue_parent: str | None + issue_type: str | None + issue_is_closed: bool + issue_opened_at: str + issue_closed_at: str | None + # Sprint metadata -- custom fields specific to the sprint board project + issue_points: int | float | None = Field(default=None) + issue_status: str | None = Field(default=None) + sprint_id: str | None = Field(default=None) + sprint_name: str | None = Field(default=None) + sprint_start: str | None = Field(default=None) + sprint_length: int | None = Field(default=None) + sprint_end: str | None = Field(default=None) + # Roadmap metadata -- custom fields specific to the roadmap project + quad_id: str | None = Field(default=None) + quad_name: str | None = Field(default=None) + quad_start: str | None = Field(default=None) + quad_length: int | None = Field(default=None) + quad_end: str | None = Field(default=None) + deliverable_pillar: str | None = Field(default=None) + # Parent metadata -- attributes about parent issues populated via lookup + deliverable_url: str | None = Field(default=None) + deliverable_title: str | None = Field(default=None) + epic_url: str | None = Field(default=None) + epic_title: str | None = Field(default=None) + + +# =============================================================== +# Dataset class +# =============================================================== + + +class GitHubIssues(BaseDataset): + """GitHub issues with metadata about their parents (Epics and Deliverables) and sprints.""" + + def __init__(self, df: DataFrame) -> None: + """Initialize the GitHub Issues dataset.""" + self.opened_col = "issue_created_at" + self.closed_col = "issue_closed_at" + self.sprint_col = "sprint_name" + self.sprint_start_col = "sprint_start" + self.sprint_end_col = "sprint_end" + super().__init__(df) + + @classmethod + def load_from_json_files( + cls, + sprint_file: str = "data/sprint-data.json", + roadmap_file: str = "data/roadmap-data.json", + ) -> Self: + """Load GitHubIssues dataset from input json files.""" + # Load sprint and roadmap data + sprint_data_in = load_json_file(sprint_file) + roadmap_data_in = load_json_file(roadmap_file) + # Populate a 
lookup table with this data + lookup: dict = {} + lookup = populate_issue_lookup_table(lookup, roadmap_data_in) + lookup = populate_issue_lookup_table(lookup, sprint_data_in) + # Flatten and write issue level data to output file + issues = flatten_issue_data(lookup) + return cls(DataFrame(data=issues)) + + +# =============================================================== +# Transformation helper functions +# =============================================================== + + +def populate_issue_lookup_table( + lookup: dict[str, IssueMetadata], + issues: list[dict], +) -> dict[str, IssueMetadata]: + """Populate a lookup table that maps issue URLs to their issue type and parent.""" + for i, issue in enumerate(issues): + try: + entry = IssueMetadata.model_validate(issue) + except ValidationError as err: # noqa: PERF203 + logger.error("Error parsing row %d, skipped.", i) # noqa: TRY400 + logger.debug("Error: %s", err) + continue + lookup[entry.issue_url] = entry + return lookup + + +def get_parent_with_type( + child_url: str, + lookup: dict[str, IssueMetadata], + type_wanted: IssueType, +) -> IssueMetadata | None: + """ + Traverse the lookup table to find an issue's parent with a specific type. + + This is useful if we have multiple nested issues, and we want to find the + top level deliverable or epic that a given task or bug is related to. + """ + # Get the initial child issue and its parent (if applicable) from the URL + child = lookup.get(child_url) + if not child: + err = f"Lookup doesn't contain issue with url: {child_url}" + raise ValueError(err) + if not child.issue_parent: + return None + + # Travel up the issue hierarchy until we: + # - Find a parent issue with the desired type + # - Get to an issue without a parent + # - Have traversed 5 issues (breaks out of issue cycles) + max_traversal = 5 + parent_url = child.issue_parent + for _ in range(max_traversal): + parent = lookup.get(parent_url) + # If no parent is found, return None + if not parent: + return None + # If the parent matches the desired type, return it + if IssueType(parent.issue_type) == type_wanted: + return parent + # If the parent doesn't have its own parent, return None + if not parent.issue_parent: + return None + # Otherwise update the parent_url to the "grandparent" and continue + parent_url = parent.issue_parent + + # Return None if no matching parent was found within the traversal limit + return None + + +def flatten_issue_data(lookup: dict[str, IssueMetadata]) -> list[dict]: + """Flatten issue data and inherit data from parent epic and deliverable.""" + result: list[dict] = [] + for issue in lookup.values(): + # If the issue is a deliverable or epic, move to the next one + if IssueType(issue.issue_type) in [IssueType.DELIVERABLE, IssueType.EPIC]: + continue + + # Get the parent deliverable, if the issue has one + deliverable = get_parent_with_type( + child_url=issue.issue_url, + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) + if deliverable: + # Set deliverable metadata + issue.deliverable_title = deliverable.issue_title + issue.deliverable_url = deliverable.issue_url + issue.deliverable_pillar = deliverable.deliverable_pillar + # Set quad metadata + issue.quad_id = deliverable.quad_id + issue.quad_name = deliverable.quad_name + issue.quad_start = deliverable.quad_start + issue.quad_end = deliverable.quad_end + issue.quad_length = deliverable.quad_length + + # Get the parent epic, if the issue has one + epic = get_parent_with_type( + child_url=issue.issue_url, + lookup=lookup, + type_wanted=IssueType.EPIC, + ) + if epic:
issue.epic_title = epic.issue_title + issue.epic_url = epic.issue_url + + # Add the issue to the results + result.append(issue.__dict__) + + # Return the results + return result diff --git a/analytics/src/analytics/datasets/utils.py b/analytics/src/analytics/datasets/utils.py index 878e415c3..a7e641a08 100644 --- a/analytics/src/analytics/datasets/utils.py +++ b/analytics/src/analytics/datasets/utils.py @@ -53,3 +53,17 @@ def load_json_data_as_df( # strip off the timestamp portion of the date df[col] = pd.to_datetime(df[col]).dt.floor("d") return df + + +def load_json_file(path: str) -> list[dict]: + """Load contents of a JSON file into a dictionary.""" + with open(path) as f: + return json.load(f) + + +def dump_to_json(path: str, data: dict | list[dict]) -> None: + """Write a dictionary or list of dicts to a json file.""" + with open(path, "w") as f: + # Uses ensure_ascii=False to preserve emoji characters in output + # https://stackoverflow.com/a/52206290/7338319 + json.dump(data, f, indent=2, ensure_ascii=False) diff --git a/analytics/src/analytics/integrations/github.py b/analytics/src/analytics/integrations/github.py deleted file mode 100644 index 055978f33..000000000 --- a/analytics/src/analytics/integrations/github.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Integrate with GitHub to read and write data from projects and repos.""" - -import shlex -import subprocess -from pathlib import Path - -# Set the max number of records to return with CLI commands to 10,000 -# NOTE: GitHub exports data in batches of 100 so exporting 10k issues could take over a minute -# TODO(@widal001): 2023-11-29 - Switch to incremental export pattern -# related issue: https://github.com/HHS/simpler-grants-gov/issues/775 -MAX_RECORDS = 10000 - - -def pipe_command_output_to_file(command: str, output_file: str) -> None: - """Write the command line output to a file.""" - # make sure the output file's directory exists - file_path = Path(output_file) - file_path.parent.mkdir(exist_ok=True, parents=True) - # invoke the command via a subprocess and write the output to a file - with open(output_file, "w", encoding="utf-8") as f: - subprocess.call(shlex.split(command), stdout=f) # noqa: S603 - - -def export_project_data(owner: str, project: int, output_file: str) -> None: - """Export and write GitHub project data to a JSON file.""" - print(f"Exporting project data from {owner}/{project} to {output_file}") - command = ( - f"gh project item-list {project} --format json --owner {owner} -L {MAX_RECORDS}" - ) - pipe_command_output_to_file(command, output_file) - - -def export_issue_data(owner: str, repo: str, output_file: str) -> None: - """Export and write GitHub issue data to a JSON file.""" - print(f"Exporting issue data from {owner}/{repo} to {output_file}") - command = ( - f"gh issue list --json number,createdAt,closedAt,labels,title " - f"-R {owner}/{repo} -L {MAX_RECORDS} --state all" - ) - pipe_command_output_to_file(command, output_file) diff --git a/analytics/src/analytics/integrations/github/__init__.py b/analytics/src/analytics/integrations/github/__init__.py new file mode 100644 index 000000000..aa2f28b5a --- /dev/null +++ b/analytics/src/analytics/integrations/github/__init__.py @@ -0,0 +1,15 @@ +"""Export data from GitHub.""" + +__all__ = [ + "export_issue_data", + "export_project_data", + "export_roadmap_data", + "export_sprint_data", +] + +from analytics.integrations.github.main import ( + export_issue_data, + export_project_data, + export_roadmap_data, + export_sprint_data, +) diff --git 
a/analytics/src/analytics/integrations/github/getRoadmapData.graphql b/analytics/src/analytics/integrations/github/getRoadmapData.graphql new file mode 100644 index 000000000..866753215 --- /dev/null +++ b/analytics/src/analytics/integrations/github/getRoadmapData.graphql @@ -0,0 +1,64 @@ +query ( + $endCursor: String + $login: String! + $project: Int! + $batch: Int! + $quadField: String = "Quad" + $pillarField: String = "Pillar" +) { + # get the project by the organization login and project number + organization(login: $login) { + projectV2(number: $project) { + items(first: $batch, after: $endCursor) { + # allows us to use --paginate in the gh api call + pageInfo { + hasNextPage + endCursor + } + # fetch details per item in the list + nodes { + ... on ProjectV2Item { + content { + ...issueContent + } + quad: fieldValueByName(name: $quadField) { + ...iterationContent + } + pillar: fieldValueByName(name: $pillarField) { + ...singleSelectContent + } + } + } + } + } + } +} + +fragment issueContent on Issue { + title + url + issueType { + name + } + # information about issue open/closed status + closed + createdAt + closedAt + # details about the parent issue + parent { + title + url + } +} + +fragment iterationContent on ProjectV2ItemFieldIterationValue { + iterationId + title + startDate + duration +} + +fragment singleSelectContent on ProjectV2ItemFieldSingleSelectValue { + optionId + name +} diff --git a/analytics/src/analytics/integrations/github/getSprintData.graphql b/analytics/src/analytics/integrations/github/getSprintData.graphql new file mode 100644 index 000000000..b42d88601 --- /dev/null +++ b/analytics/src/analytics/integrations/github/getSprintData.graphql @@ -0,0 +1,69 @@ +query ( + $endCursor: String + $login: String! + $project: Int! + $batch: Int! + $sprintField: String = "Sprint" + $pointsField: String = "Points" +) { + # get the project by the organization login and project number + organization(login: $login) { + projectV2(number: $project) { + items(first: $batch, after: $endCursor) { + # allows us to use --paginate in the gh api call + pageInfo { + hasNextPage + endCursor + } + # fetch details per item in the list + nodes { + ... on ProjectV2Item { + content { + ...issueContent + } + sprint: fieldValueByName(name: $sprintField) { + ...iterationContent + } + points: fieldValueByName(name: $pointsField) { + ... 
on ProjectV2ItemFieldNumberValue { + number + } + } + status: fieldValueByName(name: "Status") { + ...singleSelectContent + } + } + } + } + } + } +} + +fragment issueContent on Issue { + title + url + issueType { + name + } + # information about issue open/closed status + closed + createdAt + closedAt + # details about the parent issue + parent { + title + url + } +} + +fragment iterationContent on ProjectV2ItemFieldIterationValue { + iterationId + title + startDate + duration +} + +fragment singleSelectContent on ProjectV2ItemFieldSingleSelectValue { + optionId + name +} diff --git a/analytics/src/analytics/integrations/github/main.py b/analytics/src/analytics/integrations/github/main.py new file mode 100644 index 000000000..3d1f71bcd --- /dev/null +++ b/analytics/src/analytics/integrations/github/main.py @@ -0,0 +1,182 @@ +"""Integrate with GitHub to read and write data from projects and repos.""" + +import shlex +import subprocess +from pathlib import Path + +PARENT_DIR = Path(__file__).resolve().parent +# Set the max number of records to return with CLI commands to 10,000 +# NOTE: GitHub exports data in batches of 100 so exporting 10k issues could take over a minute +# TODO(@widal001): 2023-11-29 - Switch to incremental export pattern +# related issue: https://github.com/HHS/simpler-grants-gov/issues/775 +MAX_RECORDS = 10000 + + +def pipe_command_output_to_file(command: str, output_file: str) -> None: + """Write the command line output to a file.""" + # make sure the output file's directory exists + file_path = Path(output_file) + file_path.parent.mkdir(exist_ok=True, parents=True) + # invoke the command via a subprocess and write the output to a file + with open(output_file, "w", encoding="utf-8") as f: + subprocess.call(shlex.split(command), stdout=f) # noqa: S603 + + +def export_project_data(owner: str, project: int, output_file: str) -> None: + """Export and write GitHub project data to a JSON file.""" + print(f"Exporting project data from {owner}/{project} to {output_file}") + command = ( + f"gh project item-list {project} --format json --owner {owner} -L {MAX_RECORDS}" + ) + pipe_command_output_to_file(command, output_file) + + +def export_issue_data(owner: str, repo: str, output_file: str) -> None: + """Export and write GitHub issue data to a JSON file.""" + print(f"Exporting issue data from {owner}/{repo} to {output_file}") + command = ( + f"gh issue list --json number,createdAt,closedAt,labels,title " + f"-R {owner}/{repo} -L {MAX_RECORDS} --state all" + ) + pipe_command_output_to_file(command, output_file) + + +def export_sprint_data( + owner: str, + project: int, + sprint_field: str, + points_field: str, + output_file: str, +) -> None: + """ + Export the issue and project data from a Sprint Board. 
+ + TODO(widal001): 2024-10-25 - Replace this with a direct call to the GraphQL API + https://github.com/HHS/simpler-grants-gov/issues/2590 + """ + # Get the path script and the GraphQL query + script = PARENT_DIR / "make-graphql-query.sh" + query_path = PARENT_DIR / "getSprintData.graphql" + # Load the query + with open(query_path) as f: + query = f.read() + # Create the post-pagination transform jq + jq = """ +[ + # iterate through each project item + .[] | + # reformat each item + { + issue_title: .content.title, + issue_url: .content.url, + issue_parent: .content.parent.url, + issue_type: .content.issueType.name, + issue_is_closed: .content.closed, + issue_opened_at: .content.createdAt, + issue_closed_at: .content.closedAt, + issue_points: .points.number, + sprint_id: .sprint.iterationId, + sprint_name: .sprint.title, + sprint_start: .sprint.startDate, + sprint_length: .sprint.duration, + sprint_end: ( + if .sprint.startDate == null + then null + else ( + (.sprint.startDate | strptime(\"%Y-%m-%d\") | mktime) + + (.sprint.duration * 86400) | strftime(\"%Y-%m-%d\") + ) + end + ), + } | + # filter for task-level issues + select(.issue_type != \"Deliverable\") +] +""" + # Make the command + # fmt: off + command: list[str] = [ + str(script), + "--batch", "100", + "--field", f"login={owner}", + "--field", f"project={project}", + "--field", f"sprintField='{sprint_field}'", + "--field", f"pointsField='{points_field}'", + "--query", f"{query}", + "--paginate-jq", "'.data.organization.projectV2.items.nodes'", + "--transform-jq", jq, + ] + # fmt: on + # invoke the command via a subprocess and write the output to a file + with open(output_file, "w", encoding="utf-8") as f: + subprocess.call(command, stdout=f) # noqa: S603 + + +def export_roadmap_data( + owner: str, + project: int, + quad_field: str, + pillar_field: str, + output_file: str, +) -> None: + """ + Export the issue and project data from the Product Roadmap board.
+ + TODO(widal001): 2024-10-25 - Replace this with a direct call to the GraphQL API + https://github.com/HHS/simpler-grants-gov/issues/2590 + """ + # Get the path script and the GraphQL query + script = PARENT_DIR / "make-graphql-query.sh" + query_path = PARENT_DIR / "getRoadmapData.graphql" + # Load the query + with open(query_path) as f: + query = f.read() + # Create the post-pagination transform jq + jq = """ +[ + # iterate through each project item + .[] | + # reformat each item + { + issue_title: .content.title, + issue_url: .content.url, + issue_parent: .content.parent.url, + issue_type: .content.issueType.name, + issue_is_closed: .content.closed, + issue_opened_at: .content.createdAt, + issue_closed_at: .content.closedAt, + deliverable_pillar: .pillar.name, + quad_id: .quad.iterationId, + quad_name: .quad.title, + quad_start: .quad.startDate, + quad_length: .quad.duration, + quad_end: ( + if .quad.startDate == null + then null + else ( + (.quad.startDate | strptime(\"%Y-%m-%d\") | mktime) + + (.quad.duration * 86400) | strftime(\"%Y-%m-%d\") + ) + end + ), + } + +] +""" + # Make the command + # fmt: off + command: list[str] = [ + str(script), + "--batch", "100", + "--field", f"login={owner}", + "--field", f"project={project}", + "--field", f"quadField='{quad_field}'", + "--field", f"pillarField='{pillar_field}'", + "--query", f"{query}", + "--paginate-jq", "'.data.organization.projectV2.items.nodes'", + "--transform-jq", jq, + ] + # fmt: on + # invoke the command via a subprocess and write the output to a file + with open(output_file, "w", encoding="utf-8") as f: + subprocess.call(command, stdout=f) # noqa: S603 diff --git a/analytics/src/analytics/integrations/github/make-graphql-query.sh b/analytics/src/analytics/integrations/github/make-graphql-query.sh new file mode 100755 index 000000000..b51eabd0e --- /dev/null +++ b/analytics/src/analytics/integrations/github/make-graphql-query.sh @@ -0,0 +1,81 @@ +#! 
/bin/bash +# Make a paginated GraphQL query with the gh CLI and apply a jq transform to the combined results +# Usage: +# ./make-graphql-query.sh \ +# --batch 100 \ +# --field login=HHS \ +# --field project=13 \ +# --query "$(cat getSprintData.graphql)" \ +# --paginate-jq '.data.organization.projectV2.items.nodes' \ +# --transform-jq '.' + + +# ####################################################### +# Parse command line args with format `--option arg` +# ####################################################### + +batch=100 +fields=() +while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + echo "Running in dry run mode" + dry_run=YES + shift # past argument + ;; + --batch) + batch="$2" + shift 2 # past argument and value + ;; + --query) + query="$2" + shift 2 # past argument and value + ;; + # jq query to include in each API request during pagination + --paginate-jq) + paginate_jq="$2" + shift 2 # past argument and value + ;; + # jq query to run after all pages have been retrieved + --transform-jq) + transform_jq="$2" + shift 2 # past argument and value + ;; + --field) + # Append the field flag and its value to the list of fields + fields+=("--field $2") + shift 2 # past argument and value + ;; + -*|--*) + echo "Unknown option $1" + exit 1 + ;; + *) + positional_args+=("$1") # save positional arg + shift # past argument + ;; + esac +done + +# ####################################################### +# Execute a graphql query +# ####################################################### + +# Build the gh api graphql command with dynamic fields +command="gh api graphql \\ + --header 'GraphQL-Features:sub_issues' \\ + --header 'GraphQL-Features:issue_types' \\ + --paginate \\ + --field batch=$batch" + +# Loop over fields and append them individually, ensuring correct formatting +for field in "${fields[@]}"; do + command+=" \\ + $field" +done + +command+=" \\ + -f query='$query' \\ + --jq '$paginate_jq' | jq --slurp 'add'" + +# Evaluate the assembled command and apply the post-pagination transform to the combined results +eval "$command" | jq "${transform_jq}" diff --git a/analytics/tests/conftest.py b/analytics/tests/conftest.py index dfc27bb71..4e8192e89 100644 --- a/analytics/tests/conftest.py +++ b/analytics/tests/conftest.py @@ -26,6 +26,16 @@ LABEL_10K = "deliverable: 10k ft" +def pytest_addoption(parser: pytest.Parser): + """Add a command line flag to collect tests that require a slack token.""" + parser.addoption( + "--slack-token-set", + action="store_true", + default=False, + help="Run tests that require a slack token", + ) + + class MockSlackbot: """Create a mock slackbot issue for unit tests.""" diff --git a/analytics/tests/datasets/test_issues.py b/analytics/tests/datasets/test_issues.py new file mode 100644 index 000000000..76d61641e --- /dev/null +++ b/analytics/tests/datasets/test_issues.py @@ -0,0 +1,184 @@ +"""Tests the code in datasets/issues.py.""" + +from pathlib import Path + +from analytics.datasets.issues import ( + GitHubIssues, + IssueMetadata, + IssueType, + get_parent_with_type, +) +from analytics.datasets.utils import dump_to_json + + +def issue( + name: str, + kind: IssueType = IssueType.TASK, + parent: str | None = None, + points: int | None = None, + quad: str | None = None, + epic: str | None = None, + deliverable: str | None = None, +) -> IssueMetadata: + """Create a new issue.""" + return IssueMetadata( + issue_title=name, + issue_type=kind.value, + issue_url=name, + issue_is_closed=False, + issue_opened_at="2024-02-01", + issue_closed_at=None, + issue_parent=parent, + issue_points=points, + quad_name=quad, + epic_title=epic, + epic_url=epic, + deliverable_title=deliverable, + deliverable_url=deliverable, + )
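A note for reviewers on running the new tests locally (a sketch, not part of the diff: it assumes tests are invoked through Poetry from the analytics/ directory, matching the Makefile's conventions). The --slack-token-set flag registered in conftest.py above is what the reworked Slack test markers later in this diff check for, while the dataset tests below need no external credentials:

  # Run the new GitHubIssues dataset tests (no GitHub or Slack access required)
  poetry run pytest tests/datasets/test_issues.py

  # Opt in to the Slack integration tests once valid Slack credentials are configured
  poetry run pytest tests/integrations/test_slack.py --slack-token-set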
+ + +class TestGitHubIssues: + """Test the GitHubIssues.load_from_json_files() class method.""" + + def test_load_from_json_files(self, tmp_path: Path): + """Class method should return the correctly transformed data.""" + # Arrange - create dummy sprint data + sprint_file = tmp_path / "sprint-data.json" + sprint_data = [ + issue(name="task1", kind=IssueType.TASK, parent="epic1", points=2), + issue(name="task2", kind=IssueType.TASK, parent="epic2", points=1), + ] + roadmap_data = [i.model_dump() for i in sprint_data] + dump_to_json(str(sprint_file), roadmap_data) + # Act - create dummy roadmap data + roadmap_file = tmp_path / "roadmap-data.json" + roadmap_data = [ + issue(name="epic1", kind=IssueType.EPIC, parent="del1"), + issue(name="epic2", kind=IssueType.EPIC, parent="del2"), + issue(name="del1", kind=IssueType.DELIVERABLE, quad="quad1"), + ] + roadmap_data = [i.model_dump() for i in roadmap_data] + dump_to_json(str(roadmap_file), roadmap_data) + # Arrange + output_data = [ + issue( + name="task1", + points=2, + parent="epic1", + deliverable="del1", + quad="quad1", + epic="epic1", + ), + issue( + name="task2", + points=1, + parent="epic2", + deliverable=None, + quad=None, + epic="epic2", + ), + ] + wanted = [i.model_dump() for i in output_data] + # Act + got = GitHubIssues.load_from_json_files( + sprint_file=str(sprint_file), + roadmap_file=str(roadmap_file), + ) + # Assert + assert got.to_dict() == wanted + + +class TestGetParentWithType: + """Test the get_parent_with_type() method.""" + + def test_return_epic_that_is_direct_parent_of_issue(self): + """Return the correct epic for an issue that is one level down.""" + # Arrange + task = "task" + lookup = { + task: issue(name=task, kind=IssueType.TASK, parent="epic"), + "epic": issue(name=task, kind=IssueType.EPIC, parent=None), + } + wanted = lookup["epic"] + # Act + got = get_parent_with_type( + child_url=task, + lookup=lookup, + type_wanted=IssueType.EPIC, + ) + # Assert + assert got == wanted + + def test_return_correct_deliverable_that_is_grandparent_of_issue(self): + """Return the correct deliverable for an issue that is two levels down.""" + # Arrange + task = "task" + lookup = { + task: issue(name=task, kind=IssueType.TASK, parent="epic"), + "epic": issue(name=task, kind=IssueType.EPIC, parent="deliverable"), + "deliverable": issue(name=task, kind=IssueType.DELIVERABLE, parent=None), + } + wanted = lookup["deliverable"] + # Act + got = get_parent_with_type( + child_url=task, + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) + # Assert + assert got == wanted + + def test_return_none_if_issue_has_no_parent(self): + """Return None if the input issue has no parent.""" + # Arrange + task = "task" + lookup = { + task: issue(name=task, kind=IssueType.TASK, parent=None), + } + wanted = None + # Act + got = get_parent_with_type( + child_url=task, + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) + # Assert + assert got == wanted + + def test_return_none_if_parents_form_a_cycle(self): + """Return None if the issue hierarchy forms a cycle.""" + # Arrange + task = "task" + lookup = { + task: issue(name=task, kind=IssueType.TASK, parent="parent"), + "parent": issue(name=task, kind=IssueType.TASK, parent=task), + } + wanted = None + # Act + got = get_parent_with_type( + child_url=task, + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) + # Assert + assert got == wanted + + def test_return_none_if_deliverable_is_not_found_in_parents(self): + """Return None if the desired type (e.g. 
epic) isn't found in the list of parents.""" + # Arrange + task = "task" + lookup = { + task: issue(name=task, kind=IssueType.TASK, parent="parent"), + "parent": issue(name=task, kind=IssueType.TASK, parent="epic"), + "epic": issue(name=task, kind=IssueType.EPIC, parent=task), + } + wanted = None + # Act + got = get_parent_with_type( + child_url=task, + lookup=lookup, + type_wanted=IssueType.DELIVERABLE, + ) + # Assert + assert got == wanted diff --git a/analytics/tests/integrations/test_slack.py b/analytics/tests/integrations/test_slack.py index 6386476ca..4dfb06bc5 100644 --- a/analytics/tests/integrations/test_slack.py +++ b/analytics/tests/integrations/test_slack.py @@ -18,7 +18,13 @@ def mock_slackbot() -> SlackBot: return SlackBot(client=client) -@pytest.mark.skip(reason="requires Slack token") +slack_token_required = pytest.mark.skipif( + "not config.getoption('--slack-token-set')", + reason="requires Slack token", +) + + +@slack_token_required def test_fetch_slack_channels(slackbot: SlackBot): """The fetch_slack_channels() function should execute correctly.""" result = slackbot.fetch_slack_channel_info( @@ -28,7 +34,7 @@ def test_fetch_slack_channels(slackbot: SlackBot): assert result["channel"]["name"] == "z_bot-analytics-ci-test" -@pytest.mark.skip(reason="requires Slack token") +@slack_token_required def test_upload_files_to_slack_channel(slackbot: SlackBot): """The upload_files_to_slack_channel() function should execute correctly.""" # setup - create test files to upload
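Taken together, the Makefile and CLI changes above wire the new delivery-metrics flow together end to end. A rough local walkthrough (a sketch, not part of the diff: it assumes the Makefile's ORG, SPRINT_PROJECT, and ROADMAP_PROJECT variables, defined outside the hunks shown here, point at the right GitHub organization and project numbers, and that the gh CLI and jq are installed and authenticated):

  # From the analytics/ directory: export sprint and roadmap data and flatten it into data/delivery-data.json
  make delivery-data-export

  # Load the flattened issue records into the github_project_data table
  make gh-db-data-import

Because delivery-data-export is now part of gh-data-export, the existing sprint-reports-with-latest-data target picks up the new export automatically.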