feat(core): implement pull request collector (#2)
* chore(deps): add poetry configuration

* feat(environment): add github resolver

* style(ruff): updated ignored

* feat(output): implement

* chore(ci): add makefile and coverage runner

* test(output): remove engine tocsv

* feat(collector): implemented github

* style(ruff): evaluated and adjusted rules

* chore(make): add verbose testing

* chore(init): add

* style(ruff): enforce single quotes

* style(ruff): add missing type annotations

* feat(main): implement

* test(output): cover when timestamp is not set

* chore(make): add extra verbose pytest output

* chore(make): add all and clean

* chore(make): add quiet mode to pytest

* chore(workflows): add main python workflow

* refactor(token): extract token outside of cli parser
kiran94 authored May 4, 2023
1 parent 6f5848d commit 7ba796e
Showing 17 changed files with 3,484 additions and 23 deletions.
54 changes: 31 additions & 23 deletions .github/workflows/main.yml
@@ -7,28 +7,36 @@ on:
pull_request:

jobs:
# build:
# runs-on: ubuntu-latest
# timeout-minutes: 10
# steps:
# - uses: actions/checkout@v3
# - uses: actions/setup-python@v4
# with:
# python-version: '3.10'
#
# - name: Install Tools
# run: make install_tools
#
# - name: Install Dependencies
# run: |
# make export_requirements
# python -m pip install -r requirements.txt
#
# - name: Lint
# run: make lint
#
# - name: Test
# run: make test
build:
runs-on: ubuntu-latest
timeout-minutes: 10
strategy:
fail-fast: false
matrix:
python-version: ['3.10']
poetry-version: ['1.4.2']
steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- uses: abatilo/actions-poetry@v2
with:
poetry-version: ${{ matrix.poetry-version }}

- name: Install Dependencies
run: |
make export_requirements
python -m pip install -r requirements.txt
python -m pip install -r requirements-dev.txt
- name: Lint
run: make lint

- name: Test
run: make test

terraform-lint:
runs-on: ubuntu-latest
@@ -54,7 +62,7 @@ jobs:
release:
runs-on: ubuntu-latest
timeout-minutes: 10
needs: [terraform-lint] # TODO: build
needs: [build, terraform-lint]
permissions:
contents: write
steps:
3 changes: 3 additions & 0 deletions .gitignore
@@ -216,3 +216,6 @@ override.tf.json
terraform.rc

# End of https://www.toptal.com/developers/gitignore/api/terraform

requirements.txt
requirements-dev.txt
25 changes: 25 additions & 0 deletions makefile
@@ -0,0 +1,25 @@
all: lint coverage

test:
python -m pytest -vv

lint:
python -m ruff check .

format:
python -m ruff check --fix .

coverage:
python -m pytest -q --cov=prfiesta --cov-report=term # for local
python -m pytest -q --cov=prfiesta --cov-report=html # for local
python -m pytest -q --cov=prfiesta --cov-report=xml # for sonarqube

export_requirements:
poetry export --output requirements.txt --format requirements.txt
poetry export --with dev --output requirements-dev.txt --format requirements.txt

clean:
rm ./coverage.xml
rm -rf ./htmlcov
rm -rf ./.pytest_cache
rm -rf ./.ruff_cache
2,773 changes: 2,773 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions prfiesta/__init__.py
@@ -0,0 +1,14 @@
import logging
import os

from rich.logging import RichHandler

LOGGING_LEVEL=os.environ.get('LOGGING_LEVEL', logging.INFO)
LOGGING_FORMAT=os.environ.get('LOGGING_FORMAT', '%(message)s')
SPINNER_STYLE=os.environ.get('SPINNER_STYLE', 'blue')

logging.basicConfig(
level=LOGGING_LEVEL,
format=LOGGING_FORMAT,
handlers=[RichHandler(markup=True, show_path=False, show_time=False, show_level=True)],
)
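
The snippet below is a minimal sketch (not part of the commit) of how this module-level configuration is consumed: because __init__.py reads LOGGING_LEVEL at import time, setting the variable before the first import of prfiesta changes the level passed to logging.basicConfig, and any logger created afterwards renders through the RichHandler configured above. The DEBUG override and the logger name are illustrative assumptions.

import logging
import os

# Must be set before prfiesta is imported, since __init__.py reads it at import time.
os.environ['LOGGING_LEVEL'] = 'DEBUG'

import prfiesta  # noqa: E402  - import after the environment is prepared

logger = logging.getLogger('example')
logger.debug('rendered by the RichHandler; markup like [bold green]this[/bold green] works because markup=True')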
50 changes: 50 additions & 0 deletions prfiesta/__main__.py
@@ -0,0 +1,50 @@
import logging
from datetime import datetime

import click
from rich.live import Live
from rich.spinner import Spinner
from rich.text import Text

from prfiesta import SPINNER_STYLE
from prfiesta.collectors.github import GitHubCollector
from prfiesta.environment import GitHubEnvironment
from prfiesta.output import output_frame

logger = logging.getLogger(__name__)

github_environment = GitHubEnvironment()


@click.command()
@click.option('-u', '--users', required=True, multiple=True, help='The GitHub Users to search for. Can be multiple (space delimited)')
@click.option('-t', '--token', help='The Authentication token to use')
@click.option('-x', '--url', help='The URL of the Git provider to use')
@click.option('-o', '--output_type', type=click.Choice(['csv', 'parquet']), default='csv', help='The output format')
@click.option('--after', type=click.DateTime(formats=['%Y-%m-%d']), help='Only search for pull requests after this date e.g 2023-01-01')
@click.option('--before', type=click.DateTime(formats=['%Y-%m-%d']), help='Only search for pull requests before this date e.g 2023-04-30')
def main(**kwargs) -> None:

users: tuple[str] = kwargs.get('users')
token: str = kwargs.get('token') or github_environment.get_token()
url: str = kwargs.get('url') or github_environment.get_url()
output_type: str = kwargs.get('output_type')
before: datetime = kwargs.get('before')
after: datetime = kwargs.get('after')

logger.info('[bold green]Pull Request Fiesta 🦜🥳')

spinner = Spinner('dots', text=Text('Loading', style=SPINNER_STYLE))

with Live(spinner, refresh_per_second=20, transient=True):

collector = GitHubCollector(token=token, url=url, spinner=spinner)
pr_frame = collector.collect(users, after=after, before=before)

logger.info('Found [bold green]%s[/bold green] pull requests!', pr_frame.shape[0])

if not pr_frame.empty:
output_frame(pr_frame, output_type, spinner=spinner)

if __name__ == '__main__': # pragma: nocover
main()
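
As a quick usage sketch (assumptions: a GITHUB_TOKEN environment variable is available for GitHubEnvironment.get_token() to pick up, and the user name and dates are placeholders), the command can be exercised programmatically with click's test runner:

from click.testing import CliRunner

from prfiesta.__main__ import main

runner = CliRunner()
result = runner.invoke(main, [
    '--users', 'some-user',
    '--output_type', 'parquet',
    '--after', '2023-01-01',
    '--before', '2023-04-30',
])
print(result.exit_code)  # 0 on success; any results are written by output_frame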
Empty file added prfiesta/collectors/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions prfiesta/collectors/github.py
@@ -0,0 +1,130 @@
import logging
from datetime import datetime

import pandas as pd
from github import Github
from rich.spinner import Spinner

from prfiesta import SPINNER_STYLE
from prfiesta.environment import GitHubEnvironment

logger = logging.getLogger(__name__)


class GitHubCollector:

def __init__(self, **kwargs) -> None:

environment = GitHubEnvironment()
token = kwargs.get('token') or environment.get_token()
self._url = kwargs.get('url') or environment.get_url()

self._github = Github(token, base_url=self._url)
self._spinner: Spinner = kwargs.get('spinner')

self._sort_column = ['updated_at']
self._drop_columns = [
'labels_url',
'comments_url',
'events_url',
'node_id',
'performed_via_github_app',
'active_lock_reason',
]
self._move_to_end_columns = [
'url',
'repository_url',
'html_url',
'timeline_url',
]
self._datetime_columns = [
'created_at',
'updated_at',
'closed_at',
'milestone.created_at',
'milestone.updated_at',
'milestone.due_on',
'milestone.closed_at',
]


def collect(self, users: list[str], after: datetime | None= None, before: datetime | None = None) -> pd.DataFrame:

query = self._construct_query(users, after, before)

self._update_spinner(f'Searching {self._url} with[bold blue] {query}')
pulls = self._github.search_issues(query=query)

pull_request_data: list[dict] = []
for pr in pulls:
pull_request_data.append(pr.__dict__['_rawData'])

if not pull_request_data:
logger.warning('Did not find any results for this search criteria!')
return pd.DataFrame()

self._update_spinner('Post Processing')
pr_frame = pd.json_normalize(pull_request_data)

pr_frame = pr_frame.drop(columns=self._drop_columns, errors='ignore')
pr_frame = pr_frame.sort_values(by=self._sort_column, ascending=False)
pr_frame = self._parse_datetime_columns(pr_frame)
pr_frame['repository_name'] = pr_frame['repository_url'].str.extract(r'(.*)\/repos\/(?P<repository_name>(.*))')['repository_name']
pr_frame = self._move_column_to_end(pr_frame)

return pr_frame


@staticmethod
def _construct_query(users: list[str], after: datetime | None= None, before: datetime | None = None) -> str:
"""
Constructs a GitHub Search Query
that returns pull requests made by the passed users.
Examples
--------
type:pr author:user1
type:pr author:user2 updated:<=2021-01-01
type:pr author:user1 author:user2 updated:2021-01-01..2021-03-01
All dates are inclusive.
See GitHub Docs for full options https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests
"""
query: list[str] = []

query.append('type:pr')

for u in users:
query.append('author:' + u)

if before and after:
query.append(f"updated:{after.strftime('%Y-%m-%d')}..{before.strftime('%Y-%m-%d')}")
elif before:
query.append(f"updated:<={before.strftime('%Y-%m-%d')}")
elif after:
query.append(f"updated:>={after.strftime('%Y-%m-%d')}")

return ' '.join(query)

def _move_column_to_end(self, df: pd.DataFrame) -> pd.DataFrame:
for col in self._move_to_end_columns:
df.insert(len(df.columns)-1, col, df.pop(col))
df.drop(columns=col)

return df

def _parse_datetime_columns(self, df: pd.DataFrame) -> pd.DataFrame:
for col in self._datetime_columns:
df[col] = pd.to_datetime(df[col], errors='ignore')
return df


def _update_spinner(self, message: str, style: str=SPINNER_STYLE) -> None:
if self._spinner:
self._spinner.update(text=message, style=style)



if __name__ == '__main__': # pragma: nocover
g = GitHubCollector()
logger.info(g._construct_query(['kiran94', 'hello'], datetime(2021, 1, 1), datetime(2021, 3, 1)))
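
Since _construct_query is a staticmethod, the query building can be sanity-checked without a token; the sketch below (user names are placeholders) mirrors the examples in the docstring above:

from datetime import datetime

from prfiesta.collectors.github import GitHubCollector

query = GitHubCollector._construct_query(
    ['user1', 'user2'],
    after=datetime(2021, 1, 1),
    before=datetime(2021, 3, 1),
)
print(query)  # type:pr author:user1 author:user2 updated:2021-01-01..2021-03-01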
18 changes: 18 additions & 0 deletions prfiesta/environment.py
@@ -0,0 +1,18 @@
import os

from github.Consts import DEFAULT_BASE_URL as GITHUB_DEFAULT_BASE_URL


class GitHubEnvironment:

def get_token(self) -> str:
"""Gets the authentication token for this environment."""
token = os.environ.get('GITHUB_ENTERPRISE_TOKEN', os.environ.get('GITHUB_TOKEN'))
if not token:
raise ValueError('GITHUB_ENTERPRISE_TOKEN or GITHUB_TOKEN must be set')

return token

def get_url(self) -> str:
"""Gets the URL for the git provider."""
return os.environ.get('GH_HOST', GITHUB_DEFAULT_BASE_URL)
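
A short sketch of how this resolver behaves (the token value and Enterprise host are made-up examples; the environment variable names come from the code above):

import os

from prfiesta.environment import GitHubEnvironment

os.environ['GITHUB_ENTERPRISE_TOKEN'] = 'example-token'
os.environ['GH_HOST'] = 'https://github.example.com/api/v3'

env = GitHubEnvironment()
print(env.get_token())  # example-token
print(env.get_url())    # https://github.example.com/api/v3

With neither GITHUB_ENTERPRISE_TOKEN nor GITHUB_TOKEN set, get_token() raises the ValueError shown above, and get_url() falls back to PyGithub's DEFAULT_BASE_URL.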
39 changes: 39 additions & 0 deletions prfiesta/output.py
@@ -0,0 +1,39 @@
import logging
import os
from datetime import datetime
from typing import Literal

import pandas as pd
from rich.spinner import Spinner

from prfiesta import SPINNER_STYLE

logger = logging.getLogger(__name__)

output_directory = 'output'
OUTPUT_TYPE = Literal['csv', 'parquet']


def output_frame(frame: pd.DataFrame, output_type: OUTPUT_TYPE, spinner: Spinner, output_name: str = 'export', timestamp: datetime = None) -> None:

if not timestamp:
timestamp = datetime.now()

os.makedirs(output_directory, exist_ok=True)

output_name = str(output_name) + '.' + timestamp.strftime('%Y-%m-%d_%H:%M:%S') + '.' + output_type
output_name = os.path.join(output_directory, output_name)

spinner.update(text=f'Writing export to {output_name}', style=SPINNER_STYLE)

match output_type:
case 'csv':
frame.to_csv(output_name, index=False)

case 'parquet':
frame.to_parquet(output_name, index=False)

case _:
raise ValueError(f'unknown output_type {output_type}')

logger.info('Exported to %s!', output_name)
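
The sketch below exercises output_frame directly with a toy frame (the column names, output name, and timestamp are illustrative); files land in the output/ directory relative to the working directory:

from datetime import datetime

import pandas as pd
from rich.spinner import Spinner

from prfiesta.output import output_frame

frame = pd.DataFrame({'number': [1, 2], 'title': ['fix bug', 'add feature']})
spinner = Spinner('dots')

# Writes output/prs.2023-05-04_12:00:00.csv
output_frame(frame, 'csv', spinner=spinner, output_name='prs', timestamp=datetime(2023, 5, 4, 12, 0, 0))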