Skip to content

Commit 0efaf80

Browse files
authored
feat(output): allow custom drop columns (#27)
* feat(collector): allow user to specify drop columns * refactor(collector): ensure move to end process gracefully handles errors * refactor(main): wire up drop columns into cli interface * docs(readme): add drop column example * test(github): ensure dummy token is passed
1 parent 37cdbee commit 0efaf80

File tree

5 files changed

+39
-8
lines changed

5 files changed

+39
-8
lines changed

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ prfiesta -u kiran94 --output_type parquet --output my_pull_requests.parquet
4242
# Get all pull requests for more than one user
4343
prfiesta -u kiran94 -u user2
4444

45+
# Get all pull requests and drop specific columns from the output
46+
prfiesta -u kiran94 -dc events_url -dc comments_url -dc node_id
47+
4548
# Get help
4649
prfiesta --help
4750
```

prfiesta/__main__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
@click.option('-x', '--url', help='The URL of the Git provider to use')
2323
@click.option('-o', '--output', default=None, help='The output location')
2424
@click.option('-ot', '--output_type', type=click.Choice(['csv', 'parquet']), default='csv', help='The output format')
25+
@click.option('-dc', '--drop_columns', multiple=True, help='Drop columns from the output dataframe')
2526
@click.option('--after', type=click.DateTime(formats=['%Y-%m-%d']), help='Only search for pull requests after this date e.g 2023-01-01')
2627
@click.option('--before', type=click.DateTime(formats=['%Y-%m-%d']), help='Only search for pull requests before this date e.g 2023-04-30')
2728
def main(**kwargs) -> None:
@@ -33,14 +34,15 @@ def main(**kwargs) -> None:
3334
output_type: str = kwargs.get('output_type')
3435
before: datetime = kwargs.get('before')
3536
after: datetime = kwargs.get('after')
37+
drop_columns: list[str] = list(kwargs.get('drop_columns'))
3638

3739
logger.info('[bold green]PR Fiesta 🦜🥳')
3840

3941
spinner = Spinner('dots', text=Text('Loading', style=SPINNER_STYLE))
4042

4143
with Live(spinner, refresh_per_second=20, transient=True):
4244

43-
collector = GitHubCollector(token=token, url=url, spinner=spinner)
45+
collector = GitHubCollector(token=token, url=url, spinner=spinner, drop_columns=drop_columns)
4446
pr_frame = collector.collect(*users, after=after, before=before)
4547

4648
if not pr_frame.empty:

prfiesta/collectors/github.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,8 @@ def __init__(self, **kwargs) -> None:
2525
self._spinner: Spinner = kwargs.get('spinner')
2626

2727
self._sort_column = ['updated_at']
28-
self._drop_columns = [
29-
'node_id',
30-
'performed_via_github_app',
31-
]
28+
self._drop_columns = kwargs.get('drop_columns') or ['node_id', 'performed_via_github_app']
29+
3230
self._move_to_end_columns = [
3331
'url',
3432
'repository_url',
@@ -113,8 +111,13 @@ def _construct_query(users: List[str], after: Optional[datetime] = None, before:
113111

114112
def _move_column_to_end(self, df: pd.DataFrame) -> pd.DataFrame:
115113
for col in self._move_to_end_columns:
116-
df.insert(len(df.columns)-1, col, df.pop(col))
117-
df.drop(columns=col)
114+
try:
115+
df.insert(len(df.columns)-1, col, df.pop(col))
116+
df.drop(columns=col)
117+
except KeyError:
118+
# This can happen if the user provides a custom _drop_columns which
119+
# removes the column before we can move it to the end
120+
logger.debug('Attempted to move column %s but it did not exist', col)
118121

119122
return df
120123

tests/collectors/test_github.py

+23
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,26 @@ def test_collect_rate_limit(mock_github: Mock) -> None:
159159
result = gc.collect('user')
160160

161161
assert result.empty
162+
163+
164+
@patch('prfiesta.collectors.github.Github')
165+
def test_collect_custom_drop_columns(mock_github: Mock) -> None:
166+
167+
mock_github.return_value.search_issues.return_value = [_mock_issue1]
168+
169+
collector_params = {
170+
'token': 'dummy_token',
171+
'url': 'dummy_url',
172+
'drop_columns': ['comments_url'],
173+
}
174+
175+
gc = GitHubCollector(**collector_params)
176+
result = gc.collect('user1')
177+
178+
columns = result.columns.tolist()
179+
assert 'comments_url' not in columns
180+
181+
# These are default drop columns
182+
# Since we are overriding them in this scenario, they should still exist in the output columns
183+
assert 'node_id' in columns
184+
assert 'performed_via_github_app' in columns

tests/test_main.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def test_main(
9999
assert mock_live.called
100100
assert mock_spinner.called
101101

102-
assert mock_collector.call_args_list == [call(token=ANY, url='https://api.github.com', spinner=mock_spinner.return_value)]
102+
assert mock_collector.call_args_list == [call(token=ANY, url='https://api.github.com', spinner=mock_spinner.return_value, drop_columns=[])]
103103
assert mock_collector.return_value.collect.call_args_list == expected_collect_params
104104

105105
if not collect_response.empty:

0 commit comments

Comments
 (0)