crawl_issues.py
import argparse
import os
import time

import pandas as pd
from github import Github
from github.GithubException import RateLimitExceededException

from utils import wrap_query, catch_rate_limit, collect, safe_load_repo, get_access_token


@wrap_query
def query_issues(row: pd.Series, id_key: str, g: Github):
    """Gets all available issues in a repository.

    Args:
        row (pd.Series): contains column with repository ID
        id_key (str): name of column containing repository ID
        g (Github): authenticated access to the GitHub API

    Returns:
        pd.Series: row with added columns ['state', 'created_at', 'user', 'closed_at', 'closed_by']
    """
    issues = {k: [] for k in ['state', 'created_at', 'user', 'closed_at', 'closed_by']}
    repo = safe_load_repo(g, row[id_key], "query_issues")
    if repo is None:
        return None
    for tries in range(2):
        try:
            # Reset the accumulators so a retry does not append duplicates.
            issues = {k: [] for k in issues}
            issues_paged = repo.get_issues(state='all')
            for i in issues_paged:
                # Attribute access may lazily trigger further API calls, so
                # each issue gets its own rate-limit retry.
                for inner_tries in range(2):
                    try:
                        state = i.state
                        created_at = i.created_at
                        user = i.user.login
                        closed_at = i.closed_at
                        closed_by = i.closed_by
                        if closed_by is not None:
                            closed_by = closed_by.login
                        break  # success: stop retrying this issue
                    except RateLimitExceededException:
                        if inner_tries == 0:
                            catch_rate_limit(g)  # wait out the rate limit, then retry once
                        else:
                            raise
                issues['state'].append(state)
                issues['created_at'].append(created_at)
                issues['user'].append(user)
                issues['closed_at'].append(closed_at)
                issues['closed_by'].append(closed_by)
            break  # success: stop retrying the repository
        except RateLimitExceededException:
            if tries == 0:
                catch_rate_limit(g)
            else:
                raise
    for k, v in issues.items():
        row[k] = v
    return row

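
# Sketch of a returned row's added columns (values are illustrative, not
# from a real repository; one list entry per issue, aligned by position):
#   row['state']      -> ['closed', 'open']
#   row['created_at'] -> [datetime(...), datetime(...)]
#   row['user']       -> ['alice', 'bob']            # hypothetical logins
#   row['closed_at']  -> [datetime(...), None]
#   row['closed_by']  -> ['alice', None]
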
def crawl_repos(df, name, target_folder, verbose):
    """For each repository, retrieve all issues and store them as CSV.

    Args:
        df (pd.DataFrame): dataset containing GitHub repository identifiers
        name (str): name of the column containing the identifiers
        target_folder (str): path to the folder to store CSV data in
        verbose (bool): toggles verbose output
    """
    repo_links = df[[name]].drop_duplicates()
    g = Github(get_access_token())
    if verbose:
        print(g.rate_limiting)
        print("Querying issues...")
    start = time.time()
    # collect (from utils) applies query_issues to each repository row and
    # writes the gathered issue data to issues.csv in the target folder.
    collect(g, repo_links, name, query_issues,
            ['state'],
            os.path.join(target_folder, 'issues.csv'))
    if verbose:
        end = time.time()
        print(f"Done - {end-start:.2f} seconds.")

def main(path, name, datadir, verbose):
    df = pd.read_csv(path)
    crawl_repos(df, name, datadir, verbose)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="crawl",
        description="Given a dataframe with a column indicating the GitHub repository ID, gather issue data from the corresponding GitHub repository."
    )
    parser.add_argument("-f", "--file", required=True, type=str, help="CSV file containing the repository IDs")
    parser.add_argument("-n", "--name", required=True, type=str, help="name of the column containing the GitHub repository ID")
    parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write the issue data to")
    parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
    args = parser.parse_args()
    main(args.file, args.name, args.datadir, args.verbose)
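
# Example invocation (sketch; the CSV path and column name are illustrative):
#   python crawl_issues.py -f ../../data/repos.csv -n github_repo_id \
#       --datadir ../../data/raw/github/ -v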