crawl_contributions.py
import argparse
import os
import time

import pandas as pd
from github import Github
from github.GithubException import RateLimitExceededException

from utils import wrap_query, catch_rate_limit, collect, safe_load_repo, get_access_token
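
# `Github` and `RateLimitExceededException` come from the PyGithub package
# (pip install PyGithub); `utils` is a local helper module expected to
# provide the query wrapper, rate-limit back-off, CSV collection, safe
# repository loading, and access-token lookup used below.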


@wrap_query
def query_contributions(row: pd.Series, id_key: str, g: Github):
    """Gets contribution stats for a repository.

    Args:
        row (pd.Series): contains column with repository ID
        id_key (str): name of column containing repository ID
        g (Github): authenticated access to GitHub API

    Returns:
        pd.Series: added columns ['author', 'week_co', 'commits']
    """
    contributions = {k: [] for k in ['author', 'week_co', 'commits']}
    repo = safe_load_repo(g, row[id_key], "query_contributions")
    if repo is None:
        return None
    for tries in range(2):
        try:
            contribution_stats = repo.get_stats_contributors()
            if contribution_stats is not None:
                for inner_tries in range(2):
                    try:
                        for s in contribution_stats:
                            for w in s.weeks:
                                contributions['author'].append(s.author.login)
                                contributions['week_co'].append(w.w)  # start of the week
                                contributions['commits'].append(w.c)  # commits in that week
                        break  # all stats consumed, stop retrying
                    except RateLimitExceededException:
                        if inner_tries == 0:
                            # discard partial results, wait out the rate limit, retry once
                            contributions = {k: [] for k in ['author', 'week_co', 'commits']}
                            catch_rate_limit(g)
                        else:
                            raise
            break  # request succeeded, stop retrying
        except RateLimitExceededException:
            if tries == 0:
                catch_rate_limit(g)
            else:
                raise
    for k, v in contributions.items():
        row[k] = v
    return row
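
# A quick sanity check of the query on a single row might look like this
# (hypothetical repository ID and column name; `wrap_query` is assumed to
# preserve the row-in, row-out signature used by `collect` below):
#
#     g = Github(get_access_token())
#     row = pd.Series({"github_repo_id": "octocat/Hello-World"})
#     print(query_contributions(row, "github_repo_id", g))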


def crawl_repos(df, name, target_folder, verbose):
    """For each repository, retrieve contributions and store as CSV.

    Args:
        df (pd.DataFrame): dataset containing GitHub repository identifiers
        name (str): name of column containing the identifiers
        target_folder (str): path to folder to store CSV data in
        verbose (bool): toggles verbose output
    """
    repo_links = df[[name]]
    repo_links = repo_links.drop_duplicates()
    g = Github(get_access_token())
    if verbose:
        print(g.rate_limiting)
        print("Querying contributions...")
    start = time.time()
    collect(g, repo_links, name, query_contributions,
            ['author', 'week_co', 'commits'],
            os.path.join(target_folder, 'contributions.csv'))
    if verbose:
        end = time.time()
        print(f"Done - {end-start:.2f} seconds.")


def main(path, name, datadir, verbose):
    df = pd.read_csv(path)
    target_folder = datadir
    crawl_repos(df, name, target_folder, verbose)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="crawl",
        description="Given a dataframe with a column indicating the GitHub repository ID, "
                    "gather data from the corresponding GitHub repository."
    )
    parser.add_argument("-f", "--file", required=True, type=str, help="CSV file")
    parser.add_argument("-n", "--name", required=True, type=str, help="name of the column containing the GitHub ID")
    parser.add_argument("--datadir", default="../../data/raw/eprints/", help="directory to write ePrints data to")
    parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
    args = parser.parse_args()
    main(args.file, args.name, args.datadir, args.verbose)
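
# Example invocation (input file, column name, and output directory are
# illustrative, not part of the script):
#
#     python crawl_contributions.py -f ../../data/raw/repos.csv \
#         -n github_repo_id --datadir ../../data/raw/github/ -v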