@@ -14,23 +14,42 @@ def info(verbose, msg):
     print(f"[INFO] {msg}")
 
 def clean_heading(h):
+    """Clean and normalise an extracted heading.
+
+    Args:
+        h (str): heading text
+
+    Returns:
+        str: cleaned heading
+    """
+    # remove leading digits, whitespace and separators
     to_remove = string.digits + string.whitespace + ".:"
     h = h.lstrip(to_remove)
+    # remove markdown-style links, keeping only the link text
     pattern = r"\[(.+?)\]\(.+?\)"
     h = re.sub(pattern, r'\1', h, count=0)
+    # strip punctuation and convert to lower-case
     h = h.replace(string.punctuation, "")
     h = h.strip(string.punctuation)
     h = h.lower()
     return h
 
 def plot_license_type(contents, ax):
+    """Plot a bar chart of the number of repositories with a permissive, non-permissive or unknown-type license, or no license at all.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     contents = contents.copy()
     permissive_licenses = ["mit", "gpl-3.0", "apache-2.0", "bsd-3-clause", "gpl-2.0", "bsd-2-clause"]  # https://en.wikipedia.org/wiki/Permissive_software_license
     contents.license = contents.license.fillna('None')
+    # if the license is not permissive, check whether it is missing or of type "other"; otherwise class it as non-permissive
     contents["license_type"] = np.where(
         contents.license.isin(permissive_licenses), "permissive", np.where(
             contents.license == "None", "None", np.where(
                 contents.license == "other", "unknown", "non-permissive")))
+    # plot value counts
     contents.license_type.value_counts().sort_index().plot(
         kind='bar',
         ax=ax,
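
The nested np.where above buckets each repository's license into one of four classes. A minimal standalone sketch of the same decision logic, using np.select for flatter code (the sample data is hypothetical):

    import numpy as np
    import pandas as pd

    permissive = ["mit", "apache-2.0"]
    licenses = pd.Series(["mit", "None", "other", "agpl-3.0"])  # hypothetical sample
    license_type = np.select(
        [licenses.isin(permissive), licenses == "None", licenses == "other"],
        ["permissive", "None", "unknown"],
        default="non-permissive",
    )
    print(license_type)  # ['permissive' 'None' 'unknown' 'non-permissive']
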
@@ -41,6 +60,12 @@ def plot_license_type(contents, ax):
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_contributing_file_present(contents, ax):
+    """Plot a bar chart visualising the number of repositories with contribution guidelines.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     pd.notna(contents.contributing_added).value_counts().plot(
         kind='bar',
         ax=ax,
@@ -51,6 +76,12 @@ def plot_contributing_file_present(contents, ax):
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_emojis(contents, ax):
+    """Plot a histogram visualising the number of emojis found in repository READMEs.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     bins = [0, 1, 2, 5, 10]
     if contents.readme_emojis.max() > bins[-1]:
         bins.append(contents.readme_emojis.max())
@@ -62,15 +93,24 @@ def plot_emojis(contents, ax):
     ax.set(xlabel="number of emojis in README", ylabel="repository count")
 
 def plot_team_size(metadata, contributions, ax):
+    """Plot a histogram visualising the maximum team size for each repository.
+
+    Args:
+        metadata (pd.DataFrame): metadata mined from GitHub
+        contributions (pd.DataFrame): contributions (i.e. commit) data mined from GitHub
+        ax (Axes): subplot to use
+    """
     contrib_df = pd.merge(metadata[["github_user_cleaned_url", "created_at"]], contributions)
+    # add week timeline info
     contrib_df["week_since_repo_creation"] = (contrib_df.week_co - contrib_df.created_at).dt.days // 7
     team_df = contrib_df[["github_user_cleaned_url", "author", "week_since_repo_creation", "commits"]].set_index(["github_user_cleaned_url", "author", "week_since_repo_creation"]).sort_index()
-    # user is active contributor if made at least one commit in last 12 weeks
+    # a user is considered an active contributor if they made at least one commit in the last 12 weeks
     windowed_team_df = team_df.groupby(level="author").rolling(window=12, min_periods=0).sum().droplevel(0)
     windowed_team_df["active contributors"] = windowed_team_df.commits > 0
-    # team size
+    # team size: number of active contributors within one week
     team_size = windowed_team_df.groupby(level=["github_user_cleaned_url", "week_since_repo_creation"])["active contributors"].value_counts()[:, :, True]
     max_team_size = team_size.groupby(level="github_user_cleaned_url").max()
+    # plot histogram
     bins = [1, 2, 5, 10]
     if max_team_size.max() > bins[-1]:
         bins.append(max_team_size.max())
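
The rolling window above is the heart of the team-size computation: per author, commits are summed over a trailing 12-week window, and any positive sum marks that author as active in that week. A toy sketch of just that step, on hypothetical data for a single author:

    import pandas as pd

    toy = pd.DataFrame({
        "author": ["alice"] * 5,
        "week_since_repo_creation": range(5),
        "commits": [3, 0, 0, 0, 1],
    }).set_index(["author", "week_since_repo_creation"])

    # trailing 12-week commit sum per author; min_periods=0 keeps the early weeks
    windowed = toy.groupby(level="author").rolling(window=12, min_periods=0).sum().droplevel(0)
    print((windowed.commits > 0).tolist())  # [True, True, True, True, True]
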
@@ -82,6 +122,13 @@ def plot_team_size(metadata, contributions, ax):
     ax.set(xlabel="maximum team size", ylabel="repository count")
 
 def plot_readme_size(contents, ax, type="bar"):
+    """Plot a histogram of the size of the README file found in repositories. The bin limits were chosen empirically.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+        type (str, optional): plot type, can be "bar" or "pie". Defaults to "bar".
+    """
     bins = [0, 1, 300, 1500, 10000]
     binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
     if contents.readme_size.max() > bins[-1]:
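
The bin edges above map onto the categories in binmeanings; the plotting lines that consume them fall outside this hunk, but presumably bin the README sizes roughly as in this hypothetical pd.cut sketch:

    import pandas as pd

    sizes = pd.Series([0, 120, 800, 4000, 25000])  # hypothetical README sizes in bytes
    bins = [0, 1, 300, 1500, 10000, 30000]         # last edge extended past the max
    labels = ["none", "ultra-short", "short", "informative", "detailed"]
    print(pd.cut(sizes, bins=bins, labels=labels, right=False).tolist())
    # ['none', 'ultra-short', 'short', 'informative', 'detailed']
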
@@ -99,14 +146,23 @@ def plot_readme_size(contents, ax, type="bar"):
     ax.set(xlabel="size of README in Bytes")
 
 def plot_headings(readme_df, ax):
+    """Plot a wordcloud from the headings used in README files. Excludes some manually defined words that skew the results too much to be meaningful.
+
+    Args:
+        readme_df (pd.DataFrame): README history data mined from GitHub, including all headings ever added to the README
+        ax (Axes): subplot to use
+    """
+    # collect and clean all extracted headings
     headings = []
     for l in readme_df.added_headings.dropna():
         headings += ast.literal_eval(l)
     headings = [clean_heading(h) for h in headings]
 
+    # manually exclude words that were found to skew the distribution
     stopwords = STOPWORDS
     custom = set(["trades", "glosat", "glosat_table_dataset", "nilmtk", "bert", "lemon", "cascadetabnet"])
     stopwords = stopwords.union(custom)
+    # plot wordcloud
     wordcloud = WordCloud(
         collocation_threshold=15,
         stopwords=stopwords,
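
The WordCloud call is cut off at the hunk boundary; the remaining lines presumably generate the cloud from the joined headings and draw it onto the axis, along these lines (everything beyond the two parameters visible in the diff is an assumption):

    # hypothetical continuation, not shown in the hunk
    wordcloud = WordCloud(
        collocation_threshold=15,
        stopwords=stopwords,
        background_color="white",  # assumed styling
    ).generate(" ".join(headings))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
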
@@ -119,6 +175,14 @@ def plot_headings(readme_df, ax):
     ax.set(title="README headings")
 
 def plot_table(metadata, stars, forks, ax):
+    """Add a table with basic stats (repository age, fork counts, star counts).
+
+    Args:
+        metadata (pd.DataFrame): metadata mined from GitHub
+        stars (pd.DataFrame): stars data mined from GitHub
+        forks (pd.DataFrame): forks data mined from GitHub
+        ax (Axes): subplot to use
+    """
     age = (datetime.today() - metadata["created_at"]).dt.days // 7
     fork_counts = forks.groupby("github_user_cleaned_url")["user"].count()
     fork_counts.rename("forks_no", inplace=True)
@@ -138,7 +202,7 @@ def plot_table(metadata, stars, forks, ax):
     ax.set_axis_off()
     ax.set(title="stats")
 
-def main(data_dir, verbose, filter_path, tag):
+def main(data_dir, outdir, verbose, filter_path, tag):
     info(verbose, "Loading data...")
     contents = pd.read_csv(os.path.join(data_dir, "contents.csv"), index_col=0)
     metadata = pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
@@ -149,7 +213,7 @@ def main(data_dir, verbose, filter_path, tag):
     stars = pd.read_csv(os.path.join(data_dir, "stars.csv"), index_col=0)
     forks = pd.read_csv(os.path.join(data_dir, "forks.csv"), index_col=0)
 
-    if filter_path is not None:
+    if filter_path is not None:  # e.g. restrict the analysis to high-interest repositories listed in a text file
         info(verbose, "Filtering data...")
         with open(filter_path, "r") as f:
             filtered = [line.rstrip() for line in f]
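
This hunk ends before the filter list is applied; presumably the loaded frames are then restricted to the listed repositories, roughly as below (assuming each frame carries the github_user_cleaned_url column used elsewhere in the script):

    # hypothetical continuation, not shown in the hunk
    metadata = metadata[metadata.github_user_cleaned_url.isin(filtered)]
    contents = contents[contents.github_user_cleaned_url.isin(filtered)]
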
@@ -179,19 +243,20 @@ def main(data_dir, verbose, filter_path, tag):
     plot_table(metadata, stars, forks, ax7)
     if tag:
         plt.suptitle(f"Overall statistics for ePrints repositories ({tag})")
-        plt.savefig(os.path.join(data_dir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
     else:
         plt.suptitle("Overall statistics for ePrints repositories")
-        plt.savefig(os.path.join(data_dir, "overall", "overall.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "overall", "overall.png"), bbox_inches="tight")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="overall",
         description="Plot overall repo analysis."
     )
-    parser.add_argument("--dir", default="../data/analysis", type=str, help="path to data directory")
+    parser.add_argument("--datadir", default="../../data/raw/github", type=str, help="path to GitHub data directory")
+    parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to output data directory")
     parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
-    parser.add_argument("--filter", type=str, help="path to file with repos to consider")
-    parser.add_argument("--tag", type=str, help="tag name to use")
+    parser.add_argument("--filter", type=str, help="path to file listing the repos that should be considered")
+    parser.add_argument("--tag", type=str, help="tag to add to the filename, e.g. to indicate that the repositories were filtered")
     args = parser.parse_args()
-    main(args.dir, args.verbose, args.filter, args.tag)
+    main(args.datadir, args.outdir, args.verbose, args.filter, args.tag)
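
With the renamed arguments, a typical invocation of the updated script might look like this (the file name overall.py, the filter file and the tag value are illustrative):

    python overall.py --datadir ../../data/raw/github --outdir ../../data/derived -v --filter high_interest_repos.txt --tag filtered
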