
Commit c2fb143

finished analysis script docs

1 parent 7e9f180 commit c2fb143

3 files changed: +175 -24 lines changed

src/analysis/README.md (+2 -2)
@@ -5,7 +5,7 @@ The scripts in this directory were used to produce derived data and plots.
 - [`aggregate_datasets.py`](./aggregate_datasets.py) aggregates all data mined from GitHub into four datasets described in the wiki. Crucially, the data is reshaped into a time-indexed format for three of those datasets.
 - [`mention_type_timeline.py`](./mention_type_timeline.py) visualises the relationship between how a repository is cited and the difference between its creation date and the publication date.
 - [`repo_intent.py`](./repo_intent.py) creates a dataset with all repositories mined from ePrints for which we manually determined the citation type. The resulting dataset contains data from ePrints as well as a label indicating whether the software was cited as created software.
-- [`overall.py`](./overall.py)
-- [`repository_timeline.py`](./repository_timeline.py)
+- [`overall.py`](./overall.py) creates one plot containing visualisations and data about all repositories. The dataset can be filtered for a subset of repositories with the `--filter` argument.
+- [`repository_timeline.py`](./repository_timeline.py) creates one plot for one repository, focussing on timelined data. The code to produce these uses the raw data rather than the aggregated data produced by [`aggregate_datasets.py`](./aggregate_datasets.py), as this script was written before [`aggregate_datasets.py`](./aggregate_datasets.py). Both scripts use the same data manipulation methods; directly plotting data produced by [`aggregate_datasets.py`](./aggregate_datasets.py) should result in similar graphs.
 
 The schemas for any produced datasets are included in the wiki.

src/analysis/overall.py (+75 -10)
@@ -14,23 +14,42 @@ def info(verbose, msg):
     print(f"[INFO] {msg}")
 
 def clean_heading(h):
+    """Clean and normalise extracted headings.
+
+    Args:
+        h (str): heading text
+
+    Returns:
+        str: cleaned heading
+    """
+    # remove leading non-word characters
     to_remove = string.digits + string.whitespace + ".:"
     h = h.lstrip(to_remove)
+    # remove markdown-style links
     pattern = "\[(.+?)\]\(.+?\)"
     h = re.sub(pattern, r'\1', h, count=0)
+    # remove any punctuation and convert to lower-case
     h = h.replace(string.punctuation, "")
     h = h.strip(string.punctuation)
     h = h.lower()
     return h
 
 def plot_license_type(contents, ax):
+    """Plot a bar chart indicating the number of repositories with permissive, non-permissive, unknown type license or no license at all.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     contents = contents.copy()
     permissive_licenses = ["mit", "gpl-3.0", "apache-2.0", "bsd-3-clause", "gpl-2.0", "bsd-2-clause"]  # https://en.wikipedia.org/wiki/Permissive_software_license
     contents.license = contents.license.fillna('None')
+    # If not permissive, check if it's non-existent or type other, otherwise class as non-permissive
     contents["license_type"] = np.where(
         contents.license.isin(permissive_licenses), "permissive", np.where(
             contents.license == "None", "None", np.where(
                 contents.license == "other", "unknown", "non-permissive")))
+    # plot value counts
     contents.license_type.value_counts().sort_index().plot(
         kind='bar',
         ax=ax,
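A quick trace of the new `clean_heading` docstring's behaviour, on a hypothetical README heading (the heading text below is an illustration, not from the dataset):

    import re
    import string

    h = "2. [Installation](docs/install.md):"                # hypothetical heading
    h = h.lstrip(string.digits + string.whitespace + ".:")   # -> "[Installation](docs/install.md):"
    h = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", h)                # -> "Installation:"
    h = h.strip(string.punctuation).lower()                  # -> "installation"
    print(h)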
@@ -41,6 +60,12 @@ def plot_license_type(contents, ax):
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_contributing_file_present(contents, ax):
+    """Plot a bar chart visualising the number of repositories with contribution guidelines.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     pd.notna(contents.contributing_added).value_counts().plot(
         kind='bar',
         ax=ax,
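For context, a minimal sketch of the presence check being plotted here, assuming `contributing_added` is NaN when no contributing file was ever added (the dates are hypothetical):

    import pandas as pd

    contributing_added = pd.Series(["2020-03-01", None, "2021-07-15"])  # hypothetical column
    print(pd.notna(contributing_added).value_counts())
    # True     2
    # False    1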
@@ -51,6 +76,12 @@ def plot_contributing_file_present(contents, ax):
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
 
 def plot_emojis(contents, ax):
+    """Plot a histogram visualising the number of emojis found in repository READMEs.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+    """
     bins = [0, 1, 2, 5, 10]
     if contents.readme_emojis.max() > bins[-1]:
         bins.append(contents.readme_emojis.max())
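The bin handling above follows a small pattern that recurs in this script: if the largest observed value exceeds the last fixed edge, it becomes the final edge. A minimal sketch with a hypothetical maximum:

    bins = [0, 1, 2, 5, 10]
    max_emojis = 23  # hypothetical contents.readme_emojis.max()
    if max_emojis > bins[-1]:
        bins.append(max_emojis)
    print(bins)  # [0, 1, 2, 5, 10, 23]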
@@ -62,15 +93,24 @@ def plot_emojis(contents, ax):
     ax.set(xlabel="number of emojis in README", ylabel="repository count")
 
 def plot_team_size(metadata, contributions, ax):
+    """Plot a histogram visualising the maximum team size for a repository.
+
+    Args:
+        metadata (pd.DataFrame): metadata mined from GitHub
+        contributions (pd.DataFrame): contributions (i.e. commit) data mined from GitHub
+        ax (Axes): subplot to use
+    """
     contrib_df = pd.merge(metadata[["github_user_cleaned_url", "created_at"]], contributions)
+    # add week timeline info
     contrib_df["week_since_repo_creation"] = (contrib_df.week_co - contrib_df.created_at).dt.days // 7
     team_df = contrib_df[["github_user_cleaned_url", "author", "week_since_repo_creation", "commits"]].set_index(["github_user_cleaned_url", "author", "week_since_repo_creation"]).sort_index()
-    # user is active contributor if made at least one commit in last 12 weeks
+    # user is considered an active contributor if they made at least one commit in the last 12 weeks
     windowed_team_df = team_df.groupby(level="author").rolling(window=12, min_periods=0).sum().droplevel(0)
     windowed_team_df["active contributors"] = windowed_team_df.commits > 0
-    # team size
+    # team size: number of active contributors within one week
     team_size = windowed_team_df.groupby(level=["github_user_cleaned_url", "week_since_repo_creation"])["active contributors"].value_counts()[:,:,True]
     max_team_size = team_size.groupby(level="github_user_cleaned_url").max()
+    # plot histogram
     bins = [1, 2, 5, 10]
     if max_team_size.max() > bins[-1]:
         bins.append(max_team_size.max())
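The 12-week activity window documented above can be sketched for a single hypothetical author: weekly commit counts are rolling-summed, and a week counts the author as active if the trailing 12-week sum is positive.

    import pandas as pd

    commits = pd.Series([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])  # weeks 0-13, hypothetical
    active = commits.rolling(window=12, min_periods=0).sum() > 0
    print(active.tolist())
    # weeks 0-11: True (week 0's commits stay inside the window); week 12: False; week 13: True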
@@ -82,6 +122,13 @@ def plot_team_size(metadata, contributions, ax):
     ax.set(xlabel="maximum team size", ylabel="repository count")
 
 def plot_readme_size(contents, ax, type="bar"):
+    """Plot a histogram of the size of the README file found in repositories. The bin limits were chosen empirically.
+
+    Args:
+        contents (pd.DataFrame): contents data mined from GitHub
+        ax (Axes): subplot to use
+        type (str, optional): plot type, can be "bar" or "pie". Defaults to "bar".
+    """
     bins = [0, 1, 300, 1500, 10000]
     binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
     if contents.readme_size.max() > bins[-1]:
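The binning call itself sits outside this hunk; a hedged sketch of how these edges and labels could be applied with pd.cut (the sizes are hypothetical, and the real script may bin differently):

    import pandas as pd

    bins = [0, 1, 300, 1500, 10000]
    binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
    sizes = pd.Series([0, 120, 800, 4000, 25000])  # hypothetical README sizes in bytes
    if sizes.max() > bins[-1]:
        bins.append(sizes.max())
    print(pd.cut(sizes, bins=bins, labels=binmeanings, include_lowest=True))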
@@ -99,14 +146,23 @@ def plot_readme_size(contents, ax, type="bar"):
     ax.set(xlabel="size of README in Bytes")
 
 def plot_headings(readme_df, ax):
+    """Plot a wordcloud from the headings used in README files. Excludes some manually defined words that skew the results too much to be meaningful.
+
+    Args:
+        readme_df (pd.DataFrame): readme history data mined from GitHub, including all headings ever added to the README
+        ax (Axes): subplot to use
+    """
+    # clean any existing headings
     headings = []
     for l in readme_df.added_headings.dropna():
         headings += ast.literal_eval(l)
     headings = [clean_heading(h) for h in headings]
 
+    # manually exclude words that were found to skew the distribution
     stopwords = STOPWORDS
     custom = set(["trades", "glosat", "glosat_table_dataset", "nilmtk", "bert", "lemon", "cascadetabnet"])
     stopwords = stopwords.union(custom)
+    # plot wordcloud
     wordcloud = WordCloud(
         collocation_threshold=15,
         stopwords=stopwords,
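The loop documented above implies that `added_headings` stores stringified lists, which `ast.literal_eval` safely recovers before cleaning. A minimal sketch with a hypothetical cell value:

    import ast

    cell = "['Introduction', '2. Usage']"  # hypothetical added_headings entry
    headings = ast.literal_eval(cell)
    print(headings)  # ['Introduction', '2. Usage']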
@@ -119,6 +175,14 @@ def plot_headings(readme_df, ax):
     ax.set(title="README headings")
 
 def plot_table(metadata, stars, forks, ax):
+    """Add a table with basic stats (repository age, fork counts, star counts).
+
+    Args:
+        metadata (pd.DataFrame): metadata mined from GitHub.
+        stars (pd.DataFrame): stars data mined from GitHub.
+        forks (pd.DataFrame): forks data mined from GitHub.
+        ax (Axes): subplot to use
+    """
     age = (datetime.today() - metadata["created_at"]).dt.days // 7
     fork_counts = forks.groupby("github_user_cleaned_url")["user"].count()
     fork_counts.rename("forks_no", inplace=True)
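The age computation in this hunk pairs a scalar datetime with a Timestamp column; a minimal runnable sketch (the creation date is hypothetical):

    from datetime import datetime
    import pandas as pd

    created_at = pd.Series([pd.Timestamp("2021-01-01")])  # hypothetical created_at column
    age_weeks = (datetime.today() - created_at).dt.days // 7
    print(age_weeks)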
@@ -138,7 +202,7 @@ def plot_table(metadata, stars, forks, ax):
     ax.set_axis_off()
     ax.set(title="stats")
 
-def main(data_dir, verbose, filter_path, tag):
+def main(data_dir, outdir, verbose, filter_path, tag):
     info(verbose, "Loading data...")
     contents = pd.read_csv(os.path.join(data_dir, "contents.csv"), index_col=0)
     metadata = pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
@@ -149,7 +213,7 @@ def main(data_dir, verbose, filter_path, tag):
     stars = pd.read_csv(os.path.join(data_dir, "stars.csv"), index_col=0)
     forks = pd.read_csv(os.path.join(data_dir, "forks.csv"), index_col=0)
 
-    if filter_path is not None:
+    if filter_path is not None:  # e.g. filter for high-interest repositories based on a txt file containing a list of those
         info(verbose, "Filtering data...")
         with open(filter_path, "r") as f:
             filtered = [line.rstrip() for line in f]
@@ -179,19 +243,20 @@
     plot_table(metadata, stars, forks, ax7)
     if tag:
         plt.suptitle(f"Overall statistics for ePrints repositories ({tag})")
-        plt.savefig(os.path.join(data_dir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
     else:
         plt.suptitle("Overall statistics for ePrints repositories")
-        plt.savefig(os.path.join(data_dir, "overall", "overall.png"), bbox_inches="tight")
+        plt.savefig(os.path.join(outdir, "overall", "overall.png"), bbox_inches="tight")
 
 if __name__=="__main__":
     parser = argparse.ArgumentParser(
         prog="overall",
         description="Plot overall repo analysis."
     )
-    parser.add_argument("--dir", default="../data/analysis", type=str, help="path to data directory")
+    parser.add_argument("--datadir", default="../../data/raw/github", type=str, help="path to GitHub data directory")
+    parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to output data directory")
     parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
-    parser.add_argument("--filter", type=str, help="path to file with repos to consider")
-    parser.add_argument("--tag", type=str, help="tag name to use")
+    parser.add_argument("--filter", type=str, help="path to file listing the repos that should be considered")
+    parser.add_argument("--tag", type=str, help="tag to add to the filename, e.g. to indicate that the repositories were filtered")
     args = parser.parse_args()
-    main(args.dir, args.verbose, args.filter, args.tag)
+    main(args.datadir, args.outdir, args.verbose, args.filter, args.tag)
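With the renamed arguments, an invocation might look like the following; the paths shown are the new defaults, while the filter file name and tag value are hypothetical:

    python overall.py --datadir ../../data/raw/github --outdir ../../data/derived --filter filtered_repos.txt --tag filtered -v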
