Skip to content

Commit 14dceef

Browse files
committed
Fix git commitCount to only contain unique hashes
1 parent 7ea6c28 commit 14dceef

File tree

2 files changed

+28
-5
lines changed

2 files changed

+28
-5
lines changed

cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ UNWIND git_files AS git_file
1414
RETURN git_repository.name + '/' + git_file.relativePath AS filePath
1515
,split(git_commit.author, ' <')[0] AS author
1616
,count(DISTINCT git_commit.sha) AS commitCount
17+
,collect(DISTINCT git_commit.sha) AS commitHashes
1718
,date(max(git_commit.date)) AS lastCommitDate
1819
,max(date(fileCreatedAtTimestamp)) AS lastCreationDate
1920
,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate

jupyter/GitHistoryGeneral.ipynb

+27-5
Original file line numberDiff line numberDiff line change
@@ -493,9 +493,18 @@
493493
" \"\"\"\n",
494494
" return values.iloc[1] if len(values) > 1 else None\n",
495495
"\n",
496-
"def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
496+
"def get_flattened_unique_values(values: pd.Series):\n",
497497
" \"\"\"\n",
498-
" Return the file count from an array of array of file paths.\n",
498+
" Return an array of unique string values from an array of arrays of strings.\n",
499+
" Meant to be used as an aggregation function for dataframe grouping.\n",
500+
" values : Series : The pandas Series of values\n",
501+
" return : ndarray : The flattened numpy array of unique values\n",
502+
" \"\"\"\n",
503+
" return np.unique(np.concatenate(values.to_list()))\n",
504+
"\n",
505+
"def count_unique_aggregated_values(values: pd.Series):\n",
506+
" \"\"\"\n",
507+
" Return the number of unique values from an array of arrays of strings.\n",
499508
" Meant to be used as an aggregation function for dataframe grouping.\n",
500509
" values : Series : The pandas Series of values\n",
501510
" return : int : The number of files\n",
@@ -573,7 +582,6 @@
573582
"# Define how common non-grouped columns will be aggregated.\n",
574583
"# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n",
575584
"common_named_aggregation = dict(\n",
576-
" commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
577585
" daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
578586
" daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
579587
" daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
@@ -588,12 +596,14 @@
588596
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
589597
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
590598
" fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
599+
" commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n",
600+
" intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n",
591601
" **common_named_aggregation\n",
592602
")\n",
593603
"\n",
594604
"# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
595605
"# The author with the most commits will then be listed first for each directory.\n",
596-
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
606+
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n",
597607
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
598608
"\n",
599609
"# Debug\n",
@@ -603,12 +613,13 @@
603613
"# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
604614
"# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
605615
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
606-
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
616+
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
607617
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
608618
" mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
609619
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
610620
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
611621
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
622+
" commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n",
612623
" **common_named_aggregation\n",
613624
")\n",
614625
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
@@ -669,6 +680,17 @@
669680
"git_files_with_commit_statistics.head(30)"
670681
]
671682
},
683+
{
684+
"cell_type": "code",
685+
"execution_count": null,
686+
"id": "53fcd8b2",
687+
"metadata": {},
688+
"outputs": [],
689+
"source": [
690+
"# Print prepared data frame to CSV file\n",
691+
"# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)"
692+
]
693+
},
672694
{
673695
"cell_type": "markdown",
674696
"id": "ccc11f52",

0 commit comments

Comments
 (0)