Fix git commitCount to only count unique commit hashes

JohT · JohT · commit 14dceef6c7eb · 2025-04-25T19:35:10.000+02:00
diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher
@@ -14,6 +14,7 @@ UNWIND git_files AS git_file
 RETURN git_repository.name + '/' + git_file.relativePath AS filePath
       ,split(git_commit.author, ' <')[0]                 AS author
       ,count(DISTINCT git_commit.sha)                    AS commitCount
+      ,collect(DISTINCT git_commit.sha)                  AS commitHashes
       ,date(max(git_commit.date))                        AS lastCommitDate
       ,max(date(fileCreatedAtTimestamp))                 AS lastCreationDate
       ,max(date(fileLastModificationAtTimestamp))        AS lastModificationDate
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -493,9 +493,18 @@
     "    \"\"\"\n",
     "    return values.iloc[1] if len(values) > 1 else None\n",
     "\n",
-    "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
+    "def get_flattened_unique_values(values: pd.Series):\n",
     "    \"\"\"\n",
-    "    Return the file count from an array of array of file paths.\n",
+    "    Return an array of unique string values from an array of arrays of strings.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : ndarray : The sorted unique values of all flattened arrays\n",
+    "    \"\"\"\n",
+    "    return np.unique(np.concatenate(values.to_list()))\n",
+    "\n",
+    "def count_unique_aggregated_values(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Return the number of unique values from an array of arrays of strings.\n",
     "    Meant to be used as an aggregation function for dataframe grouping.\n",
     "    values : Series : The pandas Series of values\n",
     "    return : int : The number of files\n",
@@ -573,7 +582,6 @@
     "# Define how common non-grouped columns will be aggregated.\n",
     "# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n",
     "common_named_aggregation = dict(\n",
-    "    commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
     "    daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
     "    daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
     "    daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
@@ -588,12 +596,14 @@
     "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
     "    firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
     "    fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
+    "    commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n",
+    "    intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "\n",
     "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
     "# The author with the most commits will then be listed first for each directory.\n",
-    "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
     "\n",
     "# Debug\n",
@@ -603,12 +613,13 @@
     "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
     "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
-    "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
+    "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
     "    firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
     "    mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
     "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
     "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
     "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
+    "    commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
@@ -669,6 +680,17 @@
     "git_files_with_commit_statistics.head(30)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53fcd8b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Debug: Uncomment the following line to write the prepared data frame to a CSV file\n",
+    "# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ccc11f52",