Merge pull request #372 from JohT/feature/git-history-csv-reports

JohT · web-flow · commit cf81a25a4eb2 · 2025-04-25T20:15:12.000+02:00
Add git history csv reports
diff --git a/cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher b/cypher/GitLog/List_git_file_directories_with_commit_statistics.cypher
@@ -0,0 +1,85 @@
+// List git file directories with their commit statistics (authors, file count, commit count, latest change dates).
+
+ MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file:Git&File&!Repository)
+ WHERE git_file.deletedAt IS NULL // filter out deleted files
+ ORDER BY git_file.relativePath
+  WITH *
+      ,datetime.fromepochMillis(git_file.createdAtEpoch)                                             AS fileCreatedAtTimestamp
+      ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch)) AS fileLastModificationAtTimestamp // falls back to the creation time if there is no modification time
+  WITH *, git_repository.name + '/' + git_file.relativePath AS filePath // file path prefixed with the repository name
+  WITH *, split(filePath, '/')                              AS pathElements
+  WITH *, pathElements[-1]                                  AS fileName // last path element
+ MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file) // commits that changed the file itself or a file connected to it by up to 3 HAS_NEW_NAME hops (presumably former names)
+  WITH pathElements // aggregate the commits per file and author
+      ,fileCreatedAtTimestamp
+      ,fileLastModificationAtTimestamp
+      ,fileName
+      ,filePath                                         AS fileRelativePath
+      ,split(git_commit.author, ' <')[0]                AS author // author text before the first ' <' (presumably the name without the e-mail address)
+      ,max(git_commit.sha)                              AS maxCommitSha
+      ,collect(DISTINCT git_commit.sha)                 AS commitHashes
+      ,date(max(git_commit.date))                       AS lastCommitDate
+UNWIND pathElements AS pathElement // one row per path element: repository name, every directory and the file name
+  WITH *
+      ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '')  AS parent // path before the first occurrence of '/' + pathElement, '' if there is none. NOTE(review): the first occurrence may belong to an earlier directory with the same name or name prefix - confirm that this is acceptable
+  WITH *
+      ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory // parent + '/' + pathElement, or just pathElement when parent is empty
+ WHERE pathElement <> fileName // drop the row of the file name itself so that only the leading path elements remain
+  WITH directory                                  AS directoryPath // aggregate per directory and author
+      ,split(directory, '/')[-1]                  AS directoryName
+      ,parent                                     AS directoryParentPath
+      ,split(parent, '/')[-1]                     AS directoryParentName
+      ,size(split(directory, '/'))                AS directoryPathLength
+      ,author
+      ,collect(DISTINCT fileRelativePath)         AS files
+      ,max(date(fileCreatedAtTimestamp) )         AS lastCreationDate
+      ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate
+      ,apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes))) AS commitHashes
+      ,max(maxCommitSha)                          AS maxCommitSha
+      ,max(lastCommitDate)                        AS lastCommitDate
+      ,max(fileRelativePath)                      AS maxFileRelativePath
+      ,duration.inDays(max(lastCommitDate), date()).days                      AS daysSinceLastCommit
+      ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days          AS daysSinceLastCreation
+      ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification
+// Assure that the authors are ordered by their commit count descending per directory
+ORDER BY directoryPath ASCENDING, size(commitHashes) DESCENDING
+  WITH directoryPath // aggregate per directory across all authors
+      ,directoryName
+      ,directoryParentPath
+      ,directoryParentName
+      ,directoryPathLength
+      ,collect(author)[0]                         AS mainAuthor // first author in the order defined above
+      ,collect(author)[1]                         AS secondAuthor
+      ,collect(author)[2]                         AS thirdAuthor
+      ,count(DISTINCT author)                     AS authorCount
+      ,size(apoc.coll.toSet(apoc.coll.flatten(collect(files))))        AS fileCount // number of distinct files over all authors
+      ,size(apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes)))) AS commitCount // number of distinct commits over all authors
+      ,max(lastCreationDate)                      AS lastCreationDate
+      ,max(lastModificationDate)                  AS lastModificationDate
+      ,max(maxCommitSha)                          AS maxCommitSha
+      ,max(lastCommitDate)                        AS lastCommitDate
+      ,min(daysSinceLastCommit)                   AS daysSinceLastCommit
+      ,min(daysSinceLastCreation)                 AS daysSinceLastCreation
+      ,min(daysSinceLastModification)             AS daysSinceLastModification
+      ,max(maxFileRelativePath)                   AS maxFileRelativePath
+// The final results are grouped by the statistic values like file count,... so that directories with identical statistics are combined into one row.
+RETURN collect(directoryPath)[-1]                  AS directoryPath // last collected path of the combined directories
+      ,apoc.text.join(collect(directoryName), '/') AS directoryName // names of the combined directories joined by '/'
+      ,collect(directoryParentPath)[0]             AS directoryParentPath
+      ,collect(directoryParentName)[0]             AS directoryParentName
+      ,mainAuthor
+      ,secondAuthor
+      ,thirdAuthor
+      ,authorCount
+      ,fileCount
+      ,commitCount
+      ,lastCreationDate
+      ,lastModificationDate
+      ,lastCommitDate
+      ,daysSinceLastCommit
+      ,daysSinceLastCreation
+      ,daysSinceLastModification
+      ,maxCommitSha
+      ,maxFileRelativePath
+      ,max(directoryPathLength)                          AS directoryPathLength
+      ,count(DISTINCT directoryPath)                     AS combinedDirectoriesCount
diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher
@@ -14,6 +14,7 @@ UNWIND git_files AS git_file
 RETURN git_repository.name + '/' + git_file.relativePath AS filePath
       ,split(git_commit.author, ' <')[0]                 AS author
       ,count(DISTINCT git_commit.sha)                    AS commitCount
+      ,collect(DISTINCT git_commit.sha)                  AS commitHashes
       ,date(max(git_commit.date))                        AS lastCommitDate
       ,max(date(fileCreatedAtTimestamp))                 AS lastCreationDate
       ,max(date(fileLastModificationAtTimestamp))        AS lastModificationDate
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -493,9 +493,18 @@
     "    \"\"\"\n",
     "    return values.iloc[1] if len(values) > 1 else None\n",
     "\n",
-    "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
+    "def get_flattened_unique_values(values: pd.Series):\n",
     "    \"\"\"\n",
-    "    Return the file count from an array of array of file paths.\n",
+    "    Return an array of unique string values from an array of array of strings.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : ndarray : The unique values as a flat numpy array\n",
+    "    \"\"\"\n",
+    "    return np.unique(np.concatenate(values.to_list()))\n",
+    "\n",
+    "def count_unique_aggregated_values(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Return the number of unique values from an array of array of strings.\n",
     "    Meant to be used as an aggregation function for dataframe grouping.\n",
     "    values : Series : The pandas Series of values\n",
     "    return : int : The number of files\n",
@@ -573,7 +582,6 @@
     "# Define how common non-grouped columns will be aggregated.\n",
     "# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n",
     "common_named_aggregation = dict(\n",
-    "    commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
     "    daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
     "    daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
     "    daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
@@ -588,12 +596,14 @@
     "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
     "    firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
     "    fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
+    "    commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n",
+    "    intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "\n",
     "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
     "# The author with the most commits will then be listed first for each directory.\n",
-    "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
     "\n",
     "# Debug\n",
@@ -603,12 +613,13 @@
     "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
     "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
-    "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
+    "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
     "    firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
     "    mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
     "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
     "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
     "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
+    "    commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
@@ -669,6 +680,17 @@
     "git_files_with_commit_statistics.head(30)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53fcd8b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Print prepared data frame to CSV file\n",
+    "# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ccc11f52",
diff --git a/scripts/activateCondaEnvironment.sh b/scripts/activateCondaEnvironment.sh
@@ -37,10 +37,10 @@ echo "activateCondaEnvironment: CONDA_PREFIX=${CONDA_PREFIX}"
 echo "activateCondaEnvironment: Current conda environment=${CONDA_DEFAULT_ENV}"
 echo "activateCondaEnvironment: Target conda environment=${CODEGRAPH_CONDA_ENVIRONMENT}"
 
-PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare then Conda environment if needed (default, "true") or use an already prepared Conda environment ("false")
+PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Whether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false")
 
-if [ "${CONDA_DEFAULT_ENV}" = "${CODEGRAPH_CONDA_ENVIRONMENT}" ] && [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
-    echo "activateCondaEnvironment: Skipping activation. Target conda environment ${CODEGRAPH_CONDA_ENVIRONMENT} is already activated."
+if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
+    echo "activateCondaEnvironment: Skipping activation. PREPARE_CONDA_ENVIRONMENT is set to false."
     # "return" needs to be used here instead of "exit".
     # This script is included in another script by using "source". 
     # "exit" would end the main script, "return" just ends this sub script.
diff --git a/scripts/reports/GitHistoryCsv.sh b/scripts/reports/GitHistoryCsv.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Executes "GitLog" Cypher queries to get the "git-history-csv" CSV reports.
+# It contains lists of files with only one author, last changed or created files, pairwise changed files,...
+
+# Requires executeQueryFunctions.sh, cleanupAfterReportGeneration.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
+set -o errexit -o pipefail
+
+# Overrideable Constants (defaults also defined in sub scripts)
+REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+
+## Get this "scripts/reports" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. 
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
+echo "GitHistoryCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
+
+# Get the "scripts" directory by taking the path of this script and going one directory up.
+SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts
+echo "GitHistoryCsv: SCRIPTS_DIR=${SCRIPTS_DIR}"
+
+# Get the "cypher" directory by taking the path of this script and going two directories up and then to "cypher".
+CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"}
+echo "GitHistoryCsv: CYPHER_DIR=${CYPHER_DIR}"
+
+# Define functions to execute cypher queries from within a given file
+source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
+
+# Create report directory
+REPORT_NAME="git-history-csv"
+FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
+mkdir -p "${FULL_REPORT_DIRECTORY}" # "-p" also creates missing parent directories and doesn't fail if the directory already exists
+
+# Local Constants
+GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog"
+
+echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Processing git history..."
+
+# Detailed git file statistics. The output of every query is written into a CSV file named like the query file.
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_with_commit_statistics_by_author.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_with_commit_statistics_by_author.csv"
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_that_were_changed_together_with_another_file.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_that_were_changed_together_with_another_file.csv"
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_statistics.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_file_directories_with_commit_statistics.csv"
+
+# Overall distribution of how many files were changed with one git commit, how many were changed with two, etc. 
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv"
+
+# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies
+execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"
+
+# Clean-up after report generation. Empty reports will be deleted.
+source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"
+
+echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."