Skip to content

Commit cf81a25

Browse files
authored
Merge pull request #372 from JohT/feature/git-history-csv-reports
Add git history csv reports
2 parents 73e8869 + 2d0b800 commit cf81a25

5 files changed

+172
-8
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
// List git file directories and their statistics
// Aggregates the git history graph into one row per (combined) directory with
// author, file, and commit statistics.
// Requires the APOC procedures library (apoc.coll.*, apoc.text.join).

// Collect all non-deleted files of every git repository
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file:Git&File&!Repository)
WHERE git_file.deletedAt IS NULL // filter out deleted files
ORDER BY git_file.relativePath
WITH *
    // Epoch milliseconds converted to datetime values.
    // A file without a last modification timestamp falls back to its creation timestamp.
    ,datetime.fromepochMillis(git_file.createdAtEpoch) AS fileCreatedAtTimestamp
    ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch)) AS fileLastModificationAtTimestamp
// Prefix the relative file path with the repository name to keep paths distinct across repositories
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
WITH *, split(filePath, '/') AS pathElements
WITH *, pathElements[-1] AS fileName
// Match all commits that changed the file, following up to 3 renames (HAS_NEW_NAME) back to older file names
MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file)
// Aggregate commit data per file and author
// (Cypher groups implicitly by all non-aggregated columns of this WITH)
WITH pathElements
    ,fileCreatedAtTimestamp
    ,fileLastModificationAtTimestamp
    ,fileName
    ,filePath AS fileRelativePath
    // Author name without the trailing e-mail address part
    ,split(git_commit.author, ' <')[0] AS author
    ,max(git_commit.sha) AS maxCommitSha
    ,collect(DISTINCT git_commit.sha) AS commitHashes
    ,date(max(git_commit.date)) AS lastCommitDate
// Expand every file row into one row per path element so that
// every ancestor directory of the file gets its own row
UNWIND pathElements AS pathElement
WITH *
    // Everything in the path before the current path element, '' for the topmost element
    ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent
WITH *
    // Full path of the directory that the current path element represents
    ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory
// Skip the last path element: it is the file name itself, not a directory
WHERE pathElement <> fileName
// Aggregate the statistics per directory and author
WITH directory AS directoryPath
    ,split(directory, '/')[-1] AS directoryName
    ,parent AS directoryParentPath
    ,split(parent, '/')[-1] AS directoryParentName
    ,size(split(directory, '/')) AS directoryPathLength
    ,author
    ,collect(DISTINCT fileRelativePath) AS files
    ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate
    ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate
    // Distinct commit hashes across all files within the directory
    ,apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes))) AS commitHashes
    ,max(maxCommitSha) AS maxCommitSha
    ,max(lastCommitDate) AS lastCommitDate
    ,max(fileRelativePath) AS maxFileRelativePath
    ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit
    ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation
    ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification
// Assure that the authors are ordered by their commit count descending per directory
ORDER BY directoryPath ASCENDING, size(commitHashes) DESCENDING
// Aggregate the per-author rows into one row per directory.
// Because of the ORDER BY above, collect(author)[0] is the author with the most commits.
WITH directoryPath
    ,directoryName
    ,directoryParentPath
    ,directoryParentName
    ,directoryPathLength
    ,collect(author)[0] AS mainAuthor
    ,collect(author)[1] AS secondAuthor
    ,collect(author)[2] AS thirdAuthor
    ,count(DISTINCT author) AS authorCount
    ,size(apoc.coll.toSet(apoc.coll.flatten(collect(files)))) AS fileCount
    ,size(apoc.coll.toSet(apoc.coll.flatten(collect(commitHashes)))) AS commitCount
    ,max(lastCreationDate) AS lastCreationDate
    ,max(lastModificationDate) AS lastModificationDate
    ,max(maxCommitSha) AS maxCommitSha
    ,max(lastCommitDate) AS lastCommitDate
    ,min(daysSinceLastCommit) AS daysSinceLastCommit
    ,min(daysSinceLastCreation) AS daysSinceLastCreation
    ,min(daysSinceLastModification) AS daysSinceLastModification
    ,max(maxFileRelativePath) AS maxFileRelativePath
// The final results are grouped by the statistic values like file count,...
// Directories with identical statistics therefore end up combined into a single row:
// directoryName joins their names with '/' and combinedDirectoriesCount counts them.
RETURN collect(directoryPath)[-1] AS directoryPath
    ,apoc.text.join(collect(directoryName), '/') AS directoryName
    ,collect(directoryParentPath)[0] AS directoryParentPath
    ,collect(directoryParentName)[0] AS directoryParentName
    ,mainAuthor
    ,secondAuthor
    ,thirdAuthor
    ,authorCount
    ,fileCount
    ,commitCount
    ,lastCreationDate
    ,lastModificationDate
    ,lastCommitDate
    ,daysSinceLastCommit
    ,daysSinceLastCreation
    ,daysSinceLastModification
    ,maxCommitSha
    ,maxFileRelativePath
    ,max(directoryPathLength) AS directoryPathLength
    ,count(DISTINCT directoryPath) AS combinedDirectoriesCount

cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ UNWIND git_files AS git_file
1414
RETURN git_repository.name + '/' + git_file.relativePath AS filePath
1515
,split(git_commit.author, ' <')[0] AS author
1616
,count(DISTINCT git_commit.sha) AS commitCount
17+
,collect(DISTINCT git_commit.sha) AS commitHashes
1718
,date(max(git_commit.date)) AS lastCommitDate
1819
,max(date(fileCreatedAtTimestamp)) AS lastCreationDate
1920
,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate

jupyter/GitHistoryGeneral.ipynb

+27-5
Original file line numberDiff line numberDiff line change
@@ -493,9 +493,18 @@
493493
" \"\"\"\n",
494494
" return values.iloc[1] if len(values) > 1 else None\n",
495495
"\n",
496-
"def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
496+
"def get_flattened_unique_values(values: pd.Series):\n",
497497
" \"\"\"\n",
498-
" Return the file count from an array of array of file paths.\n",
498+
" Return an array of unique string values from an array of array of strings.\n",
499+
" Meant to be used as an aggregation function for dataframe grouping.\n",
500+
" values : Series : The pandas Series of values\n",
501+
" return : Series : The pandas Series of values\n",
502+
" \"\"\"\n",
503+
" return np.unique(np.concatenate(values.to_list()))\n",
504+
"\n",
505+
"def count_unique_aggregated_values(values: pd.Series):\n",
506+
" \"\"\"\n",
507+
" Return the number of unique values from an array of array of strings.\n",
499508
" Meant to be used as an aggregation function for dataframe grouping.\n",
500509
" values : Series : The pandas Series of values\n",
501510
" return : int : The number of files\n",
@@ -573,7 +582,6 @@
573582
"# Define how common non-grouped columns will be aggregated.\n",
574583
"# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n",
575584
"common_named_aggregation = dict(\n",
576-
" commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
577585
" daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
578586
" daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
579587
" daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
@@ -588,12 +596,14 @@
588596
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
589597
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
590598
" fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
599+
" commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n",
600+
" intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n",
591601
" **common_named_aggregation\n",
592602
")\n",
593603
"\n",
594604
"# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
595605
"# The author with the most commits will then be listed first for each directory.\n",
596-
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
606+
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n",
597607
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
598608
"\n",
599609
"# Debug\n",
@@ -603,12 +613,13 @@
603613
"# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
604614
"# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
605615
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
606-
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
616+
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
607617
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
608618
" mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
609619
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
610620
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
611621
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
622+
" commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n",
612623
" **common_named_aggregation\n",
613624
")\n",
614625
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
@@ -669,6 +680,17 @@
669680
"git_files_with_commit_statistics.head(30)"
670681
]
671682
},
683+
{
684+
"cell_type": "code",
685+
"execution_count": null,
686+
"id": "53fcd8b2",
687+
"metadata": {},
688+
"outputs": [],
689+
"source": [
690+
"# Print prepared data frame to CSV file\n",
691+
"# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)"
692+
]
693+
},
672694
{
673695
"cell_type": "markdown",
674696
"id": "ccc11f52",

scripts/activateCondaEnvironment.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ echo "activateCondaEnvironment: CONDA_PREFIX=${CONDA_PREFIX}"
3737
echo "activateCondaEnvironment: Current conda environment=${CONDA_DEFAULT_ENV}"
3838
echo "activateCondaEnvironment: Target conda environment=${CODEGRAPH_CONDA_ENVIRONMENT}"
3939

40-
PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare then Conda environment if needed (default, "true") or use an already prepared Conda environment ("false")
40+
PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Whether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false")
4141

42-
if [ "${CONDA_DEFAULT_ENV}" = "${CODEGRAPH_CONDA_ENVIRONMENT}" ] && [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
43-
echo "activateCondaEnvironment: Skipping activation. Target conda environment ${CODEGRAPH_CONDA_ENVIRONMENT} is already activated."
42+
if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
43+
echo "activateCondaEnvironment: Skipping activation. ${PREPARE_CONDA_ENVIRONMENT} is set to false."
4444
# "return" needs to be used here instead of "exit".
4545
# This script is included in another script by using "source".
4646
# "exit" would end the main script, "return" just ends this sub script.

scripts/reports/GitHistoryCsv.sh

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash

# Executes "GitLog" Cypher queries to get the "git-history-csv" CSV reports.
# It contains lists of files with only one author, last changed or created files, pairwise changed files,...

# Requires executeQueryFunctions.sh, cleanupAfterReportGeneration.sh

# Fail on any error ("errexit" = exit on first error, "pipefail" = exit on errors within piped commands)
set -o errexit -o pipefail

# Overrideable Constants (defaults also defined in sub scripts)
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}

## Get this "scripts/reports" directory if not already set
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )}
echo "GitHistoryCsv: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"

# Get the "scripts" directory by taking the path of this script and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${REPORTS_SCRIPT_DIR}/.."} # Repository directory containing the shell scripts
echo "GitHistoryCsv: SCRIPTS_DIR=${SCRIPTS_DIR}"

# Get the "cypher" directory by taking the path of this script, going two directories up and then into "cypher".
CYPHER_DIR=${CYPHER_DIR:-"${REPORTS_SCRIPT_DIR}/../../cypher"}
echo "GitHistoryCsv: CYPHER_DIR=${CYPHER_DIR}"

# Define functions to execute cypher queries from within a given file
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"

# Create report directory
REPORT_NAME="git-history-csv"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
mkdir -p "${FULL_REPORT_DIRECTORY}"

# Local Constants
GIT_LOG_CYPHER_DIR="${CYPHER_DIR}/GitLog"

echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Processing git history..."

# Detailed git file statistics: one CSV report per query, named after the query file
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_with_commit_statistics_by_author.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_with_commit_statistics_by_author.csv"
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_that_were_changed_together_with_another_file.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_that_were_changed_together_with_another_file.csv"
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_file_directories_with_commit_statistics.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_file_directories_with_commit_statistics.csv"

# Overall distribution of how many files were changed with one git commit, how many were changed with two, etc.
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cypher" > "${FULL_REPORT_DIRECTORY}/List_git_files_per_commit_distribution.csv"

# Data basis for finding out if there is a correlation between pairwise changed files and their dependencies
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"

# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

echo "GitHistoryCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."

0 commit comments

Comments
 (0)