Skip to content

Commit 8cc715c

Browse files
authored
Merge pull request #352 from JohT/feature/improve-git-history-treemap-visualizations
Improve git history treemap visualizations and uncover pairwise changed files
2 parents b43fd72 + 10e202e commit 8cc715c

8 files changed

+299
-2
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
6+
MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
7+
CALL (firstCodeFile, secondCodeFile, gitChange) {
8+
MERGE (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
9+
SET pairwiseChange = properties(gitChange)
10+
} IN TRANSACTIONS
11+
RETURN count(*) AS pairCount
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
6+
WHERE git_file.deletedAt IS NULL
7+
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without (fileB, fileA)
8+
ORDER BY git_commit.sha, git_file.relativePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT git_file) AS filesInCommit
12+
// Limit the file count to min. 2 (changed together) and
13+
// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
14+
WHERE size(filesInCommit) >= 2
15+
AND size(filesInCommit) <= 50
16+
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
17+
WITH globalCommitCount
18+
,commitHash
19+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
20+
UNWIND fileCombinations AS fileCombination
21+
WITH globalCommitCount
22+
,fileCombination
23+
,count(DISTINCT commitHash) AS commitCount
24+
,collect(DISTINCT commitHash) AS commitHashes
25+
// Filter out file pairs that were not changed together very often
26+
// In detail: Keep only pairs that were changed together in more than 1 per mille (0.1 percent) of all commits
27+
WHERE commitCount > globalCommitCount * 0.001
28+
WITH fileCombination[0] AS firstFile
29+
,fileCombination[1] AS secondFile
30+
,commitCount
31+
,commitHashes
32+
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the properties "commitCount" and "commitHashes" on it
33+
CALL (firstFile, secondFile, commitCount, commitHashes) {
34+
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
35+
SET pairwiseChange.commitCount = commitCount
36+
,pairwiseChange.commitHashes = commitHashes
37+
} IN TRANSACTIONS
38+
// Return one row with some statistics about the found pairs and their commit counts
39+
RETURN max(commitCount) AS maxCommitCount
40+
,avg(commitCount) AS avgCommitCount
41+
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
42+
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
43+
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
44+
,count(*) AS pairCount
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// List git files that were changed together frequently. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
6+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
7+
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
8+
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
9+
,gitChange.commitCount AS commitCount
10+
ORDER BY commitCount DESC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// List git files that were changed together frequently
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
6+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7+
WHERE git_file.deletedAt IS NULL
8+
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT filePath) AS filesInCommit
12+
WHERE size(filesInCommit) >= 2
13+
AND size(filesInCommit) <= 50
14+
WITH globalCommitCount
15+
,commitHash
16+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17+
UNWIND fileCombinations AS fileCombination
18+
WITH globalCommitCount
19+
,apoc.coll.sort(fileCombination) AS fileCombination
20+
,count(DISTINCT commitHash) AS commitCount
21+
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
22+
RETURN fileCombination[0] AS firstFile
23+
,fileCombination[1] AS secondFile
24+
,commitCount
25+
ORDER BY commitCount DESC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
5+
UNWIND gitChange.commitHashes AS commitHash
6+
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
7+
,count(DISTINCT commitHash) AS commitCount
8+
ORDER BY commitCount DESC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// List git files that were changed together frequently
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
6+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7+
WHERE git_file.deletedAt IS NULL
8+
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT filePath) AS filesInCommit
12+
WHERE size(filesInCommit) >= 2
13+
AND size(filesInCommit) <= 50
14+
WITH globalCommitCount
15+
,commitHash
16+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17+
UNWIND fileCombinations AS fileCombination
18+
UNWIND fileCombination AS filePath
19+
WITH globalCommitCount
20+
,filePath
21+
,count(DISTINCT commitHash) AS commitCount
22+
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
23+
RETURN filePath
24+
,commitCount
25+
ORDER BY commitCount DESC

jupyter/GitHistoryGeneral.ipynb

+172-2
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,8 @@
208208
")\n",
209209
"plotly_treemap_figure_show_settings = dict(\n",
210210
" renderer=\"svg\" if is_command_line_execution() else None,\n",
211-
" width=1000,\n",
212-
" height=800\n",
211+
" width=1080,\n",
212+
" height=1080\n",
213213
")\n",
214214
"\n",
215215
"plotly_treemap_marker_base_style = dict(\n",
@@ -766,6 +766,89 @@
766766
"figure.show(**plotly_treemap_figure_show_settings)"
767767
]
768768
},
769+
{
770+
"cell_type": "markdown",
771+
"id": "485b5194",
772+
"metadata": {},
773+
"source": [
774+
"### Directories with very few different authors"
775+
]
776+
},
777+
{
778+
"cell_type": "code",
779+
"execution_count": null,
780+
"id": "3175be23",
781+
"metadata": {},
782+
"outputs": [],
783+
"source": [
784+
"git_commit_authors_per_directory_low_focus = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.33)\n",
785+
"\n",
786+
"author_count_top_limit = git_commit_authors_per_directory_low_focus['authorCount_limited'].max().astype(int).astype(str)\n",
787+
"author_count_top_limit_label_alias = {author_count_top_limit: author_count_top_limit + ' or more'}\n",
788+
"\n",
789+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
790+
" create_treemap_commit_statistics_settings(git_commit_authors_per_directory_low_focus),\n",
791+
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
792+
" # values = git_commit_authors_per_directory['fileCount'],\n",
793+
" marker=dict(\n",
794+
" **plotly_treemap_marker_base_colorscale,\n",
795+
" colors=git_commit_authors_per_directory_low_focus['authorCount_limited'], \n",
796+
" colorbar=dict(title=\"Authors\",\n",
797+
" tickmode=\"auto\",\n",
798+
" labelalias=author_count_top_limit_label_alias\n",
799+
" ),\n",
800+
" reversescale=True\n",
801+
" ),\n",
802+
"))\n",
803+
"figure.update_layout(\n",
804+
" **plotly_treemap_layout_base_settings,\n",
805+
" title='Number of distinct commit authors (red/black = only one or very few authors)',\n",
806+
")\n",
807+
"\n",
808+
"figure.show(**plotly_treemap_figure_show_settings)"
809+
]
810+
},
811+
{
812+
"cell_type": "code",
813+
"execution_count": null,
814+
"id": "e11947c5",
815+
"metadata": {},
816+
"outputs": [],
817+
"source": [
818+
"import plotly.graph_objects as go\n",
819+
"\n",
820+
"# Example data\n",
821+
"labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n",
822+
"parents = [\"\", \"A\", \"A\", \"B\", \"B\"]\n",
823+
"values = [10, 20, 30, 40, 50] # Color scale values\n",
824+
"max_value = max(values)\n",
825+
"\n",
826+
"# Create treemap\n",
827+
"fig = go.Figure(go.Treemap(\n",
828+
" labels=labels,\n",
829+
" parents=parents,\n",
830+
" values=values,\n",
831+
" marker=dict(\n",
832+
" colors=values,\n",
833+
" colorscale=\"Blues\",\n",
834+
" colorbar=dict(\n",
835+
" title=\"Value\",\n",
836+
" tickmode=\"auto\", # Let Plotly auto-select ticks\n",
837+
" ticklabelposition=\"outside top\",\n",
838+
" tickformat=\",\", # Use default formatting\n",
839+
" ticklabeloverflow=\"allow\", # Ensure long labels are displayed\n",
840+
" ticklabelstep=1 # Show all labels\n",
841+
" )\n",
842+
" )\n",
843+
"))\n",
844+
"\n",
845+
"# Add an alias for the highest tick value dynamically\n",
846+
"fig.update_layout(coloraxis_colorbar_tickvals=[max_value])\n",
847+
"fig.update_layout(coloraxis_colorbar_ticktext=[f\"{max_value} or more\"])\n",
848+
"\n",
849+
"fig.show()\n"
850+
]
851+
},
769852
{
770853
"cell_type": "markdown",
771854
"id": "5dbceaef",
@@ -1045,6 +1128,93 @@
10451128
"figure.show(**plotly_treemap_figure_show_settings)"
10461129
]
10471130
},
1131+
{
1132+
"cell_type": "markdown",
1133+
"id": "80bd7c28",
1134+
"metadata": {},
1135+
"source": [
1136+
"### Files changed frequently with other files"
1137+
]
1138+
},
1139+
{
1140+
"cell_type": "code",
1141+
"execution_count": null,
1142+
"id": "24055998",
1143+
"metadata": {},
1144+
"outputs": [],
1145+
"source": [
1146+
"pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n",
1147+
"\n",
1148+
"# Debug\n",
1149+
"# display(\"1. pairwise changed files --------------\")\n",
1150+
"# display(pairwise_changed_git_files)\n",
1151+
"\n",
1152+
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
1153+
"pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')\n",
1154+
"\n",
1155+
"# Debug\n",
1156+
"# display(\"2. added directories --------------\")\n",
1157+
"# display(pairwise_changed_git_files)\n",
1158+
"\n",
1159+
"# Group the git files by their directory, sum up the pairwise commit counts and count the number of files of each directory (across all levels).\n",
1160+
"pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n",
1161+
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
1162+
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
1163+
")\n",
1164+
"pairwise_changed_git_files.reset_index(inplace=True)\n",
1165+
"\n",
1166+
"# Debug\n",
1167+
"# display(\"3. after grouping --------------\")\n",
1168+
"# display(pairwise_changed_git_files)\n",
1169+
"\n",
1170+
"pairwise_changed_git_files = pd.merge(\n",
1171+
" git_files_with_commit_statistics, \n",
1172+
" pairwise_changed_git_files, \n",
1173+
" left_on='directoryPath', \n",
1174+
" right_on=\"directoryPath\",\n",
1175+
" how=\"left\",\n",
1176+
" validate=\"m:1\"\n",
1177+
")\n",
1178+
"\n",
1179+
"# Debug\n",
1180+
"# display(\"4. after merging --------------\")\n",
1181+
"# display(pairwise_changed_git_files)\n",
1182+
"\n",
1183+
"pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
1184+
"pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
1185+
"pairwise_changed_git_files.reset_index(inplace=True)\n",
1186+
"\n",
1187+
"# Debug\n",
1188+
"# display(\"5. after NaN fill --------------\")\n",
1189+
"# display(pairwise_changed_git_files)"
1190+
]
1191+
},
1192+
{
1193+
"cell_type": "code",
1194+
"execution_count": null,
1195+
"id": "19b5a98a",
1196+
"metadata": {},
1197+
"outputs": [],
1198+
"source": [
1199+
"pairwise_changed_git_files = add_quantile_limited_column(pairwise_changed_git_files, \"pairwiseChangeCommitCount\", 0.98)\n",
1200+
"\n",
1201+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
1202+
" create_treemap_commit_statistics_settings(pairwise_changed_git_files),\n",
1203+
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
1204+
" # values = pairwise_changed_git_files['fileCount'],\n",
1205+
" marker=dict(\n",
1206+
" **plotly_treemap_marker_base_colorscale,\n",
1207+
" colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'], \n",
1208+
" colorbar=dict(title=\"Changes\"),\n",
1209+
" ),\n",
1210+
"))\n",
1211+
"figure.update_layout(\n",
1212+
" **plotly_treemap_layout_base_settings,\n",
1213+
" title='Pairwise file changes',\n",
1214+
")\n",
1215+
"figure.show(**plotly_treemap_figure_show_settings)"
1216+
]
1217+
},
10481218
{
10491219
"cell_type": "markdown",
10501220
"id": "d8c6ccee",

scripts/importGit.sh

+4
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ commonPostGitImport() {
125125
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher"
126126
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher"
127127

128+
echo "importGit: Creating relationships to file nodes that where changed together..."
129+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher"
130+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher"
131+
128132
# Since it's currently not possible to rule out ambiguity in git<->code file matching,
129133
# the following verifications are only an additional info in the log rather than an error.
130134
echo "importGit: Running verification queries for troubleshooting (non failing)..."

0 commit comments

Comments
 (0)