Skip to content

Commit 10e202e

Browse files
committed
Add CHANGED_TOGETHER_WITH edge for git file nodes
1 parent 30349a7 commit 10e202e

8 files changed

+103
-26
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
6+
// Resolve the second git file to its code file node
MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
// Skip pairs where both git files resolve to the same code file, since that would create a self-referencing relationship
WHERE firstCodeFile <> secondCodeFile
7+
CALL (firstCodeFile, secondCodeFile, gitChange) {
8+
MERGE (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
9+
SET pairwiseChange = properties(gitChange)
10+
} IN TRANSACTIONS
11+
RETURN count(*) AS pairCount
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
// Find every git file that was updated by a change contained in a commit
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
// Keep only files that belong to a git repository
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
6+
WHERE git_file.deletedAt IS NULL
7+
// Order files to ensure that each pair of distinct files is always grouped as (fileA, fileB) and never also as (fileB, fileA)
8+
ORDER BY git_commit.sha, git_file.relativePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT git_file) AS filesInCommit
12+
// Limit the file count to min. 2 (changed together) and
13+
// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
14+
WHERE size(filesInCommit) >= 2
15+
AND size(filesInCommit) <= 50
16+
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
17+
WITH globalCommitCount
18+
,commitHash
19+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
20+
UNWIND fileCombinations AS fileCombination
21+
WITH globalCommitCount
22+
,fileCombination
23+
,count(DISTINCT commitHash) AS commitCount
24+
,collect(DISTINCT commitHash) AS commitHashes
25+
// Filter out file pairs that were not changed together very often
26+
// In detail: Keep only pairs that were changed together in more than 0.1 percent (1 per mille) of all commits
27+
WHERE commitCount > globalCommitCount * 0.001
28+
WITH fileCombination[0] AS firstFile
29+
,fileCombination[1] AS secondFile
30+
,commitCount
31+
,commitHashes
32+
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the properties "commitCount" and "commitHashes" on it
33+
CALL (firstFile, secondFile, commitCount, commitHashes) {
34+
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
35+
SET pairwiseChange.commitCount = commitCount
36+
,pairwiseChange.commitHashes = commitHashes
37+
} IN TRANSACTIONS
38+
// Return one row with some statistics about the found pairs and their commit counts
39+
RETURN max(commitCount) AS maxCommitCount
40+
,avg(commitCount) AS avgCommitCount
41+
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
42+
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
43+
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
44+
,count(*) AS pairCount
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,10 @@
1-
// List git files that where changed together frequently
1+
// List git files that were changed together frequently. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

3-
MATCH (global_git_commit:Git:Commit)
4-
WITH count(global_git_commit) AS globalCommitCount
5-
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
6-
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7-
WHERE git_file.deletedAt IS NULL
8-
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9-
WITH globalCommitCount
10-
,git_commit.sha AS commitHash
11-
,collect(DISTINCT filePath) AS filesInCommit
12-
WHERE size(filesInCommit) >= 2
13-
AND size(filesInCommit) <= 50
14-
WITH globalCommitCount
15-
,commitHash
16-
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17-
UNWIND fileCombinations AS fileCombination
18-
WITH globalCommitCount
19-
,apoc.coll.sort(fileCombination) AS fileCombination
20-
,count(DISTINCT commitHash) AS commitCount
21-
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
22-
RETURN fileCombination[0] AS firstFile
23-
,fileCombination[1] AS secondFile
24-
,commitCount
25-
ORDER BY commitCount DESC
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
6+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
7+
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
8+
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
9+
,gitChange.commitCount AS commitCount
10+
ORDER BY commitCount DESC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// List git files that were changed together frequently
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
// Find every git file that was updated by a change contained in a commit (the owning repository is matched by the following clause)
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
6+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7+
WHERE git_file.deletedAt IS NULL
8+
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT filePath) AS filesInCommit
12+
WHERE size(filesInCommit) >= 2
13+
AND size(filesInCommit) <= 50
14+
WITH globalCommitCount
15+
,commitHash
16+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17+
UNWIND fileCombinations AS fileCombination
18+
WITH globalCommitCount
19+
,apoc.coll.sort(fileCombination) AS fileCombination
20+
,count(DISTINCT commitHash) AS commitCount
21+
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
22+
RETURN fileCombination[0] AS firstFile
23+
,fileCombination[1] AS secondFile
24+
,commitCount
25+
ORDER BY commitCount DESC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
5+
UNWIND gitChange.commitHashes AS commitHash
6+
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
7+
,count(DISTINCT commitHash) AS commitCount
8+
ORDER BY commitCount DESC

jupyter/GitHistoryGeneral.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@
802802
"))\n",
803803
"figure.update_layout(\n",
804804
" **plotly_treemap_layout_base_settings,\n",
805-
" title='Number of distinct commit authors (red/black = ony one or very few authors)',\n",
805+
" title='Number of distinct commit authors (red/black = only one or very few authors)',\n",
806806
")\n",
807807
"\n",
808808
"figure.show(**plotly_treemap_figure_show_settings)"
@@ -1143,7 +1143,7 @@
11431143
"metadata": {},
11441144
"outputs": [],
11451145
"source": [
1146-
"pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher\")\n",
1146+
"pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n",
11471147
"\n",
11481148
"# Debug\n",
11491149
"# display(\"1. pairwise changed files --------------\")\n",

scripts/importGit.sh

+4
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ commonPostGitImport() {
125125
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher"
126126
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher"
127127

128+
echo "importGit: Creating relationships to file nodes that where changed together..."
129+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher"
130+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher"
131+
128132
# Since it's currently not possible to rule out ambiguity in git<->code file matching,
129133
# the following verifications are only an additional info in the log rather than an error.
130134
echo "importGit: Running verification queries for troubleshooting (non failing)..."

0 commit comments

Comments
 (0)