Skip to content

Commit f70210f

Browse files
authored
Merge pull request #362 from JohT/feature/pairwise-changes-vs-dependency-weights
Compare pairwise changed files with their dependency weights
2 parents 3f8593d + 7e58869 commit f70210f

5 files changed

+137
-3
lines changed

cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
MATCH (global_git_commit:Git:Commit)
44
WITH count(global_git_commit) AS globalCommitCount
5-
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
6+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
67
WHERE git_file.deletedAt IS NULL
78
// Order files to ensure that pairs of distinct files are grouped together as (fileA, fileB) without the reversed duplicate (fileB, fileA)
89
ORDER BY git_commit.sha, git_file.relativePath

cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
MATCH (global_git_commit:Git:Commit)
44
WITH count(global_git_commit) AS globalCommitCount
5-
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
5+
// Match every change of a commit (any :Git:Change node) together with the file it updates.
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
66
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
77
WHERE git_file.deletedAt IS NULL
88
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath

cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
MATCH (global_git_commit:Git:Commit)
44
WITH count(global_git_commit) AS globalCommitCount
5-
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[:UPDATES]->(git_file:Git:File)
66
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
77
WHERE git_file.deletedAt IS NULL
88
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// List pairs of files that were changed together and that have a declared dependency between them.
// Returns one row per matched combination of a DEPENDS_ON and a CHANGED_TOGETHER_WITH relationship with:
// - dependencyWeight: the weight of the dependency, or its cardinality when no weight is set
// - commitCount: the commitCount property of the CHANGED_TOGETHER_WITH relationship
// - fileDistanceAsFewestChangeDirectoryCommands: the property of the same name on the dependency
2+
3+
MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
4+
// The pairwise change is matched in either direction between the same two files.
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
5+
// Keeps only rows where the depending file's elementId sorts before that of the file it depends on.
// NOTE(review): DEPENDS_ON is matched directed, so this also drops every dependency whose source file
// has the larger elementId - confirm that this is intended.
WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
6+
WITH firstCodeFile.fileName AS firstFileName
7+
,secondCodeFile.fileName AS secondFileName
8+
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
9+
,pairwiseChange.commitCount AS commitCount
10+
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
11+
RETURN dependencyWeight
12+
,commitCount
13+
,fileDistanceAsFewestChangeDirectoryCommands
14+
// ,count(*) AS occurrences
15+
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
16+
ORDER BY dependencyWeight, commitCount
17+
18+
// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
19+
// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
20+
// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
21+
// RETURN firstCodeFile.fileName AS firstFileName
22+
// ,secondCodeFile.fileName AS secondFileName
23+
// ,dependency.weight AS dependencyWeight
24+
// ,pairwiseChange.commitCount AS commitCount
25+
// ORDER BY dependencyWeight, commitCount
26+
27+
// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
28+
// WITH count(DISTINCT relation) AS relatedFilesCount
29+
// ,collect(DISTINCT relation) AS relations
30+
// UNWIND relations AS relation
31+
// WITH relatedFilesCount
32+
// ,coalesce(relation.commitCount, 0) AS commitCount
33+
// ,coalesce(relation.weight, 0) AS dependencyWeight
34+
// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
35+
// RETURN dependencyWeight
36+
// ,commitCount
37+
// ,fileDistanceAsFewestChangeDirectoryCommands
38+
// ORDER BY dependencyWeight, commitCount

jupyter/GitHistoryGeneral.ipynb

+95
Original file line numberDiff line numberDiff line change
@@ -1281,6 +1281,101 @@
12811281
" figure.show(**plotly_treemap_figure_show_settings)"
12821282
]
12831283
},
1284+
{
1285+
"cell_type": "markdown",
1286+
"id": "c15669ef",
1287+
"metadata": {},
1288+
"source": [
1289+
"## Pairwise Changed Files vs. Dependency Weight\n",
1290+
"\n",
1291+
"This section explores the correlation between how often pairs of files are changed together (the number of commits they have in common) and their dependency weight. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging for the reasons listed below.\n",
1292+
"\n",
1293+
"### Considerations\n",
1294+
"- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n",
1295+
"- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n",
1296+
"- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n",
1297+
"- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may only update configuration files for dependency changes."
1298+
]
1299+
},
1300+
{
1301+
"cell_type": "markdown",
1302+
"id": "98a2feea",
1303+
"metadata": {},
1304+
"source": [
1305+
"#### Data Preview"
1306+
]
1307+
},
1308+
{
1309+
"cell_type": "code",
1310+
"execution_count": null,
1311+
"id": "a067f8e6",
1312+
"metadata": {},
1313+
"outputs": [],
1314+
"source": [
1315+
"pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n",
1316+
"pairwise_changed_git_files_with_dependencies.head(20)"
1317+
]
1318+
},
1319+
{
1320+
"cell_type": "markdown",
1321+
"id": "01db2db9",
1322+
"metadata": {},
1323+
"source": [
1324+
"#### Data Statistics"
1325+
]
1326+
},
1327+
{
1328+
"cell_type": "code",
1329+
"execution_count": null,
1330+
"id": "9fe48db8",
1331+
"metadata": {},
1332+
"outputs": [],
1333+
"source": [
1334+
"display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n",
1335+
"display(pairwise_changed_git_files_with_dependencies.describe())\n",
1336+
"\n",
1337+
"display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n",
1338+
"display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n",
1339+
"\n",
1340+
"display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n",
1341+
"display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))\n",
1342+
"\n",
1343+
"from scipy.stats import pearsonr, spearmanr\n",
1344+
"\n",
1345+
"display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n",
1346+
"display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
1347+
"\n",
1348+
"display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n",
1349+
"display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
1350+
]
1351+
},
1352+
{
1353+
"cell_type": "code",
1354+
"execution_count": null,
1355+
"id": "747f9590",
1356+
"metadata": {},
1357+
"outputs": [],
1358+
"source": [
1359+
"# Scatter plot of all pairs of files with their commit count on the x axis and dependency weight on the y axis\n",
1360+
"\n",
1361+
"if pairwise_changed_git_files_with_dependencies.empty:\n",
1362+
" print(\"No data to plot\")\n",
1363+
"else:\n",
1364+
" figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n",
1365+
" x=pairwise_changed_git_files_with_dependencies['commitCount'], \n",
1366+
" y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n",
1367+
" mode='markers',\n",
1368+
" # marker=dict(size=pairwise_changed_git_files_with_dependencies['occurrences'] + 8)\n",
1369+
" ))\n",
1370+
" figure.update_layout(\n",
1371+
" **plotly_bar_layout_base_settings,\n",
1372+
" title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n",
1373+
" xaxis_title='commit count',\n",
1374+
" yaxis_title='dependency weight',\n",
1375+
" )\n",
1376+
" figure.show(**plotly_treemap_figure_show_settings)"
1377+
]
1378+
},
12841379
{
12851380
"cell_type": "markdown",
12861381
"id": "14e87aff",

0 commit comments

Comments
 (0)