|
1281 | 1281 | " figure.show(**plotly_treemap_figure_show_settings)"
|
1282 | 1282 | ]
|
1283 | 1283 | },
|
| 1284 | + { |
| 1285 | + "cell_type": "markdown", |
| 1286 | + "id": "c15669ef", |
| 1287 | + "metadata": {}, |
| 1288 | + "source": [ |
| 1289 | + "## Pairwise Changed Files vs. Dependency Weight\n", |
| 1290 | + "\n", |
| 1291 | + "This section explores the correlation between how often a pair of files is changed together (the number of commits that contain both files) and the dependency weight between them. Note that these results should be interpreted cautiously, as comparing pairwise changes and dependencies is inherently challenging.\n", |
| 1292 | + "\n", |
| 1293 | + "### Considerations\n", |
| 1294 | + "- **Historical vs. Current State**: Pairwise changes reflect the entire git history, while dependency weight represents the current state of the codebase.\n", |
| 1295 | + "- **Commit Granularity**: Developers may use different commit strategies, such as squashing changes into a single commit or creating fine-grained commits. Ideally, each commit should represent a single semantic change for accurate analysis.\n", |
| 1296 | + "- **Dependency Representation**: Some file types (e.g., Java files with import statements) clearly define dependencies, while others (e.g., shell scripts, XML, YAML) lack explicit dependency relationships.\n", |
| 1297 | + "- **Repository Characteristics**: Repositories with generated code may have many large commits, while stabilized repositories may mostly receive commits that only touch configuration files (e.g., to update the versions of external dependencies)." |
| 1298 | + ] |
| 1299 | + }, |
| 1300 | + { |
| 1301 | + "cell_type": "markdown", |
| 1302 | + "id": "98a2feea", |
| 1303 | + "metadata": {}, |
| 1304 | + "source": [ |
| 1305 | + "#### Data Preview" |
| 1306 | + ] |
| 1307 | + }, |
| 1308 | + { |
| 1309 | + "cell_type": "code", |
| 1310 | + "execution_count": null, |
| 1311 | + "id": "a067f8e6", |
| 1312 | + "metadata": {}, |
| 1313 | + "outputs": [], |
| 1314 | + "source": [ |
| 1315 | + "pairwise_changed_git_files_with_dependencies = query_cypher_to_data_frame(\"../cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher\")\n", |
| 1316 | + "pairwise_changed_git_files_with_dependencies.head(20)" |
| 1317 | + ] |
| 1318 | + }, |
| 1319 | + { |
| 1320 | + "cell_type": "markdown", |
| 1321 | + "id": "01db2db9", |
| 1322 | + "metadata": {}, |
| 1323 | + "source": [ |
| 1324 | + "#### Data Statistics" |
| 1325 | + ] |
| 1326 | + }, |
| 1327 | + { |
| 1328 | + "cell_type": "code", |
| 1329 | + "execution_count": null, |
| 1330 | + "id": "9fe48db8", |
| 1331 | + "metadata": {}, |
| 1332 | + "outputs": [], |
| 1333 | + "source": [ |
| 1334 | + "from scipy.stats import pearsonr, spearmanr\n", |
| 1335 | + "\n", |
| 1336 | + "# pearsonr needs at least two observations; skip the statistics instead of failing on an empty or single-row result.\n", |
| 1337 | + "if len(pairwise_changed_git_files_with_dependencies) < 2:\n", |
| 1338 | + "    print(\"Not enough data to calculate statistics\")\n", |
| 1339 | + "else:\n", |
| 1340 | + "    display(\"Pairwise changed git files compared to dependency weights - Overall statistics\")\n", |
| 1341 | + "    display(pairwise_changed_git_files_with_dependencies.describe())\n", |
| 1342 | + "    display(\"Pairwise changed git files compared to dependency weights - Pearson Correlation\")\n", |
| 1343 | + "    display(pairwise_changed_git_files_with_dependencies.corr(method='pearson'))\n", |
| 1344 | + "    display(\"Pairwise changed git files compared to dependency weights - Spearman Correlation\")\n", |
| 1345 | + "    display(pairwise_changed_git_files_with_dependencies.corr(method='spearman'))\n", |
| 1346 | + "    display(\"Pearson Correlation with p-value for commitCount and dependencyWeight\")\n", |
| 1347 | + "    display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n", |
| 1348 | + "    display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n", |
| 1349 | + "    display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))" |
| 1350 | + ] |
| 1351 | + }, |
| 1352 | + { |
| 1353 | + "cell_type": "code", |
| 1354 | + "execution_count": null, |
| 1355 | + "id": "747f9590", |
| 1356 | + "metadata": {}, |
| 1357 | + "outputs": [], |
| 1358 | + "source": [ |
| 1359 | + "# Scatter plot: one marker per pair of files, commit count on the x axis and dependency weight on the y axis\n", |
| 1360 | + "\n", |
| 1361 | + "# Only draw the figure when the query result is not empty\n", |
| 1362 | + "if not pairwise_changed_git_files_with_dependencies.empty:\n", |
| 1363 | + "    figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n", |
| 1364 | + "        x=pairwise_changed_git_files_with_dependencies['commitCount'],\n", |
| 1365 | + "        y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n", |
| 1366 | + "        mode='markers',\n", |
| 1367 | + "    ))\n", |
| 1368 | + "    figure.update_layout(\n", |
| 1369 | + "        **plotly_bar_layout_base_settings,\n", |
| 1370 | + "        title='Pairwise changed files: Number of changes (commitCount) vs. dependency weight',\n", |
| 1371 | + "        xaxis_title='commit count',\n", |
| 1372 | + "        yaxis_title='dependency weight',\n", |
| 1373 | + "    )\n", |
| 1374 | + "    figure.show(**plotly_treemap_figure_show_settings)\n", |
| 1375 | + "else:\n", |
| 1376 | + "    print(\"No data to plot\")" |
| 1377 | + ] |
| 1378 | + }, |
1284 | 1379 | {
|
1285 | 1380 | "cell_type": "markdown",
|
1286 | 1381 | "id": "14e87aff",
|
|
0 commit comments