|
24 | 24 | "source": [
|
25 | 25 | "import os\n",
|
26 | 26 | "import pandas as pd\n",
|
| 27 | + "pd.options.mode.copy_on_write = True\n", |
| 28 | + "\n", |
27 | 29 | "from neo4j import GraphDatabase"
|
28 | 30 | ]
|
29 | 31 | },
|
|
37 | 39 | "import matplotlib.pyplot as plot\n",
|
38 | 40 | "import numpy as np\n",
|
39 | 41 | "import plotly.express as plotly_express\n",
|
40 |
| - "import plotly.io as plotly_io" |
| 42 | + "from plotly import graph_objects as plotly_graph_objects" |
41 | 43 | ]
|
42 | 44 | },
|
43 | 45 | {
|
|
185 | 187 | ]
|
186 | 188 | },
|
187 | 189 | {
|
188 |
| - "cell_type": "code", |
189 |
| - "execution_count": null, |
190 |
| - "id": "83077395", |
| 190 | + "cell_type": "markdown", |
| 191 | + "id": "01da524e", |
191 | 192 | "metadata": {},
|
192 |
| - "outputs": [], |
193 | 193 | "source": [
|
194 |
| - "def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n", |
195 |
| - " \"\"\"\n", |
196 |
| - " Limits the values of the given column in the input data frame to the given quantile.\n", |
197 |
| - " The values are not filtered out but set to the limited (integer quantile value).\n", |
198 |
| - " input_data_frame : pd.DataFrame : The input data frame\n", |
199 |
| - " column_name : str : The name of the column to limit\n", |
200 |
| - " quantile : float : The quantile to limit the values to (default: 0.95)\n", |
201 |
| - " return : pd.DataFrame : The modified dataframe with the added column (column_name + '_limited')\n", |
202 |
| - " \"\"\"\n", |
203 |
| - " data_frame=input_data_frame.copy()\n", |
204 |
| - " column_values = data_frame[column_name]\n", |
205 |
| - " column_limit = column_values.quantile(quantile)\n", |
206 |
| - " data_frame[column_name + '_limited'] = np.where(column_values > column_limit, column_limit, column_values)\n", |
207 |
| - " return data_frame" |
| 194 | + "### Treemap Layout Functions and Constants" |
208 | 195 | ]
|
209 | 196 | },
|
210 | 197 | {
|
|
218 | 205 | "\n",
|
219 | 206 | "plotly_treemap_base_settings = dict(\n",
|
220 | 207 | " color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n",
|
221 |
| - " path=[plotly_express.Constant(\"root\"), 'gitRepositoryName', 'directoryParentName', 'directoryName'],\n", |
| 208 | + " # path=[plotly_express.Constant(\"root\"), 'gitRepositoryName', 'directoryParentName', 'directoryName'],\n", |
| 209 | + " path=['gitRepositoryName', 'directoryParentName', 'directoryName'],\n", |
222 | 210 | " maxdepth=-1\n",
|
223 | 211 | ")\n",
|
224 | 212 | "plotly_treemap_traces_base_settings = dict(\n",
|
|
231 | 219 | ")\n",
|
232 | 220 | "plotly_treemap_figure_base_settings = dict(\n",
|
233 | 221 | " renderer=\"svg\" if is_command_line_execution() else None,\n",
|
234 |
| - " width=1000,\n", |
| 222 | + " width=2000, #1000\n", |
235 | 223 | " height=550\n",
|
236 | 224 | ")"
|
237 | 225 | ]
|
|
262 | 250 | ]
|
263 | 251 | },
|
264 | 252 | {
|
265 |
| - "cell_type": "markdown", |
266 |
| - "id": "acacc415", |
| 253 | + "cell_type": "code", |
| 254 | + "execution_count": null, |
| 255 | + "id": "b8cc624a", |
267 | 256 | "metadata": {},
|
| 257 | + "outputs": [], |
268 | 258 | "source": [
|
269 |
| - "### Data Preview" |
| 259 | + "def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame) -> plotly_graph_objects.Treemap:\n", |
| 260 | + "    \"\"\"\n", |
| 261 | + "    Creates a Plotly Treemap trace with shared base settings from the columns of the given data frame.\n", |
| 262 | + "    data_frame : pd.DataFrame : The input data frame (all labels, parents, ids and hover data are read from it, not from a global variable)\n", |
| 263 | + "    return : plotly_graph_objects.Treemap : The prepared Plotly Treemap\n", |
| 264 | + "    \"\"\"\n", |
| 265 | + "    return plotly_graph_objects.Treemap(\n", |
| 266 | + "        labels=data_frame['directoryPathName'],\n", |
| 267 | + "        parents=data_frame['directoryParentPath'],\n", |
| 268 | + "        ids=data_frame['gitDirectoryPath'],\n", |
| 269 | + "        customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'latestCommitDate', 'daysSinceLatestCommit', 'latestCreationDate', 'daysSinceLatestCreation', 'latestModificationDate', 'daysSinceLatestModification', 'gitDirectoryPath']],\n", |
| 270 | + "        hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Latest Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>Path: %{customdata[9]}',\n", |
| 271 | + "        maxdepth=-1,\n", |
| 272 | + "        root_color=\"lightgrey\",\n", |
| 273 | + "        marker=dict(cornerradius=5),\n", |
| 274 | + "    )" |
270 | 275 | ]
|
271 | 276 | },
|
272 | 277 | {
|
273 |
| - "cell_type": "code", |
274 |
| - "execution_count": null, |
275 |
| - "id": "9c9de7c5", |
| 278 | + "cell_type": "markdown", |
| 279 | + "id": "acacc415", |
276 | 280 | "metadata": {},
|
277 |
| - "outputs": [], |
278 | 281 | "source": [
|
279 |
| - "git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\n", |
280 |
| - " \"../cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher\", limit=70).sort_values(by=\"fileCount\", ascending=False)\n", |
281 |
| - "git_file_directories_with_commit_statistics.sort_values(by=\"commitCount\", ascending=False).head(20)" |
| 282 | + "### Data Preparation Functions" |
282 | 283 | ]
|
283 | 284 | },
|
284 | 285 | {
|
285 |
| - "cell_type": "markdown", |
286 |
| - "id": "80338c9c", |
| 286 | + "cell_type": "code", |
| 287 | + "execution_count": null, |
| 288 | + "id": "83077395", |
287 | 289 | "metadata": {},
|
| 290 | + "outputs": [], |
288 | 291 | "source": [
|
289 |
| - "### Null Checks" |
| 292 | + "def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n", |
| 293 | + "    \"\"\"\n", |
| 294 | + "    Adds a capped variant of a column: values above the given quantile are replaced by the quantile value.\n", |
| 295 | + "    No rows are removed and the input data frame is left untouched.\n", |
| 296 | + "    input_data_frame : pd.DataFrame : The input data frame\n", |
| 297 | + "    column_name : str : The name of the column to limit\n", |
| 298 | + "    quantile : float : The quantile to limit the values to (default: 0.95)\n", |
| 299 | + "    return : pd.DataFrame : A copy of the input data frame with the added column (column_name + '_limited')\n", |
| 300 | + "    \"\"\"\n", |
| 301 | + "    original_values = input_data_frame[column_name]\n", |
| 302 | + "    upper_limit = original_values.quantile(quantile)\n", |
| 303 | + "    result = input_data_frame.copy()\n", |
| 304 | + "    result[column_name + '_limited'] = np.where(original_values > upper_limit, upper_limit, original_values)\n", |
| 305 | + "    return result" |
290 | 306 | ]
|
291 | 307 | },
|
292 | 308 | {
|
293 | 309 | "cell_type": "code",
|
294 | 310 | "execution_count": null,
|
295 |
| - "id": "6f95993e", |
| 311 | + "id": "009a7222", |
296 | 312 | "metadata": {},
|
297 | 313 | "outputs": [],
|
298 | 314 | "source": [
|
299 |
| - "# Null values in the DataFrame\n", |
300 |
| - "git_file_directories_with_commit_statistics.isnull().sum() " |
| 315 | + "def filter_out_non_existing_parent_ids(data_frame: pd.DataFrame, parent_column: str, id_column: str) -> pd.DataFrame:\n", |
| 316 | + "    \"\"\"\n", |
| 317 | + "    Keeps only the rows whose parent ID is empty or appears in the ID column. All other rows are dropped.\n", |
| 318 | + "    data_frame : pd.DataFrame : The input data frame\n", |
| 319 | + "    parent_column : str : The name of the parent column\n", |
| 320 | + "    id_column : str : The name of the ID column\n", |
| 321 | + "    return : pd.DataFrame : The filtered data frame\n", |
| 322 | + "    \"\"\"\n", |
| 323 | + "    list_of_ids = data_frame[id_column].tolist() + ['']\n", |
| 324 | + "    has_known_parent = data_frame[parent_column].isin(list_of_ids)\n", |
| 325 | + "    # Report dropped rows only if there are any, so a clean run does not emit an empty table\n", |
| 326 | + "    if not has_known_parent.all():\n", |
| 327 | + "        display(\"Filtered out rows with non-existing parent IDs:\", data_frame[~has_known_parent])\n", |
| 328 | + "    return data_frame[has_known_parent]\n", |
| 329 | + "\n", |
| 330 | + "def replace_empty_parent_by_repository_name(data_frame: pd.DataFrame, column_name: str, repository_column_name: str = '') -> pd.DataFrame:\n", |
| 331 | + "    \"\"\"\n", |
| 332 | + "    Replaces empty (or missing) values in the given column by the repository name of the same row.\n", |
| 333 | + "    data_frame : pd.DataFrame : The input data frame\n", |
| 334 | + "    column_name : str : The name of the column whose empty values get replaced\n", |
| 335 | + "    repository_column_name : str : The name of the column that contains the value to be used instead of an empty entry\n", |
| 336 | + "    return : pd.DataFrame : A new data frame with the replaced column (the input data frame is not modified)\n", |
| 337 | + "    \"\"\"\n", |
| 338 | + "    repository_names = data_frame[repository_column_name]\n", |
| 339 | + "    # np.nan instead of np.NaN: the capitalized alias was removed in NumPy 2.0\n", |
| 340 | + "    filled_column = data_frame[column_name].replace(\"\", np.nan).fillna(repository_names)\n", |
| 341 | + "    return data_frame.assign(**{column_name: filled_column})\n", |
| 342 | + "\n", |
| 343 | + "def prepare_treemap_commit_statistics_data(data_frame: pd.DataFrame) -> pd.DataFrame:\n", |
| 344 | + "    \"\"\"\n", |
| 345 | + "    Prepares the commit statistics for the treemap: drops rows with an unknown parent and replaces empty parents by the repository name.\n", |
| 346 | + "    data_frame : pd.DataFrame : The input data frame\n", |
| 347 | + "    return : pd.DataFrame : The data frame prepared for treemap visualization\n", |
| 348 | + "    \"\"\"\n", |
| 349 | + "    prepared_data = filter_out_non_existing_parent_ids(data_frame, 'directoryParentPath', 'gitDirectoryPath')\n", |
| 350 | + "    prepared_data = replace_empty_parent_by_repository_name(prepared_data, 'directoryParentPath', 'gitRepositoryName')\n", |
| 351 | + "    return prepared_data" |
301 | 352 | ]
|
302 | 353 | },
|
303 | 354 | {
|
304 | 355 | "cell_type": "markdown",
|
305 |
| - "id": "f02b5d42", |
| 356 | + "id": "0b717f80", |
306 | 357 | "metadata": {},
|
307 | 358 | "source": [
|
308 | 359 | "### Function to split file path levels"
|
|
311 | 362 | {
|
312 | 363 | "cell_type": "code",
|
313 | 364 | "execution_count": null,
|
314 |
| - "id": "906f2ab6", |
| 365 | + "id": "6581ec23", |
315 | 366 | "metadata": {},
|
316 | 367 | "outputs": [],
|
317 | 368 | "source": [
|
| 369 | + "# TODO Still needed?\n", |
| 370 | + "\n", |
318 | 371 | "def fill_array_to_length(length: int, fill_value=''):\n",
|
319 | 372 | " \"\"\"\n",
|
320 | 373 | " Fills the input array with the given fill value to the given length.\n",
|
|
351 | 404 | },
|
352 | 405 | {
|
353 | 406 | "cell_type": "markdown",
|
354 |
| - "id": "ece07655", |
| 407 | + "id": "2d0df211", |
355 | 408 | "metadata": {},
|
356 | 409 | "source": [
|
357 |
| - "### Directories by file count" |
| 410 | + "### Data Preview" |
358 | 411 | ]
|
359 | 412 | },
|
360 | 413 | {
|
361 | 414 | "cell_type": "code",
|
362 | 415 | "execution_count": null,
|
363 |
| - "id": "8a3725c0", |
| 416 | + "id": "9c9de7c5", |
364 | 417 | "metadata": {},
|
365 | 418 | "outputs": [],
|
366 | 419 | "source": [
|
367 |
| - "git_file_directories_with_commit_statistics_with_levels, level_column_names = add_file_path_levels(git_file_directories_with_commit_statistics, 'gitDirectoryPath', delimiter='/')\n", |
368 |
| - "#display(git_file_directories_with_commit_statistics_with_levels.head(13))\n", |
| 420 | + "git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_directories_with_commit_statistics_new.cypher\")\n", |
| 421 | + "git_file_directories_with_commit_statistics = prepare_treemap_commit_statistics_data(git_file_directories_with_commit_statistics)\n", |
369 | 422 | "\n",
|
370 |
| - "figure = plotly_express.treemap(\n", |
371 |
| - " git_file_directories_with_commit_statistics_with_levels, \n", |
372 |
| - " # path=['gitRepositoryName', 'directoryParentName', 'directoryName'], \n", |
373 |
| - " path=['gitRepositoryName'] + level_column_names,\n", |
374 |
| - " hover_data=['gitDirectoryPath', 'commitCount', 'authorCount', 'latestCommitDate', 'daysSinceLatestCommit', 'latestCreationDate', 'daysSinceLatestCreation', 'latestModificationDate', 'daysSinceLatestModification'],\n", |
375 |
| - " values='fileCount', \n", |
| 423 | + "# Show a preview of the 10 directories with the highest file count\n", |
| 424 | + "git_file_directories_with_commit_statistics.sort_values(by=\"fileCount\", ascending=False).head(10)" |
| 425 | + ] |
| 426 | + }, |
| 427 | + { |
| 428 | + "cell_type": "markdown", |
| 429 | + "id": "80338c9c", |
| 430 | + "metadata": {}, |
| 431 | + "source": [ |
| 432 | + "### Null Checks" |
| 433 | + ] |
| 434 | + }, |
| 435 | + { |
| 436 | + "cell_type": "code", |
| 437 | + "execution_count": null, |
| 438 | + "id": "6f95993e", |
| 439 | + "metadata": {}, |
| 440 | + "outputs": [], |
| 441 | + "source": [ |
| 442 | + "# Null values in the DataFrame\n", |
| 443 | + "git_file_directories_with_commit_statistics.isnull().sum() " |
| 444 | + ] |
| 445 | + }, |
| 446 | + { |
| 447 | + "cell_type": "markdown", |
| 448 | + "id": "ccc11f52", |
| 449 | + "metadata": {}, |
| 450 | + "source": [ |
| 451 | + "### Directories by file count" |
| 452 | + ] |
| 453 | + }, |
| 454 | + { |
| 455 | + "cell_type": "code", |
| 456 | + "execution_count": null, |
| 457 | + "id": "19d108bb", |
| 458 | + "metadata": {}, |
| 459 | + "outputs": [], |
| 460 | + "source": [ |
| 461 | + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", |
| 462 | + " create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n", |
| 463 | + " values = git_file_directories_with_commit_statistics['fileCount'],\n", |
| 464 | + " #marker=dict(cornerradius=5, colors=git_file_directories_with_commit_statistics['daysSinceLatestCommit'], colorscale='Hot_r'),\n", |
| 465 | + "))\n", |
| 466 | + "figure.update_layout(\n", |
| 467 | + " **plotly_treemap_layout_base_settings,\n", |
376 | 468 | " title='Directories and their file count'\n",
|
377 | 469 | ")\n",
|
378 |
| - "figure.update_traces(**plotly_treemap_traces_base_settings)\n", |
379 |
| - "figure.update_layout(**plotly_treemap_layout_base_settings)\n", |
380 |
| - "figure.show(**plotly_treemap_figure_base_settings)" |
| 470 | + "figure.show(**plotly_treemap_figure_base_settings)  # applies the svg renderer for command line execution plus the configured width and height" |
381 | 471 | ]
|
382 | 472 | },
|
383 | 473 | {
|
|
0 commit comments