|
239 | 239 | " labels=data_frame['directoryName'],\n",
|
240 | 240 | " parents=data_frame['directoryParentPath'],\n",
|
241 | 241 | " ids=data_frame['directoryPath'],\n",
|
242 |
| - " customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", |
243 |
| - " hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n", |
| 242 | + " customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", |
| 243 | + " hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n", |
244 | 244 | " maxdepth=-1,\n",
|
245 | 245 | " root_color=\"lightgrey\",\n",
|
246 | 246 | " marker=dict(**plotly_treemap_marker_base_style),\n",
|
247 | 247 | " )"
|
248 | 248 | ]
|
249 | 249 | },
|
| 250 | + { |
| 251 | + "cell_type": "code", |
| 252 | + "execution_count": null, |
| 253 | + "id": "641fa05c", |
| 254 | + "metadata": {}, |
| 255 | + "outputs": [], |
| 256 | + "source": [ |
| 257 | + "def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):\n", |
| 258 | + " \"\"\"\n", |
| 259 | + " Creates a plotly graph_objects.Treemap marker object for a colorbar representing ranked names.\n", |
| 260 | + " data_frame : pd.DataFrame : The DataFrame that contains the name and the rank column\n", |
| 261 | + " name_column : str : The name of the column containing the names used as colorbar tick labels\n", |
| 262 | + " rank_column : str : The name of the column containing the ranking \n", |
| 263 | + " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", |
| 264 | + " \"\"\"\n", |
| 265 | + " # The rank is inverted so that the first rank is shown on the top of the colorbar.\n", |
| 266 | + " inverse_ranked = data_frame[rank_column].max() + 1 - data_frame[rank_column]\n", |
| 267 | + "\n", |
| 268 | + " return dict(\n", |
| 269 | + " cornerradius=5, \n", |
| 270 | + " colors=inverse_ranked,\n", |
| 271 | + " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", |
| 272 | + " colorbar=dict(\n", |
| 273 | + " title=\"Rank\",\n", |
| 274 | + " tickmode=\"array\",\n", |
| 275 | + " ticktext=data_frame[name_column],\n", |
| 276 | + " tickvals=inverse_ranked,\n", |
| 277 | + " tickfont_size=10\n", |
| 278 | + " ),\n", |
| 279 | + " )" |
| 280 | + ] |
| 281 | + }, |
250 | 282 | {
|
251 | 283 | "cell_type": "markdown",
|
252 | 284 | "id": "acacc415",
|
|
312 | 344 | "metadata": {},
|
313 | 345 | "outputs": [],
|
314 | 346 | "source": [
|
| 347 | + "def get_last_entry(values: pd.Series):\n", |
| 348 | + " \"\"\"\n", |
| 349 | + " Gets the last element of an array and thereby reduces the array to a single element.\n", |
| 350 | + " Meant to be used as an aggregation function for dataframe grouping.\n", |
| 351 | + " values : Series : The pandas Series of values\n", |
| 352 | + " return : any : The last entry\n", |
| 353 | + " \"\"\"\n", |
| 354 | + " return values[-1]\n", |
| 355 | + "\n", |
| 356 | + "\n", |
| 357 | + "def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n", |
| 358 | + " \"\"\"\n", |
| 359 | + " Adds a fileExtension column to the input DataFrame based on the file path column.\n", |
| 360 | + " input_dataframe : pd.DataFrame : The input DataFrame\n", |
| 361 | + " file_path_column : str : The name of the file path column\n", |
| 362 | + " file_extension_column : str : The name of the file extension column to be added\n", |
| 363 | + " return : pd.DataFrame : The DataFrame with the added file extension column\n", |
| 364 | + " \"\"\"\n", |
| 365 | + " if file_extension_column in input_dataframe.columns:\n", |
| 366 | + " return input_dataframe # Column already exists\n", |
| 367 | + " \n", |
| 368 | + " # What is the correct extension in the following cases?\n", |
| 369 | + " # - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n", |
| 370 | + " # - MyReactComponent.test.tsx\n", |
| 371 | + " # Currently, it would be\n", |
| 372 | + " # - HandlerEnhancerDefinition\n", |
| 373 | + " # - tsx\n", |
| 374 | + " # which is not perfect but good enough to start with.\n", |
| 375 | + " \n", |
| 376 | + " file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n", |
| 377 | + " file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n", |
| 378 | + " file_extensions=file_extensions.str.split('.').map(get_last_entry)\n", |
| 379 | + " input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n", |
| 380 | + " return input_dataframe\n", |
| 381 | + "\n", |
315 | 382 | "def remove_last_file_path_element(file_path_elements: list) -> list:\n",
|
316 | 383 | " \"\"\"\n",
|
317 | 384 | " Removes the last element of the file path so that only the directory names remain.\n",
|
|
378 | 445 | " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
|
379 | 446 | " return input_dataframe\n",
|
380 | 447 | "\n",
|
| 448 | + "\n", |
| 449 | + "def collect_as_array(values: pd.Series):\n", |
| 450 | + " \"\"\"\n", |
| 451 | + " Just collect all values (no operation, \"noop\")\n", |
| 452 | + " Meant to be used as an aggregation function for dataframe grouping.\n", |
| 453 | + " values : Series : The pandas Series of values\n", |
| 454 | + " return : np.ndarray : All values collected into one array\n", |
| 455 | + " \"\"\"\n", |
| 456 | + " return np.asanyarray(values.to_list())\n", |
| 457 | + "\n", |
381 | 458 | "def second_entry(values: pd.Series):\n",
|
382 | 459 | " \"\"\"\n",
|
383 | 460 | " Returns the second entry of a list of values.\n",
|
|
394 | 471 | " values : Series : The pandas Series of values\n",
|
395 | 472 | " return : int : The number of files\n",
|
396 | 473 | " \"\"\"\n",
|
397 |
| - " return len(np.unique(np.concatenate(values.to_list())))" |
| 474 | + " return len(np.unique(np.concatenate(values.to_list())))\n", |
| 475 | + "\n", |
| 476 | + "\n", |
| 477 | + "def get_most_frequent_entry(input_values: pd.Series):\n", |
| 478 | + " \"\"\"\n", |
| 479 | + " Flattens the array of arrays and returns the most frequent entry.\n", |
| 480 | + " Meant to be used as an aggregation function for dataframe grouping.\n", |
| 481 | + " input_values : Series : The pandas Series of values\n", |
| 482 | + " return : str : The most frequent entry\n", |
| 483 | + " \"\"\"\n", |
| 484 | + " # flatten the array of arrays \n", |
| 485 | + " values = np.concatenate(input_values.to_list())\n", |
| 486 | + " # find frequency of each value\n", |
| 487 | + " values, counts = np.unique(values, return_counts=True)\n", |
| 488 | + " # return the value with the highest frequency\n", |
| 489 | + " return values[counts.argmax()]" |
398 | 490 | ]
|
399 | 491 | },
|
400 | 492 | {
|
|
419 | 511 | "git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n",
|
420 | 512 | " authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
|
421 | 513 | " ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n",
|
422 |
| - "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=True, method='dense').astype(int)\n", |
| 514 | + "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)\n", |
423 | 515 | "\n",
|
424 | 516 | "# Debug\n",
|
425 | 517 | "# display(git_file_authors)\n",
|
|
428 | 520 | "# display(\"1. query result ---------------------\")\n",
|
429 | 521 | "# display(git_files_with_commit_statistics)\n",
|
430 | 522 | "\n",
|
| 523 | + "# Add new column 'fileExtension' for every 'filePath'\n", |
| 524 | + "git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n", |
| 525 | + "\n", |
| 526 | + "# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n", |
| 527 | + "git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n", |
| 528 | + "git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=False, method='dense').astype(int)\n", |
| 529 | + "\n", |
| 530 | + "# Debug\n", |
| 531 | + "# display(git_file_extensions)\n", |
| 532 | + "\n", |
431 | 533 | "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
|
432 | 534 | "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
|
433 | 535 | "\n",
|
434 | 536 | "# Debug\n",
|
435 |
| - "# display(\"2. added directoryPath --------------\")\n", |
| 537 | + "# display(\"2. added directoryPath and fileExtension --------------\")\n", |
436 | 538 | "# display(git_files_with_commit_statistics)\n",
|
437 | 539 | "\n",
|
438 | 540 | "# Define how common non-grouped columns will be aggregated.\n",
|
|
452 | 554 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
|
453 | 555 | " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
|
454 | 556 | " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
|
| 557 | + " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n", |
455 | 558 | " **common_named_aggregation\n",
|
456 | 559 | ")\n",
|
457 | 560 | "\n",
|
|
469 | 572 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
|
470 | 573 | " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
|
471 | 574 | " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
|
| 575 | + " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n", |
472 | 576 | " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
|
473 | 577 | " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
|
474 | 578 | " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
|
|
558 | 662 | "figure.show(**plotly_treemap_figure_show_settings)"
|
559 | 663 | ]
|
560 | 664 | },
|
| 665 | + { |
| 666 | + "cell_type": "markdown", |
| 667 | + "id": "e93d944a", |
| 668 | + "metadata": {}, |
| 669 | + "source": [ |
| 670 | + "### Most frequent file extension per directory" |
| 671 | + ] |
| 672 | + }, |
| 673 | + { |
| 674 | + "cell_type": "code", |
| 675 | + "execution_count": null, |
| 676 | + "id": "0147c747", |
| 677 | + "metadata": {}, |
| 678 | + "outputs": [], |
| 679 | + "source": [ |
| 680 | + "git_files_with_commit_statistics_and_file_extension_rank = pd.merge(\n", |
| 681 | + " git_files_with_commit_statistics, \n", |
| 682 | + " git_file_extensions, \n", |
| 683 | + " left_on='mostFrequentFileExtension', \n", |
| 684 | + " right_on=\"fileExtension\",\n", |
| 685 | + " how=\"left\",\n", |
| 686 | + " validate=\"m:1\"\n", |
| 687 | + ")\n", |
| 688 | + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", |
| 689 | + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", |
| 690 | + " # Without values, many more squares are shown, which gives a much better overview. The drawback is that the fileCount isn't visible.\n", |
| 691 | + " # values = git_files_with_commit_statistics['fileCount'],\n", |
| 692 | + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_file_extension_rank, 'fileExtension', 'fileExtensionCountRank')\n", |
| 693 | + "))\n", |
| 694 | + "figure.update_layout(\n", |
| 695 | + " **plotly_treemap_layout_base_settings,\n", |
| 696 | + " title='Most frequent file extension per directory'\n", |
| 697 | + ")\n", |
| 698 | + "figure.show(**plotly_treemap_figure_show_settings)" |
| 699 | + ] |
| 700 | + }, |
561 | 701 | {
|
562 | 702 | "cell_type": "markdown",
|
563 | 703 | "id": "e98ca7b1",
|
|
634 | 774 | "### Main author per directory"
|
635 | 775 | ]
|
636 | 776 | },
|
637 |
| - { |
638 |
| - "cell_type": "code", |
639 |
| - "execution_count": null, |
640 |
| - "id": "259f7278", |
641 |
| - "metadata": {}, |
642 |
| - "outputs": [], |
643 |
| - "source": [ |
644 |
| - "def create_git_authors_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n", |
645 |
| - " \"\"\"\n", |
646 |
| - " Creates a plotly graph_objects.Treemap marker object for git author plots.\n", |
647 |
| - " main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n", |
648 |
| - " author_rank_data_frame : pd.DataFrame : The DataFrame that contains the git authors, their commit count and based on that their rank.\n", |
649 |
| - " author_column_name : str : The name of the (aggregated) author column for coloring the plot\n", |
650 |
| - " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", |
651 |
| - " \"\"\"\n", |
652 |
| - " data_frame_with_authors=pd.merge(\n", |
653 |
| - " main_data_frame, \n", |
654 |
| - " author_rank_data_frame, \n", |
655 |
| - " left_on=author_column_name, \n", |
656 |
| - " right_on=\"author\",\n", |
657 |
| - " how=\"left\",\n", |
658 |
| - " validate=\"m:1\"\n", |
659 |
| - " )\n", |
660 |
| - " #display(data_frame_with_author_ranks)\n", |
661 |
| - "\n", |
662 |
| - " data_frame_with_author_ranks=data_frame_with_authors['authorCommitCountRank']\n", |
663 |
| - "\n", |
664 |
| - " return dict(\n", |
665 |
| - " cornerradius=5, \n", |
666 |
| - " colors=data_frame_with_author_ranks,\n", |
667 |
| - " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", |
668 |
| - " colorbar=dict(\n", |
669 |
| - " title=\"Rank\",\n", |
670 |
| - " tickmode=\"array\",\n", |
671 |
| - " ticktext=data_frame_with_authors[author_column_name],\n", |
672 |
| - " tickvals=data_frame_with_author_ranks,\n", |
673 |
| - " tickfont_size=8\n", |
674 |
| - " ),\n", |
675 |
| - " )\n" |
676 |
| - ] |
677 |
| - }, |
678 | 777 | {
|
679 | 778 | "cell_type": "code",
|
680 | 779 | "execution_count": null,
|
681 | 780 | "id": "e97c0d87",
|
682 | 781 | "metadata": {},
|
683 | 782 | "outputs": [],
|
684 | 783 | "source": [
|
| 784 | + "git_files_with_commit_statistics_and_main_author_rank = pd.merge(\n", |
| 785 | + " git_files_with_commit_statistics, \n", |
| 786 | + " git_file_authors, \n", |
| 787 | + " left_on='mainAuthor', \n", |
| 788 | + " right_on=\"author\",\n", |
| 789 | + " how=\"left\",\n", |
| 790 | + " validate=\"m:1\"\n", |
| 791 | + ")\n", |
685 | 792 | "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
|
686 | 793 | " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
|
687 | 794 | " # Without values, many more squares are shown, which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
|
688 | 795 | " # values = git_files_with_commit_statistics['fileCount'],\n",
|
689 |
| - " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"mainAuthor\")\n", |
| 796 | + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_main_author_rank, 'mainAuthor', 'authorCommitCountRank')\n", |
690 | 797 | "))\n",
|
691 | 798 | "figure.update_layout(\n",
|
692 | 799 | " **plotly_treemap_layout_base_settings,\n",
|
693 |
| - " title='Main author (highest number of commits)'\n", |
| 800 | + " title='Main authors with highest number of commits'\n", |
694 | 801 | ")\n",
|
695 | 802 | "figure.show(**plotly_treemap_figure_show_settings)"
|
696 | 803 | ]
|
|
710 | 817 | "metadata": {},
|
711 | 818 | "outputs": [],
|
712 | 819 | "source": [
|
| 820 | + "git_files_with_commit_statistics_and_second_author_rank = pd.merge(\n", |
| 821 | + " git_files_with_commit_statistics, \n", |
| 822 | + " git_file_authors, \n", |
| 823 | + " left_on='secondAuthor', \n", |
| 824 | + " right_on=\"author\",\n", |
| 825 | + " how=\"left\",\n", |
| 826 | + " validate=\"m:1\"\n", |
| 827 | + ")\n", |
| 828 | + "\n", |
713 | 829 | "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
|
714 | 830 | " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
|
715 | 831 | " # Without values, many more squares are shown, which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
|
716 | 832 | " # values = git_files_with_commit_statistics['fileCount'],\n",
|
717 |
| - " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"secondAuthor\")\n", |
| 833 | + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_second_author_rank, 'secondAuthor', 'authorCommitCountRank')\n", |
718 | 834 | "))\n",
|
719 | 835 | "figure.update_layout(\n",
|
720 | 836 | " **plotly_treemap_layout_base_settings,\n",
|
721 |
| - " title='Second author (second highest number of commits)'\n", |
| 837 | + " title='Second author with the second highest number of commits'\n", |
722 | 838 | ")\n",
|
723 | 839 | "figure.show(**plotly_treemap_figure_show_settings)"
|
724 | 840 | ]
|
|
0 commit comments