|
239 | 239 | " labels=data_frame['directoryName'],\n",
|
240 | 240 | " parents=data_frame['directoryParentPath'],\n",
|
241 | 241 | " ids=data_frame['directoryPath'],\n",
|
242 |
| - " customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", |
243 |
| - " hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n", |
| 242 | + " customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", |
| 243 | + " hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n", |
244 | 244 | " maxdepth=-1,\n",
|
245 | 245 | " root_color=\"lightgrey\",\n",
|
246 | 246 | " marker=dict(**plotly_treemap_marker_base_style),\n",
|
|
312 | 312 | "metadata": {},
|
313 | 313 | "outputs": [],
|
314 | 314 | "source": [
|
def get_last_entry(values: "pd.Series | list"):
    """
    Gets the last element of a sequence and therefore converts a sequence to a single element.
    Meant to be used as an aggregation function for dataframe grouping
    or as a mapping function for Series that contain lists (e.g. the result of "str.split").
    values : Series or list : The pandas Series (or list/tuple) of values
    return : any : The last entry
    """
    if isinstance(values, pd.Series):
        # "values[-1]" would be a label based lookup on a Series and fail for the default integer index.
        # Positional access returns the last entry independent of the index labels.
        return values.iloc[-1]
    return values[-1]
| 323 | + "\n", |
| 324 | + "\n", |
def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):
    """
    Adds a file extension column to the input DataFrame directly after the file path column.
    The file extension is the text after the last '.' of the file name (last '/' separated path element).
    File names without a '.' are taken as a whole (e.g. 'Makefile'). Missing file paths stay missing.
    The input DataFrame is modified in place. If the file extension column already exists, nothing is changed.
    input_dataframe : pd.DataFrame : The input DataFrame
    file_path_column : str : The name of the file path column
    file_extension_column : str : The name of the file extension column to be added
    return : pd.DataFrame : The DataFrame with the added file extension column
    """
    if file_extension_column in input_dataframe.columns:
        return input_dataframe  # Column already exists

    file_path_column_position = input_dataframe.columns.get_loc(file_path_column)
    # Vectorized string operations pass missing values (None/NaN) through instead of failing on them.
    # Splitting once from the right is sufficient since only the last element is of interest.
    file_names = input_dataframe[file_path_column].str.rsplit('/', n=1).str[-1]
    file_extensions = file_names.str.rsplit('.', n=1).str[-1]
    input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)
    return input_dataframe
| 341 | + "\n", |
315 | 342 | "def remove_last_file_path_element(file_path_elements: list) -> list:\n",
|
316 | 343 | " \"\"\"\n",
|
317 | 344 | " Removes the last element of the file path so that only the directory names retain.\n",
|
|
378 | 405 | " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
|
379 | 406 | " return input_dataframe\n",
|
380 | 407 | "\n",
|
| 408 | + "\n", |
def collect_as_array(values: pd.Series):
    """
    Collects all values of the Series unchanged into one numpy array (no operation, "noop").
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of values
    return : numpy.ndarray : All values of the Series as an array
    """
    collected_values = values.to_list()
    return np.asanyarray(collected_values)
| 417 | + "\n", |
381 | 418 | "def second_entry(values: pd.Series):\n",
|
382 | 419 | " \"\"\"\n",
|
383 | 420 | " Returns the second entry of a list of values.\n",
|
|
394 | 431 | " values : Series : The pandas Series of values\n",
|
395 | 432 | " return : int : The number of files\n",
|
396 | 433 | " \"\"\"\n",
|
397 |
| - " return len(np.unique(np.concatenate(values.to_list())))" |
| 434 | + " return len(np.unique(np.concatenate(values.to_list())))\n", |
| 435 | + "\n", |
| 436 | + "\n", |
def get_most_frequent_entry(input_values: pd.Series):
    """
    Flattens the arrays contained in the Series and returns the entry that occurs most often.
    When several entries share the highest count, the first one in sorted order is returned.
    Meant to be used as an aggregation function for dataframe grouping.
    input_values : Series : The pandas Series that contains one array of values per row
    return : str : The most frequent entry
    """
    # Merge the arrays of all rows into one flat array
    flattened_entries = np.concatenate(input_values.to_list())
    # Count how often each distinct (sorted) entry occurs
    distinct_entries, occurrences = np.unique(flattened_entries, return_counts=True)
    # Pick the entry with the highest count (first one in case of a tie)
    most_frequent_position = np.argmax(occurrences)
    return distinct_entries[most_frequent_position]
398 | 450 | ]
|
399 | 451 | },
|
400 | 452 | {
|
|
428 | 480 | "# display(\"1. query result ---------------------\")\n",
|
429 | 481 | "# display(git_files_with_commit_statistics)\n",
|
430 | 482 | "\n",
|
# Derive the new column 'fileExtension' from the 'filePath' of every row
git_files_with_commit_statistics = add_file_extension_column(
    git_files_with_commit_statistics,
    'filePath',
    'fileExtension',
)

# TODO What is the correct extension in the following cases?
# - AxonFramework-4.11.0/messaging/src/main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition
# - MyReactComponent.test.tsx
# display(git_files_with_commit_statistics[git_files_with_commit_statistics['fileExtension'] == 'HandlerEnhancerDefinition'])

# Build a separate dataframe that lists every distinct file extension together with
# the number of rows it occurs in and a dense rank derived from that number
# (ascending: the rarest extension gets rank 1).
git_file_extensions = (
    git_files_with_commit_statistics['fileExtension']
    .value_counts()
    .rename_axis('fileExtension')
    .reset_index(name='fileExtensionCount')
)
git_file_extensions['fileExtensionCountRank'] = (
    git_file_extensions['fileExtensionCount']
    .rank(ascending=True, method='dense')
    .astype(int)
)

# Debug
# display(git_file_extensions)
| 497 | + "\n", |
431 | 498 | "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
|
432 | 499 | "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
|
433 | 500 | "\n",
|
434 | 501 | "# Debug\n",
|
435 |
| - "# display(\"2. added directoryPath --------------\")\n", |
| 502 | + "# display(\"2. added directoryPath and fileExtension --------------\")\n", |
436 | 503 | "# display(git_files_with_commit_statistics)\n",
|
437 | 504 | "\n",
|
438 | 505 | "# Define how common non-grouped columns will be aggregated.\n",
|
|
452 | 519 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
|
453 | 520 | " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
|
454 | 521 | " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
|
| 522 | + " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n", |
455 | 523 | " **common_named_aggregation\n",
|
456 | 524 | ")\n",
|
457 | 525 | "\n",
|
|
469 | 537 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
|
470 | 538 | " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
|
471 | 539 | " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
|
| 540 | + " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n", |
472 | 541 | " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
|
473 | 542 | " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
|
474 | 543 | " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
|
|
723 | 792 | "figure.show(**plotly_treemap_figure_show_settings)"
|
724 | 793 | ]
|
725 | 794 | },
|
| 795 | + { |
| 796 | + "cell_type": "markdown", |
| 797 | + "id": "6b5cf97c", |
| 798 | + "metadata": {}, |
| 799 | + "source": [ |
| 800 | + "### Most frequent file extension per directory" |
| 801 | + ] |
| 802 | + }, |
| 803 | + { |
| 804 | + "cell_type": "code", |
| 805 | + "execution_count": null, |
| 806 | + "id": "c9497d80", |
| 807 | + "metadata": {}, |
| 808 | + "outputs": [], |
| 809 | + "source": [ |
| 810 | + "# TODO combine/abstract it with the helping function for git authors\n", |
def create_git_file_extension_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):
    """
    Creates the marker settings for a plotly graph_objects.Treemap that is colored by file extension rank.
    The parameter names originate from the git author variant of this function (see TODO above).
    main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics
    author_rank_data_frame : pd.DataFrame : The DataFrame with the columns 'fileExtension' and 'fileExtensionCountRank'
    author_column_name : str : The name of the (aggregated) file extension column in the main DataFrame used for coloring the plot
    return : dict : The marker settings to be passed as "marker" to plotly_graph_objects.Treemap
    """
    # Look up the rank of the file extension of every row. "m:1" assures that every extension has exactly one rank.
    ranked_data_frame = main_data_frame.merge(
        author_rank_data_frame,
        how="left",
        left_on=author_column_name,
        right_on="fileExtension",
        validate="m:1",
    )
    file_extension_ranks = ranked_data_frame['fileExtensionCountRank']

    # The color bar shows the file extension names at the positions of their ranks.
    rank_color_bar = dict(
        title="Rank",
        tickmode="array",
        ticktext=ranked_data_frame[author_column_name],
        tickvals=file_extension_ranks,
        tickfont_size=8,
    )

    return dict(
        cornerradius=5,
        colors=file_extension_ranks,
        colorscale=plotly_colors.qualitative.G10,  # favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot
        colorbar=rank_color_bar,
    )
| 841 | + ] |
| 842 | + }, |
| 843 | + { |
| 844 | + "cell_type": "code", |
| 845 | + "execution_count": null, |
| 846 | + "id": "3b2dfe7b", |
| 847 | + "metadata": {}, |
| 848 | + "outputs": [], |
| 849 | + "source": [ |
# Treemap of all directories, colored by the rank of their most frequent file extension.
figure = plotly_graph_objects.Figure(
    plotly_graph_objects.Treemap(
        create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
        # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.
        # values = git_files_with_commit_statistics['fileCount'],
        marker=create_git_file_extension_graph_objects_treemap_marker(
            git_files_with_commit_statistics,
            git_file_extensions,
            "mostFrequentFileExtension",
        ),
    )
)
figure.update_layout(**plotly_treemap_layout_base_settings, title='Most frequent file extension per directory')
figure.show(**plotly_treemap_figure_show_settings)
| 861 | + ] |
| 862 | + }, |
726 | 863 | {
|
727 | 864 | "cell_type": "markdown",
|
728 | 865 | "id": "0ed919b0",
|
|
0 commit comments