Skip to content

Commit 29f4b35

Browse files
committed
Add most frequent file extension treemap
1 parent 1a1832f commit 29f4b35

File tree

1 file changed

+141
-4
lines changed

1 file changed

+141
-4
lines changed

jupyter/GitHistoryGeneral.ipynb

+141-4
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,8 @@
239239
" labels=data_frame['directoryName'],\n",
240240
" parents=data_frame['directoryParentPath'],\n",
241241
" ids=data_frame['directoryPath'],\n",
242-
" customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
243-
" hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n",
242+
" customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
243+
" hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n",
244244
" maxdepth=-1,\n",
245245
" root_color=\"lightgrey\",\n",
246246
" marker=dict(**plotly_treemap_marker_base_style),\n",
@@ -312,6 +312,33 @@
312312
"metadata": {},
313313
"outputs": [],
314314
"source": [
315+
"def get_last_entry(values: pd.Series):\n",
316+
" \"\"\"\n",
317+
"    Gets the last element of an array and thereby reduces the array to a single element.\n",
318+
" Meant to be used as an aggregation function for dataframe grouping.\n",
319+
" values : Series : The pandas Series of values\n",
320+
" return : any : The last entry\n",
321+
" \"\"\"\n",
322+
" return values[-1]\n",
323+
"\n",
324+
"\n",
325+
"def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n",
326+
" \"\"\"\n",
327+
" Adds a fileExtension column to the input DataFrame based on the file path column.\n",
328+
" input_dataframe : pd.DataFrame : The input DataFrame\n",
329+
" file_path_column : str : The name of the file path column\n",
330+
" file_extension_column : str : The name of the file extension column to be added\n",
331+
"    return : pd.DataFrame : The DataFrame with the added file extension column\n",
332+
" \"\"\"\n",
333+
" if file_extension_column in input_dataframe.columns:\n",
334+
" return input_dataframe # Column already exists\n",
335+
" \n",
336+
" file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n",
337+
" file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n",
338+
" file_extensions=file_extensions.str.split('.').map(get_last_entry)\n",
339+
" input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n",
340+
" return input_dataframe\n",
341+
"\n",
315342
"def remove_last_file_path_element(file_path_elements: list) -> list:\n",
316343
" \"\"\"\n",
317344
"    Removes the last element of the file path so that only the directory names remain.\n",
@@ -378,6 +405,16 @@
378405
" input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
379406
" return input_dataframe\n",
380407
"\n",
408+
"\n",
409+
"def collect_as_array(values: pd.Series):\n",
410+
" \"\"\"\n",
411+
" Just collect all values (no operation, \"noop\")\n",
412+
" Meant to be used as an aggregation function for dataframe grouping.\n",
413+
" values : Series : The pandas Series of values\n",
414+
"    return : any : The array containing all collected values\n",
415+
" \"\"\"\n",
416+
" return np.asanyarray(values.to_list())\n",
417+
"\n",
381418
"def second_entry(values: pd.Series):\n",
382419
" \"\"\"\n",
383420
" Returns the second entry of a list of values.\n",
@@ -394,7 +431,22 @@
394431
" values : Series : The pandas Series of values\n",
395432
" return : int : The number of files\n",
396433
" \"\"\"\n",
397-
" return len(np.unique(np.concatenate(values.to_list())))"
434+
" return len(np.unique(np.concatenate(values.to_list())))\n",
435+
"\n",
436+
"\n",
437+
"def get_most_frequent_entry(input_values: pd.Series):\n",
438+
" \"\"\"\n",
439+
"    Flattens the array of arrays and returns the most frequent entry.\n",
440+
" Meant to be used as an aggregation function for dataframe grouping.\n",
441+
" input_values : Series : The pandas Series of values\n",
442+
" return : str : The most frequent entry\n",
443+
" \"\"\"\n",
444+
" # flatten the array of arrays \n",
445+
" values = np.concatenate(input_values.to_list())\n",
446+
" # find frequency of each value\n",
447+
" values, counts = np.unique(values, return_counts=True)\n",
448+
"    # return the value with the highest frequency\n",
449+
" return values[counts.argmax()]"
398450
]
399451
},
400452
{
@@ -428,11 +480,26 @@
428480
"# display(\"1. query result ---------------------\")\n",
429481
"# display(git_files_with_commit_statistics)\n",
430482
"\n",
483+
"# Add new column 'fileExtension' for every 'filePath'\n",
484+
"git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n",
485+
"\n",
486+
"# TODO What is the correct extension in the following cases?\n",
487+
"# - AxonFramework-4.11.0/messaging/src/main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n",
488+
"# - MyReactComponent.test.tsx\n",
489+
"# display(git_files_with_commit_statistics[git_files_with_commit_statistics['fileExtension'] == 'HandlerEnhancerDefinition'])\n",
490+
"\n",
491+
"# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n",
492+
"git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n",
493+
"git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=True, method='dense').astype(int)\n",
494+
"\n",
495+
"# Debug\n",
496+
"# display(git_file_extensions)\n",
497+
"\n",
431498
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
432499
"git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
433500
"\n",
434501
"# Debug\n",
435-
"# display(\"2. added directoryPath --------------\")\n",
502+
"# display(\"2. added directoryPath and fileExtension --------------\")\n",
436503
"# display(git_files_with_commit_statistics)\n",
437504
"\n",
438505
"# Define how common non-grouped columns will be aggregated.\n",
@@ -452,6 +519,7 @@
452519
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
453520
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
454521
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
522+
" fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
455523
" **common_named_aggregation\n",
456524
")\n",
457525
"\n",
@@ -469,6 +537,7 @@
469537
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
470538
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
471539
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
540+
" mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
472541
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
473542
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
474543
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
@@ -723,6 +792,74 @@
723792
"figure.show(**plotly_treemap_figure_show_settings)"
724793
]
725794
},
795+
{
796+
"cell_type": "markdown",
797+
"id": "6b5cf97c",
798+
"metadata": {},
799+
"source": [
800+
"### Most frequent file extension per directory"
801+
]
802+
},
803+
{
804+
"cell_type": "code",
805+
"execution_count": null,
806+
"id": "c9497d80",
807+
"metadata": {},
808+
"outputs": [],
809+
"source": [
810+
"# TODO combine/abstract it with the helping function for git authors\n",
811+
"def create_git_file_extension_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n",
812+
" \"\"\"\n",
813+
"    Creates a plotly graph_objects.Treemap marker object for git file extension plots.\n",
814+
" main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n",
815+
"    author_rank_data_frame : pd.DataFrame : The DataFrame that contains the file extensions (column 'fileExtension') and their rank (column 'fileExtensionCountRank').\n",
816+
"    author_column_name : str : The name of the (aggregated) file extension column in main_data_frame that is matched against 'fileExtension' for coloring the plot\n",
817+
" return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
818+
" \"\"\"\n",
819+
" data_frame_with_merged_rank=pd.merge(\n",
820+
" main_data_frame, \n",
821+
" author_rank_data_frame, \n",
822+
" left_on=author_column_name, \n",
823+
" right_on=\"fileExtension\",\n",
824+
" how=\"left\",\n",
825+
" validate=\"m:1\"\n",
826+
" )\n",
827+
"    #display(data_frame_with_merged_rank)\n",
828+
"\n",
829+
" return dict(\n",
830+
" cornerradius=5, \n",
831+
" colors=data_frame_with_merged_rank['fileExtensionCountRank'],\n",
832+
" colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
833+
" colorbar=dict(\n",
834+
" title=\"Rank\",\n",
835+
" tickmode=\"array\",\n",
836+
" ticktext=data_frame_with_merged_rank[author_column_name],\n",
837+
" tickvals=data_frame_with_merged_rank['fileExtensionCountRank'],\n",
838+
" tickfont_size=8\n",
839+
" ),\n",
840+
" )\n"
841+
]
842+
},
843+
{
844+
"cell_type": "code",
845+
"execution_count": null,
846+
"id": "3b2dfe7b",
847+
"metadata": {},
848+
"outputs": [],
849+
"source": [
850+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
851+
" create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
852+
"    # Without values, many more squares are shown, which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
853+
" # values = git_files_with_commit_statistics['fileCount'],\n",
854+
" marker=create_git_file_extension_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_extensions, \"mostFrequentFileExtension\")\n",
855+
"))\n",
856+
"figure.update_layout(\n",
857+
" **plotly_treemap_layout_base_settings,\n",
858+
" title='Most frequent file extension per directory'\n",
859+
")\n",
860+
"figure.show(**plotly_treemap_figure_show_settings)"
861+
]
862+
},
726863
{
727864
"cell_type": "markdown",
728865
"id": "0ed919b0",

0 commit comments

Comments
 (0)