Skip to content

Commit 0e7c645

Browse files
committed
Add most frequent file extension treemap
1 parent 1a1832f commit 0e7c645

File tree

1 file changed

+166
-50
lines changed

1 file changed

+166
-50
lines changed

jupyter/GitHistoryGeneral.ipynb

+166-50
Original file line numberDiff line numberDiff line change
@@ -239,14 +239,46 @@
239239
" labels=data_frame['directoryName'],\n",
240240
" parents=data_frame['directoryParentPath'],\n",
241241
" ids=data_frame['directoryPath'],\n",
242-
" customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
243-
" hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n",
242+
" customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
243+
" hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n",
244244
" maxdepth=-1,\n",
245245
" root_color=\"lightgrey\",\n",
246246
" marker=dict(**plotly_treemap_marker_base_style),\n",
247247
" )"
248248
]
249249
},
250+
{
251+
"cell_type": "code",
252+
"execution_count": null,
253+
"id": "641fa05c",
254+
"metadata": {},
255+
"outputs": [],
256+
"source": [
257+
"def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):\n",
258+
" \"\"\"\n",
259+
" Creates a plotly graph_objects.Treemap marker object for a colorbar representing ranked names.\n",
260+
" data_frame : pd.DataFrame : The DataFrame that contains the name and the rank column\n",
261+
" name_column : str : The name of the column containing the names shown as colorbar tick labels\n",
262+
" rank_column : str : The name of the column containing the ranking \n",
263+
" return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
264+
" \"\"\"\n",
265+
" # The rank is inverted so that the first rank is shown on the top of the colorbar.\n",
266+
" inverse_ranked = data_frame[rank_column].max() + 1 - data_frame[rank_column]\n",
267+
"\n",
268+
" return dict(\n",
269+
" cornerradius=5, \n",
270+
" colors=inverse_ranked,\n",
271+
" colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
272+
" colorbar=dict(\n",
273+
" title=\"Rank\",\n",
274+
" tickmode=\"array\",\n",
275+
" ticktext=data_frame[name_column],\n",
276+
" tickvals=inverse_ranked,\n",
277+
" tickfont_size=10\n",
278+
" ),\n",
279+
" )"
280+
]
281+
},
250282
{
251283
"cell_type": "markdown",
252284
"id": "acacc415",
@@ -312,6 +344,41 @@
312344
"metadata": {},
313345
"outputs": [],
314346
"source": [
347+
"def get_last_entry(values: pd.Series):\n",
348+
" \"\"\"\n",
349+
" Gets the last element of an array and thereby reduces the array to a single element.\n",
350+
" Meant to be used as an aggregation function for dataframe grouping.\n",
351+
" values : Series : The pandas Series of values\n",
352+
" return : any : The last entry\n",
353+
" \"\"\"\n",
354+
" return values[-1]\n",
355+
"\n",
356+
"\n",
357+
"def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n",
358+
" \"\"\"\n",
359+
" Adds a fileExtension column to the input DataFrame based on the file path column.\n",
360+
" input_dataframe : pd.DataFrame : The input DataFrame\n",
361+
" file_path_column : str : The name of the file path column\n",
362+
" file_extension_column : str : The name of the file extension column to be added\n",
363+
" return : pd.DataFrame : The DataFrame with the added file extension column\n",
364+
" \"\"\"\n",
365+
" if file_extension_column in input_dataframe.columns:\n",
366+
" return input_dataframe # Column already exists\n",
367+
" \n",
368+
" # What is the correct extension in the following cases?\n",
369+
" # - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n",
370+
" # - MyReactComponent.test.tsx\n",
371+
" # Currently, it would be\n",
372+
" # - HandlerEnhancerDefinition\n",
373+
" # - tsx\n",
374+
" # which is not perfect but good enough to start with.\n",
375+
" \n",
376+
" file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n",
377+
" file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n",
378+
" file_extensions=file_extensions.str.split('.').map(get_last_entry)\n",
379+
" input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n",
380+
" return input_dataframe\n",
381+
"\n",
315382
"def remove_last_file_path_element(file_path_elements: list) -> list:\n",
316383
" \"\"\"\n",
317384
" Removes the last element of the file path so that only the directory names retain.\n",
@@ -378,6 +445,16 @@
378445
" input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
379446
" return input_dataframe\n",
380447
"\n",
448+
"\n",
449+
"def collect_as_array(values: pd.Series):\n",
450+
" \"\"\"\n",
451+
" Just collect all values (no operation, \"noop\")\n",
452+
" Meant to be used as an aggregation function for dataframe grouping.\n",
453+
" values : Series : The pandas Series of values\n",
454+
" return : np.ndarray : All values of the Series collected into one array\n",
455+
" \"\"\"\n",
456+
" return np.asanyarray(values.to_list())\n",
457+
"\n",
381458
"def second_entry(values: pd.Series):\n",
382459
" \"\"\"\n",
383460
" Returns the second entry of a list of values.\n",
@@ -394,7 +471,22 @@
394471
" values : Series : The pandas Series of values\n",
395472
" return : int : The number of files\n",
396473
" \"\"\"\n",
397-
" return len(np.unique(np.concatenate(values.to_list())))"
474+
" return len(np.unique(np.concatenate(values.to_list())))\n",
475+
"\n",
476+
"\n",
477+
"def get_most_frequent_entry(input_values: pd.Series):\n",
478+
" \"\"\"\n",
479+
" Flattens the array of arrays and returns the most frequent entry.\n",
480+
" Meant to be used as an aggregation function for dataframe grouping.\n",
481+
" input_values : Series : The pandas Series of values\n",
482+
" return : str : The most frequent entry\n",
483+
" \"\"\"\n",
484+
" # flatten the array of arrays \n",
485+
" values = np.concatenate(input_values.to_list())\n",
486+
" # find frequency of each value\n",
487+
" values, counts = np.unique(values, return_counts=True)\n",
488+
" # return the value with the highest frequency (argmax picks the first one on a tie)\n",
489+
" return values[counts.argmax()]"
398490
]
399491
},
400492
{
@@ -419,7 +511,7 @@
419511
"git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n",
420512
" authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
421513
" ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n",
422-
"git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=True, method='dense').astype(int)\n",
514+
"git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)\n",
423515
"\n",
424516
"# Debug\n",
425517
"# display(git_file_authors)\n",
@@ -428,11 +520,21 @@
428520
"# display(\"1. query result ---------------------\")\n",
429521
"# display(git_files_with_commit_statistics)\n",
430522
"\n",
523+
"# Add new column 'fileExtension' for every 'filePath'\n",
524+
"git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n",
525+
"\n",
526+
"# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n",
527+
"git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n",
528+
"git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=False, method='dense').astype(int)\n",
529+
"\n",
530+
"# Debug\n",
531+
"# display(git_file_extensions)\n",
532+
"\n",
431533
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
432534
"git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
433535
"\n",
434536
"# Debug\n",
435-
"# display(\"2. added directoryPath --------------\")\n",
537+
"# display(\"2. added directoryPath and fileExtension --------------\")\n",
436538
"# display(git_files_with_commit_statistics)\n",
437539
"\n",
438540
"# Define how common non-grouped columns will be aggregated.\n",
@@ -452,6 +554,7 @@
452554
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
453555
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
454556
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
557+
" fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
455558
" **common_named_aggregation\n",
456559
")\n",
457560
"\n",
@@ -469,6 +572,7 @@
469572
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
470573
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
471574
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
575+
" mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
472576
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
473577
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
474578
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
@@ -558,6 +662,42 @@
558662
"figure.show(**plotly_treemap_figure_show_settings)"
559663
]
560664
},
665+
{
666+
"cell_type": "markdown",
667+
"id": "e93d944a",
668+
"metadata": {},
669+
"source": [
670+
"### Most frequent file extension per directory"
671+
]
672+
},
673+
{
674+
"cell_type": "code",
675+
"execution_count": null,
676+
"id": "0147c747",
677+
"metadata": {},
678+
"outputs": [],
679+
"source": [
680+
"git_files_with_commit_statistics_and_file_extension_rank = pd.merge(\n",
681+
" git_files_with_commit_statistics, \n",
682+
" git_file_extensions, \n",
683+
" left_on='mostFrequentFileExtension', \n",
684+
" right_on=\"fileExtension\",\n",
685+
" how=\"left\",\n",
686+
" validate=\"m:1\"\n",
687+
")\n",
688+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
689+
" create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
690+
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
691+
" # values = git_files_with_commit_statistics['fileCount'],\n",
692+
" marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_file_extension_rank, 'fileExtension', 'fileExtensionCountRank')\n",
693+
"))\n",
694+
"figure.update_layout(\n",
695+
" **plotly_treemap_layout_base_settings,\n",
696+
" title='Most frequent file extension per directory'\n",
697+
")\n",
698+
"figure.show(**plotly_treemap_figure_show_settings)"
699+
]
700+
},
561701
{
562702
"cell_type": "markdown",
563703
"id": "e98ca7b1",
@@ -634,63 +774,30 @@
634774
"### Main author per directory"
635775
]
636776
},
637-
{
638-
"cell_type": "code",
639-
"execution_count": null,
640-
"id": "259f7278",
641-
"metadata": {},
642-
"outputs": [],
643-
"source": [
644-
"def create_git_authors_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n",
645-
" \"\"\"\n",
646-
" Creates a plotly graph_objects.Treemap marker object for git author plots.\n",
647-
" main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n",
648-
" author_rank_data_frame : pd.DataFrame : The DataFrame that contains the git authors, their commit count and based on that their rank.\n",
649-
" author_column_name : str : The name of the (aggregated) author column for coloring the plot\n",
650-
" return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
651-
" \"\"\"\n",
652-
" data_frame_with_authors=pd.merge(\n",
653-
" main_data_frame, \n",
654-
" author_rank_data_frame, \n",
655-
" left_on=author_column_name, \n",
656-
" right_on=\"author\",\n",
657-
" how=\"left\",\n",
658-
" validate=\"m:1\"\n",
659-
" )\n",
660-
" #display(data_frame_with_author_ranks)\n",
661-
"\n",
662-
" data_frame_with_author_ranks=data_frame_with_authors['authorCommitCountRank']\n",
663-
"\n",
664-
" return dict(\n",
665-
" cornerradius=5, \n",
666-
" colors=data_frame_with_author_ranks,\n",
667-
" colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
668-
" colorbar=dict(\n",
669-
" title=\"Rank\",\n",
670-
" tickmode=\"array\",\n",
671-
" ticktext=data_frame_with_authors[author_column_name],\n",
672-
" tickvals=data_frame_with_author_ranks,\n",
673-
" tickfont_size=8\n",
674-
" ),\n",
675-
" )\n"
676-
]
677-
},
678777
{
679778
"cell_type": "code",
680779
"execution_count": null,
681780
"id": "e97c0d87",
682781
"metadata": {},
683782
"outputs": [],
684783
"source": [
784+
"git_files_with_commit_statistics_and_main_author_rank = pd.merge(\n",
785+
" git_files_with_commit_statistics, \n",
786+
" git_file_authors, \n",
787+
" left_on='mainAuthor', \n",
788+
" right_on=\"author\",\n",
789+
" how=\"left\",\n",
790+
" validate=\"m:1\"\n",
791+
")\n",
685792
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
686793
" create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
687794
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
688795
" # values = git_files_with_commit_statistics['fileCount'],\n",
689-
" marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"mainAuthor\")\n",
796+
" marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_main_author_rank, 'mainAuthor', 'authorCommitCountRank')\n",
690797
"))\n",
691798
"figure.update_layout(\n",
692799
" **plotly_treemap_layout_base_settings,\n",
693-
" title='Main author (highest number of commits)'\n",
800+
" title='Main authors with highest number of commits'\n",
694801
")\n",
695802
"figure.show(**plotly_treemap_figure_show_settings)"
696803
]
@@ -710,15 +817,24 @@
710817
"metadata": {},
711818
"outputs": [],
712819
"source": [
820+
"git_files_with_commit_statistics_and_second_author_rank = pd.merge(\n",
821+
" git_files_with_commit_statistics, \n",
822+
" git_file_authors, \n",
823+
" left_on='secondAuthor', \n",
824+
" right_on=\"author\",\n",
825+
" how=\"left\",\n",
826+
" validate=\"m:1\"\n",
827+
")\n",
828+
"\n",
713829
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
714830
" create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
715831
" # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
716832
" # values = git_files_with_commit_statistics['fileCount'],\n",
717-
" marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"secondAuthor\")\n",
833+
" marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_second_author_rank, 'secondAuthor', 'authorCommitCountRank')\n",
718834
"))\n",
719835
"figure.update_layout(\n",
720836
" **plotly_treemap_layout_base_settings,\n",
721-
" title='Second author (second highest number of commits)'\n",
837+
" title='Second author with the second highest number of commits'\n",
722838
")\n",
723839
"figure.show(**plotly_treemap_figure_show_settings)"
724840
]

0 commit comments

Comments
 (0)