Add most frequent file extension treemap

JohT · JohT · commit 29f4b3580af7 · 2025-03-21T08:21:57.000+01:00
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -239,8 +239,8 @@
     "        labels=data_frame['directoryName'],\n",
     "        parents=data_frame['directoryParentPath'],\n",
     "        ids=data_frame['directoryPath'],\n",
-    "        customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
-    "        hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]},  %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n",
+    "        customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
+    "        hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]},  %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n",
     "        maxdepth=-1,\n",
     "        root_color=\"lightgrey\",\n",
     "        marker=dict(**plotly_treemap_marker_base_style),\n",
@@ -312,6 +312,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "def get_last_entry(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Returns the last element of a sequence and thereby reduces the sequence to a single element.\n",
+    "    Meant to be applied per row (e.g. with Series.map) to list values like the result of str.split.\n",
+    "    values : list : The list of values of a single row (the callers below pass the lists produced by str.split)\n",
+    "    return : any : The last entry\n",
+    "    \"\"\"\n",
+    "    return values[-1]\n",
+    "\n",
+    "\n",
+    "def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n",
+    "    \"\"\"\n",
+    "    Adds a fileExtension column to the input DataFrame based on the file path column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    file_path_column : str : The name of the file path column\n",
+    "    file_extension_column : str : The name of the file extension column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with the added file extension column\n",
+    "    \"\"\"\n",
+    "    if file_extension_column in input_dataframe.columns:\n",
+    "        return input_dataframe # Column already exists\n",
+    "    \n",
+    "    file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n",
+    "    file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n",
+    "    file_extensions=file_extensions.str.split('.').map(get_last_entry)\n",
+    "    input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n",
+    "    return input_dataframe\n",
+    "\n",
     "def remove_last_file_path_element(file_path_elements: list) -> list:\n",
     "    \"\"\"\n",
     "    Removes the last element of the file path so that only the directory names retain.\n",
@@ -378,6 +405,16 @@
     "    input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
     "    return input_dataframe\n",
     "\n",
+    "\n",
+    "def collect_as_array(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Just collect all values (no operation, \"noop\")\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : np.ndarray : All collected values as a numpy array\n",
+    "    \"\"\"\n",
+    "    return np.asanyarray(values.to_list())\n",
+    "\n",
     "def second_entry(values: pd.Series):\n",
     "    \"\"\"\n",
     "    Returns the second entry of a list of values.\n",
@@ -394,7 +431,22 @@
     "    values : Series : The pandas Series of values\n",
     "    return : int : The number of files\n",
     "    \"\"\"\n",
-    "    return len(np.unique(np.concatenate(values.to_list())))"
+    "    return len(np.unique(np.concatenate(values.to_list())))\n",
+    "\n",
+    "\n",
+    "def get_most_frequent_entry(input_values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Flattens the array of arrays and returns the most frequent entry.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    input_values : Series : The pandas Series of values\n",
+    "    return : str : The most frequent entry\n",
+    "    \"\"\"\n",
+    "    # flatten the array of arrays \n",
+    "    values = np.concatenate(input_values.to_list())\n",
+    "    # find frequency of each value\n",
+    "    values, counts = np.unique(values, return_counts=True)\n",
+    "    # return the value with the highest frequency (on ties the first one in the sorted order of np.unique)\n",
+    "    return values[counts.argmax()]"
    ]
   },
   {
@@ -428,11 +480,26 @@
     "# display(\"1. query result ---------------------\")\n",
     "# display(git_files_with_commit_statistics)\n",
     "\n",
+    "# Add new column 'fileExtension' for every 'filePath'\n",
+    "git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n",
+    "\n",
+    "# TODO What is the correct extension in the following cases?\n",
+    "#  - AxonFramework-4.11.0/messaging/src/main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n",
+    "#  - MyReactComponent.test.tsx\n",
+    "# display(git_files_with_commit_statistics[git_files_with_commit_statistics['fileExtension'] == 'HandlerEnhancerDefinition'])\n",
+    "\n",
+    "# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n",
+    "git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n",
+    "git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=True, method='dense').astype(int)\n",
+    "\n",
+    "# Debug\n",
+    "# display(git_file_extensions)\n",
+    "\n",
     "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
     "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
     "\n",
     "# Debug\n",
-    "# display(\"2. added directoryPath --------------\")\n",
+    "# display(\"2. added directoryPath and fileExtension --------------\")\n",
     "# display(git_files_with_commit_statistics)\n",
     "\n",
     "# Define how common non-grouped columns will be aggregated.\n",
@@ -452,6 +519,7 @@
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
     "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
     "    firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
+    "    fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "\n",
@@ -469,6 +537,7 @@
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
     "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
     "    firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
+    "    mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
     "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
     "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
     "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
@@ -723,6 +792,74 @@
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6b5cf97c",
+   "metadata": {},
+   "source": [
+    "### Most frequent file extension per directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9497d80",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO combine/abstract it with the helping function for git authors\n",
+    "def create_git_file_extension_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n",
+    "    \"\"\"\n",
+    "    Creates a plotly graph_objects.Treemap marker object for git file extension plots.\n",
+    "    main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n",
+    "    author_rank_data_frame : pd.DataFrame : The DataFrame that contains the file extensions ('fileExtension') and their rank ('fileExtensionCountRank').\n",
+    "    author_column_name : str : The name of the (aggregated) file extension column for coloring the plot\n",
+    "    return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
+    "    \"\"\"\n",
+    "    data_frame_with_merged_rank=pd.merge(\n",
+    "        main_data_frame, \n",
+    "        author_rank_data_frame, \n",
+    "        left_on=author_column_name, \n",
+    "        right_on=\"fileExtension\",\n",
+    "        how=\"left\",\n",
+    "        validate=\"m:1\"\n",
+    "    )\n",
+    "    #display(data_frame_with_merged_rank)\n",
+    "\n",
+    "    return dict(\n",
+    "        cornerradius=5, \n",
+    "        colors=data_frame_with_merged_rank['fileExtensionCountRank'],\n",
+    "        colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
+    "        colorbar=dict(\n",
+    "            title=\"Rank\",\n",
+    "            tickmode=\"array\",\n",
+    "            ticktext=data_frame_with_merged_rank[author_column_name],\n",
+    "            tickvals=data_frame_with_merged_rank['fileExtensionCountRank'],\n",
+    "            tickfont_size=8\n",
+    "        ),\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b2dfe7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+    "    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
+    "    # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
+    "    # values = git_files_with_commit_statistics['fileCount'],\n",
+    "    marker=create_git_file_extension_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_extensions, \"mostFrequentFileExtension\")\n",
+    "))\n",
+    "figure.update_layout(\n",
+    "    **plotly_treemap_layout_base_settings,\n",
+    "    title='Most frequent file extension per directory'\n",
+    ")\n",
+    "figure.show(**plotly_treemap_figure_show_settings)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0ed919b0",