Add most frequent file extension treemap

JohT · JohT · commit 0e7c6453cf46 · 2025-03-22T09:42:51.000+01:00
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -239,14 +239,46 @@
     "        labels=data_frame['directoryName'],\n",
     "        parents=data_frame['directoryParentPath'],\n",
     "        ids=data_frame['directoryPath'],\n",
-    "        customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
-    "        hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]},  %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n",
+    "        customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n",
+    "        hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]},  %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n",
     "        maxdepth=-1,\n",
     "        root_color=\"lightgrey\",\n",
     "        marker=dict(**plotly_treemap_marker_base_style),\n",
     "    )"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "641fa05c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):\n",
+    "    \"\"\"\n",
+    "    Creates a plotly graph_objects.Treemap marker object for a colorbar representing ranked names.\n",
+    "    data_frame : pd.DataFrame : The DataFrame that contains the name and the rank column\n",
+    "    name_column : str : The name of the column containing the names shown as colorbar tick labels\n",
+    "    rank_column : str : The name of the column containing the ranking \n",
+    "    return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
+    "    \"\"\"\n",
+    "    # The rank is inverted so that the first rank is shown on the top of the colorbar.\n",
+    "    inverse_ranked = data_frame[rank_column].max() + 1 - data_frame[rank_column]\n",
+    "\n",
+    "    return dict(\n",
+    "        cornerradius=5, \n",
+    "        colors=inverse_ranked,\n",
+    "        colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
+    "        colorbar=dict(\n",
+    "            title=\"Rank\",\n",
+    "            tickmode=\"array\",\n",
+    "            ticktext=data_frame[name_column],\n",
+    "            tickvals=inverse_ranked,\n",
+    "            tickfont_size=10\n",
+    "        ),\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "acacc415",
@@ -312,6 +344,41 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "def get_last_entry(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Gets the last element of an array and thereby converts an array into a single element.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : any : The last entry\n",
+    "    \"\"\"\n",
+    "    return values[-1]\n",
+    "\n",
+    "\n",
+    "def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n",
+    "    \"\"\"\n",
+    "    Adds a fileExtension column to the input DataFrame based on the file path column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    file_path_column : str : The name of the file path column\n",
+    "    file_extension_column : str : The name of the file extension column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with the added file extension column\n",
+    "    \"\"\"\n",
+    "    if file_extension_column in input_dataframe.columns:\n",
+    "        return input_dataframe # Column already exists\n",
+    "    \n",
+    "    # What is the correct extension in the following cases?\n",
+    "    #  - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n",
+    "    #  - MyReactComponent.test.tsx\n",
+    "    # Currently, it would be\n",
+    "    #  - HandlerEnhancerDefinition\n",
+    "    #  - tsx\n",
+    "    # which is not perfect but good enough to start with.\n",
+    "    \n",
+    "    file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n",
+    "    file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n",
+    "    file_extensions=file_extensions.str.split('.').map(get_last_entry)\n",
+    "    input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n",
+    "    return input_dataframe\n",
+    "\n",
     "def remove_last_file_path_element(file_path_elements: list) -> list:\n",
     "    \"\"\"\n",
     "    Removes the last element of the file path so that only the directory names retain.\n",
@@ -378,6 +445,16 @@
     "    input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
     "    return input_dataframe\n",
     "\n",
+    "\n",
+    "def collect_as_array(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Just collect all values (no operation, \"noop\")\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : np.ndarray : All values collected into one array\n",
+    "    \"\"\"\n",
+    "    return np.asanyarray(values.to_list())\n",
+    "\n",
     "def second_entry(values: pd.Series):\n",
     "    \"\"\"\n",
     "    Returns the second entry of a list of values.\n",
@@ -394,7 +471,22 @@
     "    values : Series : The pandas Series of values\n",
     "    return : int : The number of files\n",
     "    \"\"\"\n",
-    "    return len(np.unique(np.concatenate(values.to_list())))"
+    "    return len(np.unique(np.concatenate(values.to_list())))\n",
+    "\n",
+    "\n",
+    "def get_most_frequent_entry(input_values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Flattens the array of arrays and returns the most frequent entry.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    input_values : Series : The pandas Series of values\n",
+    "    return : str : The most frequent entry\n",
+    "    \"\"\"\n",
+    "    # flatten the array of arrays \n",
+    "    values = np.concatenate(input_values.to_list())\n",
+    "    # find frequency of each value\n",
+    "    values, counts = np.unique(values, return_counts=True)\n",
+    "    # return the value with the highest frequency\n",
+    "    return values[counts.argmax()]"
    ]
   },
   {
@@ -419,7 +511,7 @@
     "git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n",
     "    authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
     "    ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n",
-    "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=True, method='dense').astype(int)\n",
+    "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)\n",
     "\n",
     "# Debug\n",
     "# display(git_file_authors)\n",
@@ -428,11 +520,21 @@
     "# display(\"1. query result ---------------------\")\n",
     "# display(git_files_with_commit_statistics)\n",
     "\n",
+    "# Add new column 'fileExtension' for every 'filePath'\n",
+    "git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n",
+    "\n",
+    "# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n",
+    "git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n",
+    "git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=False, method='dense').astype(int)\n",
+    "\n",
+    "# Debug\n",
+    "# display(git_file_extensions)\n",
+    "\n",
     "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
     "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
     "\n",
     "# Debug\n",
-    "# display(\"2. added directoryPath --------------\")\n",
+    "# display(\"2. added directoryPath and fileExtension --------------\")\n",
     "# display(git_files_with_commit_statistics)\n",
     "\n",
     "# Define how common non-grouped columns will be aggregated.\n",
@@ -452,6 +554,7 @@
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
     "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
     "    firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
+    "    fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "\n",
@@ -469,6 +572,7 @@
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
     "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
     "    firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
+    "    mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
     "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
     "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
     "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
@@ -558,6 +662,42 @@
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e93d944a",
+   "metadata": {},
+   "source": [
+    "### Most frequent file extension per directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0147c747",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "git_files_with_commit_statistics_and_file_extension_rank = pd.merge(\n",
+    "    git_files_with_commit_statistics, \n",
+    "    git_file_extensions, \n",
+    "    left_on='mostFrequentFileExtension', \n",
+    "    right_on=\"fileExtension\",\n",
+    "    how=\"left\",\n",
+    "    validate=\"m:1\"\n",
+    ")\n",
+    "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+    "    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
+    "    # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
+    "    # values = git_files_with_commit_statistics['fileCount'],\n",
+    "    marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_file_extension_rank, 'fileExtension', 'fileExtensionCountRank')\n",
+    "))\n",
+    "figure.update_layout(\n",
+    "    **plotly_treemap_layout_base_settings,\n",
+    "    title='Most frequent file extension per directory'\n",
+    ")\n",
+    "figure.show(**plotly_treemap_figure_show_settings)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "e98ca7b1",
@@ -634,63 +774,30 @@
     "### Main author per directory"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "259f7278",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def create_git_authors_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n",
-    "    \"\"\"\n",
-    "    Creates a plotly graph_objects.Treemap marker object for git author plots.\n",
-    "    main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n",
-    "    author_rank_data_frame : pd.DataFrame : The DataFrame that contains the git authors, their commit count and based on that their rank.\n",
-    "    author_column_name : str : The name of the (aggregated) author column for coloring the plot\n",
-    "    return : plotly_graph_objects.treemap.Marker : The created Marker object\n",
-    "    \"\"\"\n",
-    "    data_frame_with_authors=pd.merge(\n",
-    "        main_data_frame, \n",
-    "        author_rank_data_frame, \n",
-    "        left_on=author_column_name, \n",
-    "        right_on=\"author\",\n",
-    "        how=\"left\",\n",
-    "        validate=\"m:1\"\n",
-    "    )\n",
-    "    #display(data_frame_with_author_ranks)\n",
-    "\n",
-    "    data_frame_with_author_ranks=data_frame_with_authors['authorCommitCountRank']\n",
-    "\n",
-    "    return dict(\n",
-    "        cornerradius=5, \n",
-    "        colors=data_frame_with_author_ranks,\n",
-    "        colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n",
-    "        colorbar=dict(\n",
-    "            title=\"Rank\",\n",
-    "            tickmode=\"array\",\n",
-    "            ticktext=data_frame_with_authors[author_column_name],\n",
-    "            tickvals=data_frame_with_author_ranks,\n",
-    "            tickfont_size=8\n",
-    "        ),\n",
-    "    )\n"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "e97c0d87",
    "metadata": {},
    "outputs": [],
    "source": [
+    "git_files_with_commit_statistics_and_main_author_rank = pd.merge(\n",
+    "    git_files_with_commit_statistics, \n",
+    "    git_file_authors, \n",
+    "    left_on='mainAuthor', \n",
+    "    right_on=\"author\",\n",
+    "    how=\"left\",\n",
+    "    validate=\"m:1\"\n",
+    ")\n",
     "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
     "    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
     "    # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
     "    # values = git_files_with_commit_statistics['fileCount'],\n",
-    "    marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"mainAuthor\")\n",
+    "    marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_main_author_rank, 'mainAuthor', 'authorCommitCountRank')\n",
     "))\n",
     "figure.update_layout(\n",
     "    **plotly_treemap_layout_base_settings,\n",
-    "    title='Main author (highest number of commits)'\n",
+    "    title='Main authors with highest number of commits'\n",
     ")\n",
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]
@@ -710,15 +817,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "git_files_with_commit_statistics_and_second_author_rank = pd.merge(\n",
+    "    git_files_with_commit_statistics, \n",
+    "    git_file_authors, \n",
+    "    left_on='secondAuthor', \n",
+    "    right_on=\"author\",\n",
+    "    how=\"left\",\n",
+    "    validate=\"m:1\"\n",
+    ")\n",
+    "\n",
     "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
     "    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
     "    # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n",
     "    # values = git_files_with_commit_statistics['fileCount'],\n",
-    "    marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"secondAuthor\")\n",
+    "    marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_second_author_rank, 'secondAuthor', 'authorCommitCountRank')\n",
     "))\n",
     "figure.update_layout(\n",
     "    **plotly_treemap_layout_base_settings,\n",
-    "    title='Second author (second highest number of commits)'\n",
+    "    title='Second author with the second highest number of commits'\n",
     ")\n",
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]