Add wordcloud to git history

JohT · JohT · commit 7d165bc3af94 · 2025-03-18T07:39:16.000+01:00
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -91,16 +91,9 @@
    "source": [
     "def get_cypher_query_from_file(cypher_file_name : str):\n",
     "    with open(cypher_file_name) as file:\n",
-    "        return ' '.join(file.readlines())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "59310f6f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        return ' '.join(file.readlines())\n",
+    "\n",
+    "\n",
     "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n",
     "    \"\"\"\n",
     "    Execute the Cypher query of the given file and returns the result.\n",
@@ -111,16 +104,9 @@
     "    if limit > 0:\n",
     "        cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n",
     "    records, summary, keys = driver.execute_query(cypher_query)\n",
-    "    return pd.DataFrame([r.values() for r in records], columns=keys)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c09da482",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    return pd.DataFrame([r.values() for r in records], columns=keys)\n",
+    "\n",
+    "\n",
     "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n",
     "    \"\"\"\n",
     "    Executes the Cypher queries of the given files and returns the first result that is not empty.\n",
@@ -625,74 +611,6 @@
     "### Main author per directory"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "29069753",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO delete unused code"
-   ]
-  },
-  {
-   "cell_type": "raw",
-   "id": "7ccca44e",
-   "metadata": {},
-   "source": [
-    "# TODO experiment again with plotly express\n",
-    "\n",
-    "import plotly.express as plotly_express\n",
-    "\n",
-    "plotly_treemap_color_settings = dict(\n",
-    "    color_continuous_scale='Hot_r',  # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n",
-    "    color_discrete_sequence=plotly_express.colors.qualitative.Vivid,\n",
-    ")\n",
-    "plotly_treemap_commit_statistics_custom_data= dict(\n",
-    "    custom_data=['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath'],\n",
-    ")\n",
-    "plotly_treemap_traces_base_settings = dict(\n",
-    "    root_color=\"lightgrey\",\n",
-    "    textinfo=\"label+value\",\n",
-    "    marker=dict(cornerradius=5),\n",
-    ")\n",
-    "plotly_treemap_traces_commit_statistics_settings = dict(\n",
-    "    **plotly_treemap_traces_base_settings,\n",
-    "    hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Main Author: %{customdata[3]}<br>Last Commit: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Created: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Modified: %{customdata[8]} (%{customdata[9]} days ago)<br>Path: %{customdata[10]}',\n",
-    ")\n",
-    "plotly_treemap_layout_base_settings = dict(\n",
-    "    margin=dict(t=50, l=15, r=15, b=15),\n",
-    ")\n",
-    "\n",
-    "# Extract unique authors for category orders\n",
-    "#unique_authors = git_files_with_commit_statistics['mainAuthor'].unique()\n",
-    "\n",
-    "figure = plotly_express.treemap(\n",
-    "    git_files_with_commit_statistics,\n",
-    "    **plotly_treemap_color_settings,\n",
-    "    **plotly_treemap_commit_statistics_custom_data,\n",
-    "    ids='directoryPath',\n",
-    "    names='directoryName',\n",
-    "    parents='directoryParentPath',\n",
-    "    # Without values, much more squares are shown which gives a much better overview\n",
-    "    # values='fileCount', \n",
-    "    color='mainAuthor',\n",
-    "    title='Directories and their main author (discrete coloring, no legend?)',\n",
-    ")\n",
-    "figure.update_traces(\n",
-    "    **plotly_treemap_traces_commit_statistics_settings,\n",
-    ")\n",
-    "figure.update_layout(\n",
-    "    **plotly_treemap_layout_base_settings,\n",
-    "    # coloraxis_colorbar=dict(title=\"Author\"),\n",
-    "    legend_title_text='Main Author',\n",
-    "    showlegend=True,\n",
-    "    legend_visible=True,\n",
-    ") \n",
-    "\n",
-    "figure.show(**plotly_treemap_figure_show_settings)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -986,6 +904,57 @@
     ")\n",
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14e87aff",
+   "metadata": {},
+   "source": [
+    "## WordCloud of git authors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2f68f02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query data from graph database\n",
+    "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n",
+    "# Debug \n",
+    "# display(git_author_words_with_frequency.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d83ce5f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from wordcloud import WordCloud\n",
+    "import matplotlib.pyplot as plot\n",
+    "\n",
+    "if not git_author_words_with_frequency.empty:\n",
+    "    # Expects the first column of the DataFrame to contain the words/text and the second column to contain the count/frequency.\n",
+    "    words_with_frequency_dict=git_author_words_with_frequency.set_index(git_author_words_with_frequency.columns[0]).to_dict()[git_author_words_with_frequency.columns[1]]\n",
+    "    wordcloud = WordCloud(\n",
+    "        width=800, \n",
+    "        height=800,\n",
+    "        max_words=600, \n",
+    "        collocations=False,\n",
+    "        background_color='white', \n",
+    "        colormap='viridis'\n",
+    "    ).generate_from_frequencies(words_with_frequency_dict)\n",
+    "\n",
+    "    # Plot the word cloud\n",
+    "    plot.figure(figsize=(15,15))\n",
+    "    plot.imshow(wordcloud, interpolation='bilinear')\n",
+    "    plot.axis(\"off\")\n",
+    "    plot.title('Wordcloud of git authors')\n",
+    "    plot.show()"
+   ]
   }
  ],
  "metadata": {