Add wordcloud to git history

JohT · JohT · commit ac6a79f8d551 · 2025-03-17T14:15:38.000+01:00
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -91,16 +91,9 @@
    "source": [
     "def get_cypher_query_from_file(cypher_file_name : str):\n",
     "    with open(cypher_file_name) as file:\n",
-    "        return ' '.join(file.readlines())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "59310f6f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "        return ' '.join(file.readlines())\n",
+    "\n",
+    "\n",
     "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n",
     "    \"\"\"\n",
     "    Execute the Cypher query of the given file and returns the result.\n",
@@ -111,16 +104,9 @@
     "    if limit > 0:\n",
     "        cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n",
     "    records, summary, keys = driver.execute_query(cypher_query)\n",
-    "    return pd.DataFrame([r.values() for r in records], columns=keys)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c09da482",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    return pd.DataFrame([r.values() for r in records], columns=keys)\n",
+    "\n",
+    "\n",
     "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n",
     "    \"\"\"\n",
     "    Executes the Cypher queries of the given files and returns the first result that is not empty.\n",
@@ -986,6 +972,57 @@
     ")\n",
     "figure.show(**plotly_treemap_figure_show_settings)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14e87aff",
+   "metadata": {},
+   "source": [
+    "## Wordcloud of git authors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2f68f02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query data from graph database\n",
+    "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n",
+    "# Debug: uncomment the next line to preview the first rows of the query result\n",
+    "# display(git_author_words_with_frequency.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d83ce5f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from wordcloud import WordCloud\n",
+    "import matplotlib.pyplot as plot\n",
+    "\n",
+    "if not git_author_words_with_frequency.empty:\n",
+    "    # Expects the first column of the DataFrame to contain the words/text and the second column to contain the count/frequency.\n",
+    "    words_with_frequency_dict=git_author_words_with_frequency.set_index(git_author_words_with_frequency.columns[0]).to_dict()[git_author_words_with_frequency.columns[1]]\n",
+    "    wordcloud = WordCloud(\n",
+    "        width=800, \n",
+    "        height=800,\n",
+    "        max_words=600, \n",
+    "        collocations=False,\n",
+    "        background_color='white', \n",
+    "        colormap='viridis'\n",
+    "    ).generate_from_frequencies(words_with_frequency_dict)\n",
+    "\n",
+    "    # Plot the word cloud\n",
+    "    plot.figure(figsize=(15,15))\n",
+    "    plot.imshow(wordcloud, interpolation='bilinear')\n",
+    "    plot.axis(\"off\")\n",
+    "    plot.title('Wordcloud of git authors')\n",
+    "    plot.show()"
+   ]
   }
  ],
  "metadata": {