fixup! Add git history file overview treemap

JohT · JohT · commit 67cfcead107e · 2025-03-11T07:47:59.000+01:00
diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher
@@ -0,0 +1,24 @@
+// List git files with commit statistics by author
+
+ MATCH (git_file:File&Git&!Repository)
+ WHERE git_file.deletedAt IS NULL // filter out deleted files
+  WITH percentileDisc(git_file.createdAtEpoch, 0.5)          AS medianCreatedAtEpoch
+      ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch
+      ,collect(git_file)                                     AS git_files
+UNWIND git_files AS git_file
+  WITH *
+      ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch))                                            AS fileCreatedAtTimestamp
+      ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp
+ MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
+ MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file)
+RETURN git_repository.name + '/' + git_file.relativePath AS filePath
+      ,split(git_commit.author, ' <')[0]                 AS author
+      ,count(DISTINCT git_commit.sha)                    AS commitCount
+      ,date(max(git_commit.date))                        AS lastCommitDate
+      ,max(date(fileCreatedAtTimestamp))                 AS lastCreationDate
+      ,max(date(fileLastModificationAtTimestamp))        AS lastModificationDate
+      ,duration.inDays(date(max(git_commit.date)), date()).days               AS daysSinceLastCommit
+      ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days          AS daysSinceLastCreation
+      ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification
+      ,max(git_commit.sha)                               AS maxCommitSha
+ORDER BY filePath ASCENDING, commitCount DESCENDING
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -423,6 +423,203 @@
     "    return input_dataframe.copy().join(dataframe_split), dataframe_split.columns.tolist()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "da109679",
+   "metadata": {},
+   "source": [
+    "### File Data Preparation Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "299b06ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_last_file_path_element(file_path_elements: list) -> list:\n",
+    "    \"\"\"\n",
+    "    Removes the last element of the file path so that only the directory names remain.\n",
+    "    file_path_elements : list : The list of file path elements (levels) whose last element gets removed\n",
+    "    return : list : The list of the directories\n",
+    "    \"\"\"\n",
+    "    return file_path_elements[:-1] if len(file_path_elements) > 1 else ['']\n",
+    "\n",
+    "def convert_path_elements_to_directories(file_path_elements: list) -> list:\n",
+    "    \"\"\"\n",
+    "    Converts the file path elements into directories.\n",
+    "    file_path_elements : list : The list of levels to convert\n",
+    "    return : list : The list of directories\n",
+    "    \"\"\"\n",
+    "    directories = remove_last_file_path_element(file_path_elements)\n",
+    "    return ['/'.join(directories[:i+1]) for i in range(len(directories))]\n",
+    "\n",
+    "def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath'):\n",
+    "    \"\"\"\n",
+    "    Adds a directory column to a copy of the input DataFrame based on the file path column (one row per directory level).\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame (left unmodified)\n",
+    "    file_path_column : str : The name of the file path column\n",
+    "    directory_column : str : The name of the directory column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with added directory column\n",
+    "    \"\"\"\n",
+    "    if directory_column in input_dataframe.columns:\n",
+    "        return input_dataframe # Column already exists\n",
+    "    \n",
+    "    result = input_dataframe.copy() # Avoid leaving a non-exploded list column in the caller's DataFrame\n",
+    "    result.insert(0, directory_column, result[file_path_column].str.split('/').apply(convert_path_elements_to_directories))\n",
+    "    return result.explode(directory_column)\n",
+    "\n",
+    "def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName'):\n",
+    "    \"\"\"\n",
+    "    Adds a directory name column to the input DataFrame based on the directory column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    directory_column : str : The name of the directory column\n",
+    "    directory_name_column : str : The name of the directory name column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with added directory name column\n",
+    "    \"\"\"\n",
+    "    if directory_name_column in input_dataframe.columns:\n",
+    "        return input_dataframe # Column already exists\n",
+    "    \n",
+    "    splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n",
+    "    input_dataframe.insert(1, directory_name_column, splitted_directories.apply(lambda x: (x[-1])))\n",
+    "    return input_dataframe\n",
+    "\n",
+    "def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath'):\n",
+    "    \"\"\"\n",
+    "    Adds a directory parent column to the input DataFrame based on the directory column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    directory_column : str : The name of the directory column\n",
+    "    directory_parent_column : str : The name of the directory parent column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with added directory parent column\n",
+    "    \"\"\"\n",
+    "    if directory_parent_column in input_dataframe.columns:\n",
+    "        return input_dataframe # Column already exists\n",
+    "    \n",
+    "    # Remove last path element from directory_column to get the directory_parent_column\n",
+    "    splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n",
+    "    input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n",
+    "    \n",
+    "    # Clear parent (set to empty string) when it is equal to the directory\n",
+    "    input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
+    "    return input_dataframe\n",
+    "\n",
+    "def second_entry(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Returns the second entry of a list of values.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : any : The second entry\n",
+    "    \"\"\"\n",
+    "    return values.iloc[1] if len(values) > 1 else None\n",
+    "\n",
+    "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
+    "    \"\"\"\n",
+    "    Returns the file count from an array of arrays of file paths.\n",
+    "    Meant to be used as an aggregation function for dataframe grouping.\n",
+    "    values : Series : The pandas Series of values\n",
+    "    return : int : The number of files\n",
+    "    \"\"\"\n",
+    "    return len(np.unique(np.concatenate(values.to_list())))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09aeae9b",
+   "metadata": {},
+   "source": [
+    "### File Data Preparation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "682d8aa9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "git_files_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher\")\n",
+    "\n",
+    "display(\"1. query result ---------------------\")\n",
+    "display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# Add multiple rows for each file path containing all its directory paths in the new column 'directoryPath'\n",
+    "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
+    "\n",
+    "display(\"2. added directoryPath --------------\")\n",
+    "display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# Define how common non-grouped columns will be aggregated.\n",
+    "common_named_aggregation = dict(\n",
+    "    commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
+    "    daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
+    "    daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
+    "    daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
+    "    lastCommitDate=pd.NamedAgg(column=\"lastCommitDate\", aggfunc=\"max\"),\n",
+    "    lastCreationDate=pd.NamedAgg(column=\"lastCreationDate\", aggfunc=\"max\"),\n",
+    "    lastModificationDate=pd.NamedAgg(column=\"lastModificationDate\", aggfunc=\"max\"),\n",
+    "    maxCommitSha=pd.NamedAgg(column=\"maxCommitSha\", aggfunc=\"max\"),\n",
+    ")\n",
+    "\n",
+    "# Group the git files by their directory and author and collect the unique file paths of each directory (across all levels).\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
+    "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
+    "    **common_named_aggregation\n",
+    ")\n",
+    "\n",
+    "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
+    "# The author with the most commits will then be listed first for each directory.\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
+    "\n",
+    "display(\"3. grouped by 'directoryPath' and 'author' -----\")\n",
+    "display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
+    "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
+    "    # fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=lambda x: len(np.unique(np.concatenate(x.to_list())))),\n",
+    "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
+    "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
+    "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
+    "    **common_named_aggregation\n",
+    ")\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
+    "\n",
+    "display(\"4. grouped by 'directoryPath' ----------------------\")\n",
+    "display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# git_files_with_commit_statistics['fileCount'] = (git_files_with_commit_statistics['fileCount'] / git_files_with_commit_statistics['authorCount']).astype(int)\n",
+    "\n",
+    "# display(\"4b. fixed file count ----------------------\")\n",
+    "# display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
+    "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics)\n",
+    "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)\n",
+    "\n",
+    "display(\"5. added parent and name columns ------------\")\n",
+    "display(git_files_with_commit_statistics)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "114f8d4b",
+   "metadata": {},
+   "source": [
+    "### File Data Preview"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc0c2d06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "git_files_with_commit_statistics.head(20)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2d0df211",
@@ -504,6 +701,26 @@
     "### Directories by file count"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc0dc138",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+    "    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
+    "    values = git_files_with_commit_statistics['fileCount'],\n",
+    "    # create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
+    "    # values = git_file_directories_with_commit_statistics['fileCount'],\n",
+    "))\n",
+    "figure.update_layout(\n",
+    "    **plotly_treemap_layout_base_settings,\n",
+    "    title='Directories and their file count'\n",
+    ")\n",
+    "figure.show()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -512,6 +729,8 @@
    "outputs": [],
    "source": [
     "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+    "    # create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
+    "    # values = git_files_with_commit_statistics['fileCount'],\n",
     "    create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
     "    values = git_file_directories_with_commit_statistics['fileCount'],\n",
     "))\n",