diff --git a/COMMANDS.md b/COMMANDS.md index a785a91ba..62d6c8ef0 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -264,7 +264,7 @@ Here is the resulting schema: #### Parameter -The optional parameter `--source directory-path-to-the-source-folder-containing-git-repositories` can be used to select a different directory for the repositories. By default, the `source` directory within the analysis workspace directory is used. This command only needs the git history to be present. Therefore, `git clone --bare` is sufficient. If the `source` directory is also used for code analysis (like for Typescript) then a full git clone is of course needed. +The optional parameter `--source directory-path-to-the-source-folder-containing-git-repositories` can be used to select a different directory for the repositories. By default, the `source` directory within the analysis workspace directory is used. This command only needs the git history to be present. Therefore, `git clone --bare` is sufficient. If the `source` directory is also used for code analysis (like for Typescript) then a full git clone is of course needed. Additionally, if you want to focus on a specific version or branch, use `--branch branch-name` to check out the branch and `--single-branch` to exclude other branches before importing the git log data. #### Environment Variable diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index 5fb5dd58d..9f138d666 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -66,7 +66,7 @@ Use these optional command line options as needed: - If you want to analyze Typescript code, create a symbolic link inside the `source` directory that points to the Typescript project. Alternatively you can also copy the project into the `source` directory. -- If you want to include git data like changed files and authors, create a symbolic link inside the `source` directory that points to the repository or clone it into the `source` directory. If you already have your Typescript project in there, you of course don't have to do it twice. If you are analyzing Java artifacts (full source not needed), it is sufficient to use a bare clone that only contains the git history without the sources using `git clone --bare`. +- If you want to include git data like changed files and authors, create a symbolic link inside the `source` directory that points to the repository or clone it into the `source` directory. If you already have your Typescript project in there, you of course don't have to do it twice. If you are analyzing Java artifacts (full source not needed), it is sufficient to use a bare clone that only contains the git history without the sources using `git clone --bare`. If you want to focus on one branch, use `--branch branch-name` to check out the branch and `--single-branch` to only fetch the history of that branch (see the sketch below).
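For example, the clone described above can be scripted before running the analysis. A minimal sketch in Python, assuming a hypothetical repository URL, the placeholder branch name `main`, and the default `source` directory of the analysis workspace:

```python
# Minimal sketch (hypothetical URL, branch name and target path): create a bare,
# single-branch clone inside the "source" directory of the analysis workspace.
import subprocess

subprocess.run(
    [
        "git", "clone", "--bare",
        "--branch", "main",    # focus on one branch (placeholder name)
        "--single-branch",     # only fetch the history of that branch
        "https://github.com/example/repository.git",  # placeholder URL
        "source/repository.git",  # bare clones conventionally use a ".git" suffix
    ],
    check=True,  # raise CalledProcessError if the clone fails
)
```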
- Alternatively to the steps above, run an already predefined download script diff --git a/README.md b/README.md index c3295bb51..6bba94412 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an - [numpy](https://numpy.org) - [pandas](https://pandas.pydata.org) - [pip](https://pip.pypa.io/en/stable) + - [plotly](https://plotly.com/python) - [monotonic](https://github.com/atdt/monotonic) - [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver) - [openTSNE](https://github.com/pavlin-policar/openTSNE) diff --git a/cypher/GitLog/List_git_files_per_commit_distribution.cypher b/cypher/GitLog/List_git_files_per_commit_distribution.cypher new file mode 100644 index 000000000..a15fb2994 --- /dev/null +++ b/cypher/GitLog/List_git_files_per_commit_distribution.cypher @@ -0,0 +1,6 @@ +// List how many git commits changed one file, how many changed two files, and so on. + +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[]->(git_file:Git:File) + WITH git_commit, count(DISTINCT git_file.relativePath) AS filesPerCommit +RETURN filesPerCommit, count(DISTINCT git_commit.sha) AS commitCount +ORDER BY filesPerCommit ASC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher new file mode 100644 index 000000000..34d4e18fc --- /dev/null +++ b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher @@ -0,0 +1,24 @@ +// List git files with commit statistics + + MATCH (git_file:File&Git&!Repository) + WHERE git_file.deletedAt IS NULL // filter out deleted files + WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch + ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch + ,collect(git_file) AS git_files +UNWIND git_files AS git_file + WITH * + ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp + ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp + MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) + MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file) +RETURN git_repository.name + '/' + git_file.relativePath AS filePath + ,split(git_commit.author, ' <')[0] AS author + ,count(DISTINCT git_commit.sha) AS commitCount + ,date(max(git_commit.date)) AS lastCommitDate + ,max(date(fileCreatedAtTimestamp)) AS lastCreationDate + ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate + ,duration.inDays(date(max(git_commit.date)), date()).days AS daysSinceLastCommit + ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation + ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification + ,max(git_commit.sha) AS maxCommitSha +ORDER BY filePath ASCENDING, commitCount DESCENDING \ No newline at end of file diff --git a/cypher/Validation/ValidateGitHistory.cypher b/cypher/Validation/ValidateGitHistory.cypher new file mode 100644 index 000000000..2ca463310 --- /dev/null +++ b/cypher/Validation/ValidateGitHistory.cypher @@ -0,0 +1,6 @@ +// Check if there is at least one Git:Commit pointing to a Git:Change containing a Git:File from a Git:Repository + + MATCH 
(commit:Git:Commit)-[:CONTAINS_CHANGE]->(change:Git:Change)-->(file:Git:File) + MATCH (repository:Git:Repository)-[:HAS_FILE]->(file) +RETURN commit.sha AS commitSha + LIMIT 1 \ No newline at end of file diff --git a/jupyter/ExternalDependenciesJava.ipynb b/jupyter/ExternalDependenciesJava.ipynb index b35645241..0c533f790 100644 --- a/jupyter/ExternalDependenciesJava.ipynb +++ b/jupyter/ExternalDependenciesJava.ipynb @@ -44,23 +44,16 @@ }, { "cell_type": "code", - "execution_count": 235, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1735,7 +1728,7 @@ "celltoolbar": "Tags", "code_graph_analysis_pipeline_data_validation": "ValidateJavaExternalDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -1749,7 +1742,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "External Dependencies for Java" }, diff --git a/jupyter/ExternalDependenciesTypescript.ipynb b/jupyter/ExternalDependenciesTypescript.ipynb index a6d80e822..c8de0f004 100644 --- a/jupyter/ExternalDependenciesTypescript.ipynb +++ b/jupyter/ExternalDependenciesTypescript.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1638,7 +1631,7 @@ "celltoolbar": "Tags", "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -1652,7 +1645,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "External Dependencies for Typescript" }, diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb new file mode 100644 index 000000000..fc4a1665a --- /dev/null +++ b/jupyter/GitHistoryGeneral.ipynb @@ -0,0 +1,1194 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# git log/history\n", + "
\n", + "\n", + "### References\n", + "- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)\n", + "- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "#pd.options.mode.copy_on_write = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c57aadf9", + "metadata": {}, + "outputs": [], + "source": [ + "from neo4j import GraphDatabase\n", + "from plotly import graph_objects as plotly_graph_objects\n", + "from plotly.express import colors as plotly_colors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740e64d9", + "metadata": {}, + "outputs": [], + "source": [ + "# To be able to distinguish between command line execution and Jupyter notebook execution\n", + "# we need to check if the environment variable NBCONVERT is set.\n", + "# The command line execution is required to take care of setting NBCONVERT.\n", + "\n", + "# Note: Even if it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)\n", + "# for command line executed notebooks (via nbconvert),\n", + "# it would require to execute the notebook twice: Once including interactivity and once for static Markdown and PDF.\n", + "# Therefore, command line executed notebooks (nbconvert) will contain static graphics (here using svg).\n", + "def is_command_line_execution():\n", + " return 'NBCONVERT' in os.environ\n", + "\n", + "default_renderer = None\n", + "\n", + "if is_command_line_execution():\n", + " print(\"Command line execution (CLI mode): Yes\")\n", + " default_renderer = 'svg' # SVG is the default renderer for static (non interactive) pictures for command line execution\n", + "else:\n", + " print(\"Command line execution (CLI mode): No\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", + " cypher_query = get_cypher_query_from_file(filename)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", + " records, summary, keys = driver.execute_query(cypher_query)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will appended to query so that only the first results get returned.\n", + " \"\"\" \n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, limit)\n", + " if not result.empty:\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a56670c9", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006b9dc8", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6323e85e", + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas DataFrame Display Configuration\n", + "pd.set_option('display.max_colwidth', 500)" + ] + }, + { + "cell_type": "markdown", + "id": "fe17f2aa", + "metadata": {}, + "source": [ + "## Git History - Directory Commit Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f37df7c", + "metadata": {}, + "outputs": [], + "source": [ + "# The first part provides functions that provide basic functionality for the following parts." 
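As a usage illustration for the helpers above, here is a hedged sketch. It assumes the notebook's `driver`, the pandas import, and the query file added in this change set are available:

```python
# Hypothetical usage of query_cypher_to_data_frame as defined above:
# run a Cypher query file and keep only the first 100 rows.
git_files_preview = query_cypher_to_data_frame(
    "../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher",
    limit=100,  # appended to the query text as "LIMIT 100"
)
print(git_files_preview.head())
```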
+ ] + }, + { + "cell_type": "markdown", + "id": "01da524e", + "metadata": {}, + "source": [ + "### Treemap Layout Functions and Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "841967e5", + "metadata": {}, + "outputs": [], + "source": [ + "# Base settings for Plotly Treemap\n", + "\n", + "plotly_main_layout_base_settings = dict(\n", + " margin=dict(t=50, l=15, r=15, b=15),\n", + ")\n", + "plotly_treemap_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_bar_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_treemap_figure_show_settings = dict(\n", + " renderer=\"svg\" if is_command_line_execution() else None,\n", + " width=1000,\n", + " height=800\n", + ")\n", + "\n", + "plotly_treemap_marker_base_style = dict(\n", + " cornerradius=5, \n", + ")\n", + "\n", + "plotly_treemap_marker_base_colorscale = dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colorscale='Hot_r', # Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8cc624a", + "metadata": {}, + "outputs": [], + "source": [ + "def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame):\n", + " \"\"\"\n", + " Creates a Plotly Treemap with the given settings and data frame.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " return : plotly_graph_objects.Treemap : The prepared Plotly Treemap\n", + " \"\"\"\n", + " return plotly_graph_objects.Treemap(\n", + " labels=data_frame['directoryName'],\n", + " parents=data_frame['directoryParentPath'],\n", + " ids=data_frame['directoryPath'],\n", + " customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", + " hovertemplate='%{label}<br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>Path: %{customdata[12]}',\n", + " maxdepth=-1,\n", + " root_color=\"lightgrey\",\n", + " marker=dict(**plotly_treemap_marker_base_style),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641fa05c", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):\n", + " \"\"\"\n", + " Creates a plotly graph_objects.Treemap marker object for a colorbar representing ranked names.\n", + " data_frame : pd.DataFrame : The DataFrame that contains the name and the count column\n", + " name_column : str : The name of the column containing the names\n", + " rank_column : str : The name of the column containing the ranking \n", + " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", + " \"\"\"\n", + " # The rank is inverted so that the first rank is shown on the top of the colorbar.\n", + " inverse_ranked = data_frame[rank_column].max() + 1 - data_frame[rank_column]\n", + "\n", + " return dict(\n", + " cornerradius=5, \n", + " colors=inverse_ranked,\n", + " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", + " colorbar=dict(\n", + " title=\"Rank\",\n", + " tickmode=\"array\",\n", + " ticktext=data_frame[name_column],\n", + " tickvals=inverse_ranked,\n", + " tickfont_size=10\n", + " ),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "acacc415", + "metadata": {}, + "source": [ + "### Visualization Data Preparation Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83077395", + "metadata": {}, + "outputs": [], + "source": [ + "def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n", + " \"\"\"\n", + " Limits the values of the given column in the input data frame to the given quantile.\n", + " The values are not filtered out but set to the limit (the quantile value).\n", + " input_data_frame : pd.DataFrame : The input data frame\n", + " column_name : str : The name of the column to limit\n", + " quantile : float : The quantile to limit the values to (default: 0.95)\n", + " return : pd.DataFrame : The modified dataframe with the added column (column_name + '_limited')\n", + " \"\"\"\n", + " data_frame=input_data_frame.copy()\n", + " column_values = data_frame[column_name]\n", + " column_limit = column_values.quantile(quantile)\n", + " data_frame[column_name + '_limited'] = np.where(column_values > column_limit, column_limit, column_values)\n", + " return data_frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f9060d", + "metadata": {}, + "outputs": [], + "source": [ + "def add_rank_column(input_data_frame : pd.DataFrame, column_name : str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Adds a rank column (\"dense\" mode) to the input data frame based on the given column name.\n", + " input_data_frame : pd.DataFrame : The input data frame\n", + " column_name : str : The name of the column to rank\n", + " return : pd.DataFrame : The modified dataframe with the added rank column\n", + " \"\"\"\n", + " data_frame=input_data_frame.copy()\n", + " data_frame[column_name + '_rank'] = data_frame[column_name].rank(ascending=True, method='dense')\n", + " return data_frame" + ] + }, + { + "cell_type": "markdown", + "id": "da109679", + "metadata": {}, + "source": [ + "### File Data Preparation Functions" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "299b06ea", + "metadata": {}, + "outputs": [], + "source": [ + "def get_last_entry(values: pd.Series):\n", + " \"\"\"\n", + " Get the last element of an array and converts therefore an array to a single element\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The last entry\n", + " \"\"\"\n", + " return values[-1]\n", + "\n", + "\n", + "def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n", + " \"\"\"\n", + " Adds a fileExtension column to the input DataFrame based on the file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " file_extension_column : str : The name of the file extension column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", + " \"\"\"\n", + " if file_extension_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # What is the correct extension in the following cases?\n", + " # - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n", + " # - MyReactComponent.test.tsx\n", + " # Currently, it would be\n", + " # - HandlerEnhancerDefinition\n", + " # - tsx\n", + " # which is not perfect but good enough to start with.#\n", + " \n", + " file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n", + " file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n", + " file_extensions=file_extensions.str.split('.').map(get_last_entry)\n", + " input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n", + " return input_dataframe\n", + "\n", + "def remove_last_file_path_element(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Removes the last element of the file path so that only the directory names retain.\n", + " file_path_elements : list : The list of levels to remove\n", + " return : list : The list of the directories\n", + " \"\"\"\n", + " return file_path_elements[:-1] if len(file_path_elements) > 1 else ['']\n", + "\n", + "def convert_path_elements_to_directories(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Converts the file path elements into directories.\n", + " file_path_elements : list : The list of levels to convert\n", + " return : list : The list of directories\n", + " \"\"\"\n", + " directories = remove_last_file_path_element(file_path_elements)\n", + " return ['/'.join(directories[:i+1]) for i in range(len(directories))]\n", + "\n", + "def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath'):\n", + " \"\"\"\n", + " Adds a directory column to the input DataFrame based on the file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " directory_column : str : The name of the directory column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", + " \"\"\"\n", + " if directory_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " input_dataframe.insert(0, directory_column, input_dataframe[file_path_column].str.split('/').apply(convert_path_elements_to_directories))\n", + " 
input_dataframe = input_dataframe.explode(directory_column)\n", + " return input_dataframe\n", + "\n", + "def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName'):\n", + " \"\"\"\n", + " Adds a directory name column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_name_column : str : The name of the directory name column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory name column\n", + " \"\"\"\n", + " if directory_name_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_name_column, splitted_directories.apply(lambda x: (x[-1])))\n", + " return input_dataframe\n", + "\n", + "def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath'):\n", + " \"\"\"\n", + " Adds a directory parent column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_parent_column : str : The name of the directory parent column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory parent column\n", + " \"\"\"\n", + " if directory_parent_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # Remove last path element from directory_column to get the directory_parent_column\n", + " splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n", + " \n", + " # Clear parent (set to empty string) when it equal to the directory\n", + " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n", + " return input_dataframe\n", + "\n", + "\n", + "def collect_as_array(values: pd.Series):\n", + " \"\"\"\n", + " Just collect all values (no operation, \"noop\")\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The second entry\n", + " \"\"\"\n", + " return np.asanyarray(values.to_list())\n", + "\n", + "def second_entry(values: pd.Series):\n", + " \"\"\"\n", + " Returns the second entry of a list of values.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The second entry\n", + " \"\"\"\n", + " return values.iloc[1] if len(values) > 1 else None\n", + "\n", + "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n", + " \"\"\"\n", + " Return the file count from an array of array of file paths.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : int : The number of files\n", + " \"\"\"\n", + " return len(np.unique(np.concatenate(values.to_list())))\n", + "\n", + "\n", + "def get_most_frequent_entry(input_values: pd.Series):\n", + " \"\"\"\n", + " Flattens the array of arrays 
and return the most frequent entry .\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " input_values : Series : The pandas Series of values\n", + " return : str : The most frequent entry\n", + " \"\"\"\n", + " # flatten the array of arrays \n", + " values = np.concatenate(input_values.to_list())\n", + " # find frequency of each value\n", + " values, counts = np.unique(values, return_counts=True)\n", + " #display all values with highest frequencies\n", + " return values[counts.argmax()]" + ] + }, + { + "cell_type": "markdown", + "id": "09aeae9b", + "metadata": {}, + "source": [ + "### File Data Preparation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "682d8aa9", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher\")\n", + "\n", + "# Get all authors, their commit count and based on it their rank in a separate dataframe.\n", + "# This will then be needed to visualize the (main) author for each directory.\n", + "git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n", + " authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", + " ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n", + "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)\n", + "\n", + "# Debug\n", + "# display(git_file_authors)\n", + "\n", + "# Debug\n", + "# display(\"1. query result ---------------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Add new column 'fileExtension' for every 'filePath'\n", + "git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n", + "\n", + "# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n", + "git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n", + "git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=False, method='dense').astype(int)\n", + "\n", + "# Debug\n", + "# display(git_file_extensions)\n", + "\n", + "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n", + "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n", + "\n", + "# Debug\n", + "# display(\"2. 
added directoryPath and fileExtension --------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Define how common non-grouped columns will be aggregated.\n", + "# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n", + "common_named_aggregation = dict(\n", + " commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", + " daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n", + " daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n", + " daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n", + " lastCommitDate=pd.NamedAgg(column=\"lastCommitDate\", aggfunc=\"max\"),\n", + " lastCreationDate=pd.NamedAgg(column=\"lastCreationDate\", aggfunc=\"max\"),\n", + " lastModificationDate=pd.NamedAgg(column=\"lastModificationDate\", aggfunc=\"max\"),\n", + " maxCommitSha=pd.NamedAgg(column=\"maxCommitSha\", aggfunc=\"max\"),\n", + ")\n", + "\n", + "# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n", + " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", + " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", + " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n", + " **common_named_aggregation\n", + ")\n", + "\n", + "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n", + "# The author with the most commits will then be listed first for each directory.\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", + "\n", + "# Debug\n", + "# display(\"3. grouped by 'directoryPath' and 'author' -----\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Group the entries again, now only by their directory path, to get the aggregated number of authors, the main author and the second author.\n", + "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n", + " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", + " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n", + " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n", + " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n", + " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n", + " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n", + " **common_named_aggregation\n", + ")\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", + "\n", + "# Debug\n", + "# display(\"4. 
grouped by 'directoryPath' ----------------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n", + "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n", + "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n", + "\n", + "# Debug\n", + "# display(\"5. added parent and name columns ------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Group finally by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n", + "all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path).aggregate(\n", + " directoryName=pd.NamedAgg(column=\"directoryName\", aggfunc=lambda names: '/'.join(names)),\n", + " directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n", + " directoryPath=pd.NamedAgg(column=\"directoryPath\", aggfunc=\"last\"),\n", + ")\n", + "# Reorder the column positions so that the directory path is again the first column. \n", + "all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]\n", + "\n", + "# Debug\n", + "# display(\"6. 
grouped by all except for directory path, name and parent columns (max) ----------------------\")\n", + "# display(git_files_with_commit_statistics)" + ] + }, + { + "cell_type": "markdown", + "id": "114f8d4b", + "metadata": {}, + "source": [ + "### Data Preview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0c2d06", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053b448d", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics.head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "ccc11f52", + "metadata": {}, + "source": [ + "### Number of files per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc0dc138", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " values = git_files_with_commit_statistics['fileCount'],\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Directories and their file count'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "e93d944a", + "metadata": {}, + "source": [ + "### Most frequent file extension per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0147c747", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics_and_file_extension_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_extensions, \n", + " left_on='mostFrequentFileExtension', \n", + " right_on=\"fileExtension\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_file_extension_rank, 'fileExtension', 'fileExtensionCountRank')\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Most frequent file extension per directory'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "e98ca7b1", + "metadata": {}, + "source": [ + "### Number of commits per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b05c773", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_count_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"commitCount\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_count_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_count_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_count_per_directory['commitCount_limited'], \n", + " colorbar=dict(title=\"Commits\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Number of git commits',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "def69b07", + "metadata": {}, + "source": [ + "### Number of distinct authors per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baeb97f5", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_authors_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_authors_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_authors_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_authors_per_directory['authorCount_limited'], \n", + " colorbar=dict(title=\"Authors\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Number of distinct commit authors',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "5dbceaef", + "metadata": {}, + "source": [ + "### Main author per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e97c0d87", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics_and_main_author_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_authors, \n", + " left_on='mainAuthor', \n", + " right_on=\"author\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_main_author_rank, 'mainAuthor', 'authorCommitCountRank')\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Main authors with the highest number of commits'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "349a1d03", + "metadata": {}, + "source": [ + "### Second author per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29484f84", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics_and_second_author_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_authors, \n", + " left_on='secondAuthor', \n", + " right_on=\"author\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_second_author_rank, 'secondAuthor', 'authorCommitCountRank')\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Second author with the second highest number of commits'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "0ed919b0", + "metadata": {}, + "source": [ + "### Days since last commit per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6929154", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_commit_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastCommit\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " #values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_limited'], \n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last commit',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "a06f6d20", + "metadata": {}, + "source": [ + "### Days since last commit per directory (ranked)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "720aa99e", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_commit_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCommit\")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'], \n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last commit',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "4ebf96f7", + "metadata": {}, + "source": [ + "### Days since last file creation per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0de46c2b", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_creation_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastCreation\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_limited'], \n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last file creation',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "772eab2a", + "metadata": {}, + "source": [ + "### Days since last file creation per directory (ranked)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83d918ee", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCreation\")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'], \n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last file creation',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "e34c46d5", + "metadata": {}, + "source": [ + "### Days since last file modification per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "423fdb2c", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_modification_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastModification\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_limited'], \n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last file modification',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "7abc96e4", + "metadata": {}, + "source": [ + "### Days since last file modification per directory (ranked)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c33849", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastModification\")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", + " # Without values, many more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'], \n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last file modification',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "d8c6ccee", + "metadata": {}, + "source": [ + "## File count per commit\n", + "\n", + "Shows how many commits changed one file, how many changed two files, and so on.\n", + "The chart is limited to the first 30 entries for improved readability.\n", + "The data preview also includes overall statistics, including the number of commits that are filtered out in the chart."
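The same distribution can be illustrated with plain pandas, independent of Neo4j. A minimal sketch with made-up data, mirroring what the Cypher query above computes (`sha` and `relativePath` follow the property names used in the query):

```python
# Illustrative sketch with made-up data: derive the "files per commit"
# distribution from a flat table of (commit sha, changed file) pairs.
import pandas as pd

changes = pd.DataFrame({
    "sha":          ["a1", "a1", "b2", "c3", "c3", "c3"],
    "relativePath": ["README.md", "main.py", "main.py", "a.py", "b.py", "c.py"],
})
files_per_commit = changes.groupby("sha")["relativePath"].nunique()
distribution = files_per_commit.value_counts().sort_index()
print(distribution)  # index = files changed per commit, value = number of commits
```

Counting distinct file paths per commit before counting commits per group matches the two-step `WITH`/`RETURN` aggregation of the Cypher query.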
+ ] + }, + { + "cell_type": "markdown", + "id": "ed53b6e5", + "metadata": {}, + "source": [ + "### Preview data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5526e458", + "metadata": {}, + "outputs": [], + "source": [ + "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n", + "\n", + "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n", + "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n", + "display(git_file_count_per_commit.describe())\n", + "display(git_file_count_per_commit.head(30))" + ] + }, + { + "cell_type": "markdown", + "id": "dcea826e", + "metadata": {}, + "source": [ + "### Bar chart with the number of files per commit distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e9dbc57", + "metadata": {}, + "outputs": [], + "source": [ + "if git_file_count_per_commit.empty:\n", + " print(\"No data to plot\")\n", + "else:\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n", + " x=git_file_count_per_commit['filesPerCommit'].head(30), \n", + " y=git_file_count_per_commit['commitCount'].head(30)),\n", + " )\n", + " figure.update_layout(\n", + " **plotly_bar_layout_base_settings,\n", + " title='Changed files per commit',\n", + " xaxis_title='file count',\n", + " yaxis_title='commit count'\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "14e87aff", + "metadata": {}, + "source": [ + "## WordCloud of git authors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f68f02", + "metadata": {}, + "outputs": [], + "source": [ + "# Query data from graph database\n", + "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n", + "\n", + "git_author_words_with_frequency.sort_values(by='frequency', ascending=False).reset_index(drop=True).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d83ce5f4", + "metadata": {}, + "outputs": [], + "source": [ + "from wordcloud import WordCloud\n", + "import matplotlib.pyplot as plot\n", + "\n", + "if not git_author_words_with_frequency.empty:\n", + " # Expects the first column of the DataFrame to contain the words/text and the second column to contain the count/frequency.\n", + " words_with_frequency_dict=git_author_words_with_frequency.set_index(git_author_words_with_frequency.columns[0]).to_dict()[git_author_words_with_frequency.columns[1]]\n", + " wordcloud = WordCloud(\n", + " width=800, \n", + " height=800,\n", + " max_words=600, \n", + " collocations=False,\n", + " background_color='white', \n", + " colormap='viridis'\n", + " ).generate_from_frequencies(words_with_frequency_dict)\n", + "\n", + " # Plot the word cloud\n", + " plot.figure(figsize=(15,15))\n", + " plot.imshow(wordcloud, interpolation='bilinear')\n", + " plot.axis(\"off\")\n", + " plot.title('Wordcloud of git authors')\n", + " plot.show()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateGitHistory", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, 
+ "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Git History Charts with Neo4j" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/InternalDependenciesJava.ipynb b/jupyter/InternalDependenciesJava.ipynb index 13c9b59c0..83a21b3b3 100644 --- a/jupyter/InternalDependenciesJava.ipynb +++ b/jupyter/InternalDependenciesJava.ipynb @@ -50,34 +50,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c09da482", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -639,7 +630,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/InternalDependenciesTypescript.ipynb b/jupyter/InternalDependenciesTypescript.ipynb index 83ef138f8..879ca767f 100644 --- a/jupyter/InternalDependenciesTypescript.ipynb +++ b/jupyter/InternalDependenciesTypescript.ipynb @@ -50,34 +50,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' 
'.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3646d7", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -481,7 +472,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/MethodMetricsJava.ipynb b/jupyter/MethodMetricsJava.ipynb index f543149f4..acba2edca 100644 --- a/jupyter/MethodMetricsJava.ipynb +++ b/jupyter/MethodMetricsJava.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -467,7 +460,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaMethods", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -481,7 +474,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/NodeEmbeddingsJava.ipynb b/jupyter/NodeEmbeddingsJava.ipynb index 071b28efc..a366c528c 100644 --- a/jupyter/NodeEmbeddingsJava.ipynb +++ b/jupyter/NodeEmbeddingsJava.ipynb @@ -98,28 +98,14 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = 
driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd1d9775", - "metadata": {}, - "outputs": [], - "source": [ + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", @@ -492,7 +478,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -506,7 +492,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/NodeEmbeddingsTypescript.ipynb b/jupyter/NodeEmbeddingsTypescript.ipynb index e7b3b5df9..906234573 100644 --- a/jupyter/NodeEmbeddingsTypescript.ipynb +++ b/jupyter/NodeEmbeddingsTypescript.ipynb @@ -98,28 +98,14 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd1d9775", - "metadata": {}, - "outputs": [], - "source": [ + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", @@ -495,7 +481,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -509,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/ObjectOrientedDesignMetricsJava.ipynb b/jupyter/ObjectOrientedDesignMetricsJava.ipynb index a3e32b78b..1f1320523 100644 --- a/jupyter/ObjectOrientedDesignMetricsJava.ipynb +++ b/jupyter/ObjectOrientedDesignMetricsJava.ipynb @@ -54,34 +54,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypher_file_name: str):\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", " with open(cypher_file_name) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, 
limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013395f1", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -641,7 +632,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -655,7 +646,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb index e6d93acb5..9c381210d 100644 --- a/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb +++ b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb @@ -54,34 +54,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypher_file_name: str):\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", " with open(cypher_file_name) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013395f1", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -476,7 +467,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -490,7 +481,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/OverviewGeneral.ipynb b/jupyter/OverviewGeneral.ipynb index 92657b2c4..9a4843e9d 100644 --- a/jupyter/OverviewGeneral.ipynb +++ b/jupyter/OverviewGeneral.ipynb @@ -51,22 +51,20 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", - " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Executes the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] @@ -610,7 +608,7 @@ } ], "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -624,7 +622,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Graph Metrics" }, diff --git a/jupyter/OverviewJava.ipynb b/jupyter/OverviewJava.ipynb index cd1db0c00..0ace9dd72 100644 --- a/jupyter/OverviewJava.ipynb +++ b/jupyter/OverviewJava.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -552,7 +545,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaTypes", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -566,7 +559,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Overview for Java" }, diff --git a/jupyter/OverviewTypescript.ipynb b/jupyter/OverviewTypescript.ipynb index 5bfefe77c..c868fbcec 100644 --- a/jupyter/OverviewTypescript.ipynb +++ b/jupyter/OverviewTypescript.ipynb @@ -52,16 +52,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -416,7 +409,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -430,7 +423,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Overview for Typescript" }, diff --git a/jupyter/PathFindingJava.ipynb b/jupyter/PathFindingJava.ipynb index 7b8993b61..427e02b27 100644 --- a/jupyter/PathFindingJava.ipynb +++ b/jupyter/PathFindingJava.ipynb @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "d19447e1", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "807bba03", "metadata": {}, "outputs": [], @@ 
-146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "648e2a5a", "metadata": {}, "outputs": [], @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": null, "id": "e49ca888", "metadata": {}, "outputs": [], @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -183,23 +183,16 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -207,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "7d2e62d6", "metadata": {}, "outputs": [], @@ -242,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": null, "id": "3f2e905c", "metadata": {}, "outputs": [], @@ -258,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "id": "d2d60597", "metadata": {}, "outputs": [], @@ -291,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "id": "5ef848fd", "metadata": {}, "outputs": [], @@ -310,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "id": "a1c433f7", "metadata": {}, "outputs": [], @@ -333,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "id": "036264ca", "metadata": {}, "outputs": [], @@ -372,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "de2e71ce", "metadata": {}, "outputs": [], @@ -390,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "id": "27a583e9", "metadata": {}, "outputs": [], @@ -432,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "id": "e63ddc97", "metadata": {}, "outputs": [], @@ -484,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "id": "7243fbfd", "metadata": {}, "outputs": [], @@ -529,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "5262a4ea", "metadata": {}, "outputs": [], @@ -605,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "1ecc41b1", "metadata": {}, "outputs": [], @@ -619,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "id": "0b637ce2", "metadata": {}, "outputs": [], @@ -660,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "id": "62f50f28", "metadata": {}, "outputs": [], @@ -840,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "id": "7b90cfbc", "metadata": {}, "outputs": [], @@ -925,7 +918,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": null, "id": "8b43628b", "metadata": {}, "outputs": [], @@ 
-1106,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": null, "id": "9765cec6", "metadata": {}, "outputs": [], @@ -1197,7 +1190,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "id": "cec6c79b", "metadata": {}, "outputs": [], @@ -1211,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "deda506f", "metadata": {}, "outputs": [], @@ -1251,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "id": "b0cdf8c5", "metadata": {}, "outputs": [], @@ -1359,7 +1352,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "29e4e8f2", "metadata": {}, "outputs": [], @@ -1522,7 +1515,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -1536,7 +1529,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Path finding algorithms for Java package and artifact dependencies with Neo4j" }, diff --git a/jupyter/PathFindingTypescript.ipynb b/jupyter/PathFindingTypescript.ipynb index 14cfb725f..36c97e37e 100644 --- a/jupyter/PathFindingTypescript.ipynb +++ b/jupyter/PathFindingTypescript.ipynb @@ -190,16 +190,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1572,7 +1565,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Path finding algorithms for Typescript module dependencies with Neo4j" }, diff --git a/jupyter/VisibilityMetricsJava.ipynb b/jupyter/VisibilityMetricsJava.ipynb index 3b42c7be2..9f0530869 100644 --- a/jupyter/VisibilityMetricsJava.ipynb +++ b/jupyter/VisibilityMetricsJava.ipynb @@ -92,18 +92,11 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" diff --git a/jupyter/VisibilityMetricsTypescript.ipynb b/jupyter/VisibilityMetricsTypescript.ipynb index 4dc9e3186..badfd1111 100644 --- a/jupyter/VisibilityMetricsTypescript.ipynb +++ b/jupyter/VisibilityMetricsTypescript.ipynb @@ -92,9 +92,14 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - 
" with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" + "def get_cypher_query_from_file(cypher_file_name):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + " \n", + "\n", + "def query_cypher_to_data_frame(filename):\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, { @@ -103,11 +108,7 @@ "id": "59310f6f", "metadata": {}, "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] + "source": [] }, { "cell_type": "code", @@ -420,7 +421,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -434,7 +435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Visibility Metrics for Typescript" }, diff --git a/jupyter/Wordcloud.ipynb b/jupyter/Wordcloud.ipynb index f84f03d5e..7c0330e40 100644 --- a/jupyter/Wordcloud.ipynb +++ b/jupyter/Wordcloud.ipynb @@ -46,23 +46,16 @@ }, { "cell_type": "code", - "execution_count": 249, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 250, - "id": "6e8772aa", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename: str, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -270,7 +263,7 @@ } ], "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -284,7 +277,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/environment.yml b/jupyter/environment.yml index cf6bcdd76..e0ce499d9 100644 --- a/jupyter/environment.yml +++ b/jupyter/environment.yml @@ -1,6 +1,7 @@ name: codegraph channels: - conda-forge/label/python_rc # Needed on Mac since Python >= 3.12 + - plotly - conda-forge dependencies: - python=3.12.* @@ -17,5 +18,7 @@ dependencies: - opentsne=1.0.* # to visualize node embeddings in 2D (t-SNE dimensionality reduction) - wordcloud=1.9.* - monotonic=1.* + - plotly=6.0.* + - python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries. 
- pip: - neo4j==5.23.* \ No newline at end of file diff --git a/scripts/downloader/downloadAxonFramework.sh b/scripts/downloader/downloadAxonFramework.sh index 219700cb8..63a9ee86a 100755 --- a/scripts/downloader/downloadAxonFramework.sh +++ b/scripts/downloader/downloadAxonFramework.sh @@ -57,6 +57,6 @@ source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g "${ARTIFACTS_GROUP}" -a "axo # This makes it possible to additionally import the git log into the graph if [ ! -d "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" ]; then echo "download${ANALYSIS_NAME}: Getting bare git history of source code repository..." - git clone --bare https://github.com/AxonFramework/AxonFramework.git --branch "axon-${ARTIFACTS_VERSION}" "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" + git clone --bare https://github.com/AxonFramework/AxonFramework.git --branch "axon-${ARTIFACTS_VERSION}" --single-branch "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" fi ################################################################ \ No newline at end of file diff --git a/scripts/downloader/downloadTypescriptProject.sh b/scripts/downloader/downloadTypescriptProject.sh index bba26ebf2..70f02f10e 100755 --- a/scripts/downloader/downloadTypescriptProject.sh +++ b/scripts/downloader/downloadTypescriptProject.sh @@ -115,7 +115,7 @@ if [ ! -d "${fullSourceDirectory}" ]; then # source doesn't exist if [ -n "${cloneUrl}" ]; then # only clone if url is specified and source doesn't exist echo "downloadTypescriptProject: Cloning ${cloneUrl} with version ${projectVersion}..." # A full clone is done since not only the source is scanned, but also the git log/history. - git clone --branch "${projectTag}" "${cloneUrl}" "${fullSourceDirectory}" + git clone --branch "${projectTag}" --single-branch "${cloneUrl}" "${fullSourceDirectory}" else # Source doesn't exist and no clone URL is specified. echo "downloadTypescriptProject: Error: Source directory ${fullSourceDirectory} for project ${projectName} not found." diff --git a/scripts/executeJupyterNotebook.sh b/scripts/executeJupyterNotebook.sh index 0551b198d..476d1e5e6 100755 --- a/scripts/executeJupyterNotebook.sh +++ b/scripts/executeJupyterNotebook.sh @@ -88,8 +88,9 @@ source "${SCRIPTS_DIR}/activateCondaEnvironment.sh" jupyter --version || exit 1 # Execute the Jupyter Notebook and write it to the output file name +# The environment variable NBCONVERT is set so that the Jupyter Notebook can detect that it is being executed from the command line. echo "executeJupyterNotebook: Executing Jupyter Notebook ${jupyter_notebook_output_file_name}..." -jupyter nbconvert --to notebook \ +NBCONVERT=true jupyter nbconvert --to notebook \ --execute "${jupyter_notebook_file}" \ --output "$jupyter_notebook_output_file_name" \ --output-dir="./" \ @@ -107,7 +108,8 @@ mv -f "${jupyter_notebook_markdown_file}.nostyle" "${jupyter_notebook_markdown_f echo "executeJupyterNotebook: Successfully created Markdown ${jupyter_notebook_markdown_file}.." # Convert the Jupyter Notebook to PDF +# The environment variable NBCONVERT is set so that the Jupyter Notebook can detect that it is being executed from the command line. 
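The diff itself does not show how the notebooks consume this flag, so the following is only a minimal sketch of what such a check could look like inside a notebook cell, assuming `NBCONVERT` is set to `true` exactly as in the script above:

```python
import os

# NBCONVERT=true is only set by executeJupyterNotebook.sh (see the diff above);
# in an interactive session the variable is absent and the check yields False.
is_command_line_execution = os.environ.get("NBCONVERT", "false").lower() == "true"

if is_command_line_execution:
    # e.g. prefer static image rendering over interactive widgets
    print("Notebook executed via nbconvert from the command line.")
```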
if [ -n "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" ]; then - jupyter nbconvert --to webpdf --no-input --allow-chromium-download --disable-chromium-sandbox "$jupyter_notebook_output_file" + NBCONVERT=true jupyter nbconvert --to webpdf --no-input --allow-chromium-download --disable-chromium-sandbox "$jupyter_notebook_output_file" echo "executeJupyterNotebook: Successfully created PDF ${jupyter_notebook_output_file}." fi \ No newline at end of file