From ff9ad6ec9136cc0e2c709811bff3edd1758ce1b7 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 25 Feb 2025 21:32:19 +0100 Subject: [PATCH 1/9] Filter out all git branches except for the selected one --- COMMANDS.md | 2 +- GETTING_STARTED.md | 2 +- scripts/downloader/downloadAxonFramework.sh | 2 +- scripts/downloader/downloadTypescriptProject.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/COMMANDS.md b/COMMANDS.md index a785a91ba..62d6c8ef0 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -264,7 +264,7 @@ Here is the resulting schema: #### Parameter -The optional parameter `--source directory-path-to-the-source-folder-containing-git-repositories` can be used to select a different directory for the repositories. By default, the `source` directory within the analysis workspace directory is used. This command only needs the git history to be present. Therefore, `git clone --bare` is sufficient. If the `source` directory is also used for code analysis (like for Typescript) then a full git clone is of course needed. +The optional parameter `--source directory-path-to-the-source-folder-containing-git-repositories` can be used to select a different directory for the repositories. By default, the `source` directory within the analysis workspace directory is used. This command only needs the git history to be present. Therefore, `git clone --bare` is sufficient. If the `source` directory is also used for code analysis (like for Typescript) then a full git clone is of course needed. Additionally, if you want to focus on a specific version or branch, use `--branch branch-name` to checkout the branch and `--single-branch` to exclude other branches before importing the git log data. #### Environment Variable diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index 5fb5dd58d..9f138d666 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -66,7 +66,7 @@ Use these optional command line options as needed: - If you want to analyze Typescript code, create a symbolic link inside the `source` directory that points to the Typescript project. Alternatively you can also copy the project into the `source` directory. -- If you want to include git data like changed files and authors, create a symbolic link inside the `source` directory that points to the repository or clone it into the `source` directory. If you already have your Typescript project in there, you of course don't have to do it twice. If you are analyzing Java artifacts (full source not needed), it is sufficient to use a bare clone that only contains the git history without the sources using `git clone --bare`. +- If you want to include git data like changed files and authors, create a symbolic link inside the `source` directory that points to the repository or clone it into the `source` directory. If you already have your Typescript project in there, you of course don't have to do it twice. If you are analyzing Java artifacts (full source not needed), it is sufficient to use a bare clone that only contains the git history without the sources using `git clone --bare`. If you want to focus on one branch, use `--branch branch-name` to checkout the branch and `--single-branch` to only fetch the history of that branch. 
- Alternatively to the steps above, run an already predefined download script diff --git a/scripts/downloader/downloadAxonFramework.sh b/scripts/downloader/downloadAxonFramework.sh index 219700cb8..63a9ee86a 100755 --- a/scripts/downloader/downloadAxonFramework.sh +++ b/scripts/downloader/downloadAxonFramework.sh @@ -57,6 +57,6 @@ source "${SCRIPTS_DIR}/downloadMavenArtifact.sh" -g "${ARTIFACTS_GROUP}" -a "axo # This makes it possible to additionally import the git log into the graph if [ ! -d "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" ]; then echo "download${ANALYSIS_NAME}: Getting bare git history of source code repository..." - git clone --bare https://github.com/AxonFramework/AxonFramework.git --branch "axon-${ARTIFACTS_VERSION}" "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" + git clone --bare https://github.com/AxonFramework/AxonFramework.git --branch "axon-${ARTIFACTS_VERSION}" --single-branch "${SOURCE_DIRECTORY}/AxonFramework-${ARTIFACTS_VERSION}/.git" fi ################################################################ \ No newline at end of file diff --git a/scripts/downloader/downloadTypescriptProject.sh b/scripts/downloader/downloadTypescriptProject.sh index bba26ebf2..70f02f10e 100755 --- a/scripts/downloader/downloadTypescriptProject.sh +++ b/scripts/downloader/downloadTypescriptProject.sh @@ -115,7 +115,7 @@ if [ ! -d "${fullSourceDirectory}" ]; then # source doesn't exist if [ -n "${cloneUrl}" ]; then # only clone if url is specified and source doesn't exist echo "downloadTypescriptProject: Cloning ${cloneUrl} with version ${projectVersion}..." # A full clone is done since not only the source is scanned, but also the git log/history. - git clone --branch "${projectTag}" "${cloneUrl}" "${fullSourceDirectory}" + git clone --branch "${projectTag}" --single-branch "${cloneUrl}" "${fullSourceDirectory}" else # Source doesn't exist and no clone URL is specified. echo "downloadTypescriptProject: Error: Source directory ${fullSourceDirectory} for project ${projectName} not found." 
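For illustration, the combination described above — a bare clone restricted to a single branch — might look like the following sketch. The repository URL, branch name, and target directory are placeholders, not values taken from the scripts above.

```shell
# Fetch only the git history (no working tree) of one branch.
# URL, branch name, and target directory are placeholder values.
git clone --bare \
    --branch main --single-branch \
    https://github.com/your-organization/your-repository.git \
    ./source/your-repository/.git
```

Used together, `--bare` limits the download to the history and `--single-branch` restricts it to the selected branch, which is what the updated download scripts above now do.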
From 9e8fd8461b6f63e741876db692c9c9ed5e2af42e Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:39:30 +0100 Subject: [PATCH 2/9] Fix default query limit 10_000 and use -1 for unlimited --- jupyter/ExternalDependenciesJava.ipynb | 19 ++--- jupyter/ExternalDependenciesTypescript.ipynb | 17 ++--- jupyter/InternalDependenciesJava.ipynb | 45 +++++------- jupyter/InternalDependenciesTypescript.ipynb | 45 +++++------- jupyter/MethodMetricsJava.ipynb | 17 ++--- jupyter/NodeEmbeddingsJava.ipynb | 30 +++----- jupyter/NodeEmbeddingsTypescript.ipynb | 30 +++----- jupyter/ObjectOrientedDesignMetricsJava.ipynb | 45 +++++------- ...bjectOrientedDesignMetricsTypescript.ipynb | 45 +++++------- jupyter/OverviewGeneral.ipynb | 32 ++++----- jupyter/OverviewJava.ipynb | 17 ++--- jupyter/OverviewTypescript.ipynb | 17 ++--- jupyter/PathFindingJava.ipynb | 71 +++++++++---------- jupyter/PathFindingTypescript.ipynb | 15 ++-- jupyter/VisibilityMetricsJava.ipynb | 17 ++--- jupyter/VisibilityMetricsTypescript.ipynb | 21 +++--- jupyter/Wordcloud.ipynb | 19 ++--- 17 files changed, 187 insertions(+), 315 deletions(-) diff --git a/jupyter/ExternalDependenciesJava.ipynb b/jupyter/ExternalDependenciesJava.ipynb index b35645241..0c533f790 100644 --- a/jupyter/ExternalDependenciesJava.ipynb +++ b/jupyter/ExternalDependenciesJava.ipynb @@ -44,23 +44,16 @@ }, { "cell_type": "code", - "execution_count": 235, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1735,7 +1728,7 @@ "celltoolbar": "Tags", "code_graph_analysis_pipeline_data_validation": "ValidateJavaExternalDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -1749,7 +1742,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "External Dependencies for Java" }, diff --git a/jupyter/ExternalDependenciesTypescript.ipynb b/jupyter/ExternalDependenciesTypescript.ipynb index a6d80e822..c8de0f004 100644 --- a/jupyter/ExternalDependenciesTypescript.ipynb +++ b/jupyter/ExternalDependenciesTypescript.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1638,7 +1631,7 @@ "celltoolbar": "Tags", "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" 
}, @@ -1652,7 +1645,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "External Dependencies for Typescript" }, diff --git a/jupyter/InternalDependenciesJava.ipynb b/jupyter/InternalDependenciesJava.ipynb index 13c9b59c0..83a21b3b3 100644 --- a/jupyter/InternalDependenciesJava.ipynb +++ b/jupyter/InternalDependenciesJava.ipynb @@ -50,34 +50,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c09da482", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -639,7 +630,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/InternalDependenciesTypescript.ipynb b/jupyter/InternalDependenciesTypescript.ipynb index 83ef138f8..879ca767f 100644 --- a/jupyter/InternalDependenciesTypescript.ipynb +++ b/jupyter/InternalDependenciesTypescript.ipynb @@ -50,34 +50,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def 
query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3646d7", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -481,7 +472,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/MethodMetricsJava.ipynb b/jupyter/MethodMetricsJava.ipynb index f543149f4..acba2edca 100644 --- a/jupyter/MethodMetricsJava.ipynb +++ b/jupyter/MethodMetricsJava.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -467,7 +460,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaMethods", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -481,7 +474,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/NodeEmbeddingsJava.ipynb b/jupyter/NodeEmbeddingsJava.ipynb index 071b28efc..a366c528c 100644 --- a/jupyter/NodeEmbeddingsJava.ipynb +++ b/jupyter/NodeEmbeddingsJava.ipynb @@ -98,28 +98,14 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", - " return 
pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd1d9775", - "metadata": {}, - "outputs": [], - "source": [ + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", @@ -492,7 +478,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -506,7 +492,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/NodeEmbeddingsTypescript.ipynb b/jupyter/NodeEmbeddingsTypescript.ipynb index e7b3b5df9..906234573 100644 --- a/jupyter/NodeEmbeddingsTypescript.ipynb +++ b/jupyter/NodeEmbeddingsTypescript.ipynb @@ -98,28 +98,14 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd1d9775", - "metadata": {}, - "outputs": [], - "source": [ + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", "def query_first_non_empty_cypher_to_data_frame(*filenames : str, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", @@ -495,7 +481,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -509,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/ObjectOrientedDesignMetricsJava.ipynb b/jupyter/ObjectOrientedDesignMetricsJava.ipynb index a3e32b78b..1f1320523 100644 --- a/jupyter/ObjectOrientedDesignMetricsJava.ipynb +++ b/jupyter/ObjectOrientedDesignMetricsJava.ipynb @@ -54,34 +54,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypher_file_name: str):\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", " with open(cypher_file_name) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " return 
' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013395f1", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -641,7 +632,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -655,7 +646,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb index e6d93acb5..9c381210d 100644 --- a/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb +++ b/jupyter/ObjectOrientedDesignMetricsTypescript.ipynb @@ -54,34 +54,25 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypher_file_name: str):\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", " with open(cypher_file_name) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013395f1", - "metadata": {}, - "outputs": [], - "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -476,7 +467,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -490,7 +481,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, diff --git a/jupyter/OverviewGeneral.ipynb b/jupyter/OverviewGeneral.ipynb index 92657b2c4..9a4843e9d 100644 --- a/jupyter/OverviewGeneral.ipynb +++ b/jupyter/OverviewGeneral.ipynb @@ -51,22 +51,20 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", - " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] @@ -610,7 +608,7 @@ } ], "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -624,7 +622,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Graph Metrics" }, diff --git a/jupyter/OverviewJava.ipynb b/jupyter/OverviewJava.ipynb index cd1db0c00..0ace9dd72 100644 --- a/jupyter/OverviewJava.ipynb +++ b/jupyter/OverviewJava.ipynb @@ -51,16 +51,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -552,7 +545,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaTypes", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -566,7 +559,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Overview for Java" }, diff --git a/jupyter/OverviewTypescript.ipynb b/jupyter/OverviewTypescript.ipynb index 5bfefe77c..c868fbcec 100644 --- a/jupyter/OverviewTypescript.ipynb +++ b/jupyter/OverviewTypescript.ipynb @@ -52,16 +52,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -416,7 +409,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -430,7 +423,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Overview for Typescript" }, diff --git a/jupyter/PathFindingJava.ipynb b/jupyter/PathFindingJava.ipynb index 7b8993b61..427e02b27 100644 --- a/jupyter/PathFindingJava.ipynb +++ b/jupyter/PathFindingJava.ipynb @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "d19447e1", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "807bba03", "metadata": {}, "outputs": [], @@ 
-146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "648e2a5a", "metadata": {}, "outputs": [], @@ -157,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": null, "id": "e49ca888", "metadata": {}, "outputs": [], @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, "id": "1c5dab37", "metadata": {}, "outputs": [], @@ -183,23 +183,16 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -207,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "id": "7d2e62d6", "metadata": {}, "outputs": [], @@ -242,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": null, "id": "3f2e905c", "metadata": {}, "outputs": [], @@ -258,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "id": "d2d60597", "metadata": {}, "outputs": [], @@ -291,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "id": "5ef848fd", "metadata": {}, "outputs": [], @@ -310,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "id": "a1c433f7", "metadata": {}, "outputs": [], @@ -333,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "id": "036264ca", "metadata": {}, "outputs": [], @@ -372,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "de2e71ce", "metadata": {}, "outputs": [], @@ -390,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "id": "27a583e9", "metadata": {}, "outputs": [], @@ -432,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "id": "e63ddc97", "metadata": {}, "outputs": [], @@ -484,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "id": "7243fbfd", "metadata": {}, "outputs": [], @@ -529,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "id": "5262a4ea", "metadata": {}, "outputs": [], @@ -605,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": null, "id": "1ecc41b1", "metadata": {}, "outputs": [], @@ -619,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "id": "0b637ce2", "metadata": {}, "outputs": [], @@ -660,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "id": "62f50f28", "metadata": {}, "outputs": [], @@ -840,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "id": "7b90cfbc", "metadata": {}, "outputs": [], @@ -925,7 +918,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": null, "id": "8b43628b", "metadata": {}, "outputs": [], @@ 
-1106,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": null, "id": "9765cec6", "metadata": {}, "outputs": [], @@ -1197,7 +1190,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "id": "cec6c79b", "metadata": {}, "outputs": [], @@ -1211,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "deda506f", "metadata": {}, "outputs": [], @@ -1251,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "id": "b0cdf8c5", "metadata": {}, "outputs": [], @@ -1359,7 +1352,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "29e4e8f2", "metadata": {}, "outputs": [], @@ -1522,7 +1515,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateJavaPackageDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -1536,7 +1529,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Path finding algorithms for Java package and artifact dependencies with Neo4j" }, diff --git a/jupyter/PathFindingTypescript.ipynb b/jupyter/PathFindingTypescript.ipynb index 14cfb725f..36c97e37e 100644 --- a/jupyter/PathFindingTypescript.ipynb +++ b/jupyter/PathFindingTypescript.ipynb @@ -190,16 +190,9 @@ "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + " \n", + "\n", "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -1572,7 +1565,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Path finding algorithms for Typescript module dependencies with Neo4j" }, diff --git a/jupyter/VisibilityMetricsJava.ipynb b/jupyter/VisibilityMetricsJava.ipynb index 3b42c7be2..9f0530869 100644 --- a/jupyter/VisibilityMetricsJava.ipynb +++ b/jupyter/VisibilityMetricsJava.ipynb @@ -92,18 +92,11 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - " with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" diff --git a/jupyter/VisibilityMetricsTypescript.ipynb b/jupyter/VisibilityMetricsTypescript.ipynb index 4dc9e3186..badfd1111 100644 --- a/jupyter/VisibilityMetricsTypescript.ipynb +++ b/jupyter/VisibilityMetricsTypescript.ipynb @@ -92,9 +92,14 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(cypherFileName):\n", - 
" with open(cypherFileName) as file:\n", - " return ' '.join(file.readlines())" + "def get_cypher_query_from_file(cypher_file_name):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())\n", + " \n", + "\n", + "def query_cypher_to_data_frame(filename):\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, { @@ -103,11 +108,7 @@ "id": "59310f6f", "metadata": {}, "outputs": [], - "source": [ - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] + "source": [] }, { "cell_type": "code", @@ -420,7 +421,7 @@ ], "code_graph_analysis_pipeline_data_validation": "ValidateTypescriptModuleDependencies", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -434,7 +435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Visibility Metrics for Typescript" }, diff --git a/jupyter/Wordcloud.ipynb b/jupyter/Wordcloud.ipynb index f84f03d5e..7c0330e40 100644 --- a/jupyter/Wordcloud.ipynb +++ b/jupyter/Wordcloud.ipynb @@ -46,23 +46,16 @@ }, { "cell_type": "code", - "execution_count": 249, + "execution_count": null, "id": "c1db254b", "metadata": {}, "outputs": [], "source": [ "def get_cypher_query_from_file(filename):\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": 250, - "id": "6e8772aa", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename: str, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" @@ -270,7 +263,7 @@ } ], "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "codegraph", "language": "python", "name": "python3" }, @@ -284,7 +277,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.9" }, "title": "Object Oriented Design Quality Metrics for Java with Neo4j" }, From 4b284cc86ed5f013523ab867bb3021e9213363b3 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 10 Feb 2025 21:08:17 +0100 Subject: [PATCH 3/9] Add Python visualization library plotly --- README.md | 1 + jupyter/environment.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index c3295bb51..6bba94412 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an - [numpy](https://numpy.org) - [pandas](https://pandas.pydata.org) - [pip](https://pip.pypa.io/en/stable) + - [plotly](https://plotly.com/python) - [monotonic](https://github.com/atdt/monotonic) - [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver) - [openTSNE](https://github.com/pavlin-policar/openTSNE) diff --git a/jupyter/environment.yml b/jupyter/environment.yml index cf6bcdd76..fdfb57c90 100644 --- a/jupyter/environment.yml +++ b/jupyter/environment.yml @@ -1,6 +1,7 @@ name: codegraph 
channels: - conda-forge/label/python_rc # Needed on Mac since Python >= 3.12 + - plotly - conda-forge dependencies: - python=3.12.* @@ -17,5 +18,6 @@ dependencies: - opentsne=1.0.* # to visualize node embeddings in 2D (t-SNE dimensionality reduction) - wordcloud=1.9.* - monotonic=1.* + - plotly=6.0.* - pip: - neo4j==5.23.* \ No newline at end of file From ab49b40d735509ae2c5569b14f993a58d05c12b8 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 7 Feb 2025 07:35:41 +0100 Subject: [PATCH 4/9] Add git history file overview treemap --- .../GitLog/List_git_files_directories.cypher | 29 + ..._directories_with_commit_statistics.cypher | 66 ++ ...it_statistics_no_joined_directories.cypher | 49 ++ cypher/Validation/ValidateGitHistory.cypher | 5 + jupyter/GitHistoryGeneral.ipynb | 798 ++++++++++++++++++ jupyter/environment.yml | 1 + scripts/executeJupyterNotebook.sh | 6 +- 7 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 cypher/GitLog/List_git_files_directories.cypher create mode 100644 cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher create mode 100644 cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher create mode 100644 cypher/Validation/ValidateGitHistory.cypher create mode 100644 jupyter/GitHistoryGeneral.ipynb diff --git a/cypher/GitLog/List_git_files_directories.cypher b/cypher/GitLog/List_git_files_directories.cypher new file mode 100644 index 000000000..be62779a1 --- /dev/null +++ b/cypher/GitLog/List_git_files_directories.cypher @@ -0,0 +1,29 @@ +// List git file directories and the number of files they contain + + MATCH (git_file:File&Git&!Repository) + WITH * + ,git_file.relativePath AS gitFileName + ,reverse(split(reverse(git_file.relativePath),'/')[0]) AS gitFileNameWithoutPath + ,(git_file:Directory) AS isDirectory + WITH * + ,rtrim(split(gitFileName, gitFileNameWithoutPath)[0], '/') AS gitDirectoryPath + WITH gitDirectoryPath + ,coalesce(nullif(split(gitDirectoryPath, '/')[-2],''), 'root') AS directoryParentName + ,coalesce(nullif(split(gitDirectoryPath, '/')[-1],''), 'root') AS directoryName + ,size(split(gitDirectoryPath, '/')) AS pathLength + ,count(DISTINCT gitFileName) AS fileCount +// Debugging +// ,collect(git_file)[0..4] AS gitFileExamples +// ,collect(gitFileName) AS gitFileNameExamples +// ,collect(gitFileNameWithoutPath) AS gitFileNameWithoutPathExamples + WHERE fileCount > 1 +RETURN gitDirectoryPath + ,directoryParentName + ,directoryName + ,pathLength + ,fileCount +// Debugging +// ,gitFileExamples +// ,gitFileNameExamples +// ,gitFileNameWithoutPathExamples + ORDER BY gitDirectoryPath ASC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher b/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher new file mode 100644 index 000000000..522c50978 --- /dev/null +++ b/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher @@ -0,0 +1,66 @@ +// List git file directories and their statistics + + MATCH (git_file:File&Git&!Repository) + WHERE git_file.deletedAt IS NULL // filter out deleted files + ORDER BY git_file.relativePath + WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch + ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch + ,collect(git_file) AS git_files +UNWIND git_files AS git_file + WITH * + ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp 
+ ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp + WITH *, split(git_file.relativePath, '/') AS pathElements + WITH *, pathElements[-1] AS fileName + MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) + MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file) + WITH pathElements + ,fileCreatedAtTimestamp + ,fileLastModificationAtTimestamp + ,fileName + ,git_file.relativePath AS fileRelativePath + ,max(git_repository.name) AS repository + ,max(git_commit.sha) AS maxCommitSha + ,COUNT(DISTINCT git_commit.sha) AS commitCount + ,COUNT(DISTINCT git_commit.author) AS authorCount + ,date(max(git_commit.date)) AS lastCommitDate +UNWIND pathElements AS pathElement + WITH * + ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent + WITH * + ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory + WHERE pathElement <> fileName + WITH repository AS gitRepositoryName + ,directory AS directoryPath + ,split(directory, '/')[-1] AS directoryName + ,parent AS directoryParentPath + ,split(parent, '/')[-1] AS directoryParentName + ,size(split(directory, '/')) AS directoryPathLength + ,count(DISTINCT fileRelativePath) AS fileCount + ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate + ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate + ,sum(commitCount) AS commitCount + ,sum(authorCount) AS authorCount + ,max(maxCommitSha) AS maxCommitSha + ,max(lastCommitDate) AS lastCommitDate + ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit + ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation + ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification +// The final results are grouped by the statistic values like file count,... 
+RETURN gitRepositoryName + ,fileCount + ,lastCreationDate + ,lastModificationDate + ,commitCount + ,authorCount + ,maxCommitSha + ,lastCommitDate + ,daysSinceLastCommit + ,daysSinceLastCreation + ,daysSinceLastModification + ,collect(directoryPath)[-1] AS directoryPath + ,apoc.text.join(collect(directoryName), '/') AS directoryName + ,collect(directoryParentPath)[0] AS directoryParentPath + ,collect(directoryParentName)[0] AS directoryParentName + ,max(directoryPathLength) AS directoryPathLength + ,count(DISTINCT directoryPath) AS combinedDirectoriesCount \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher b/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher new file mode 100644 index 000000000..d2939dbf9 --- /dev/null +++ b/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher @@ -0,0 +1,49 @@ +// List git file directories and their statistics + + MATCH (git_file:File&Git&!Repository) + WHERE git_file.deletedAt IS NULL // filter out deleted files + ORDER BY git_file.relativePath + WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch + ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch + ,collect(git_file) AS git_files +UNWIND git_files AS git_file + WITH * + ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp + ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp + WITH *, split(git_file.relativePath, '/') AS pathElements + WITH *, pathElements[-1] AS fileName + MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) + MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file) + WITH pathElements + ,fileCreatedAtTimestamp + ,fileLastModificationAtTimestamp + ,fileName + ,git_file.relativePath AS fileRelativePath + ,max(git_repository.name) AS repository + ,COUNT(DISTINCT git_commit.sha) AS commitCount + ,COUNT(DISTINCT git_commit.author) AS authorCount + ,date(max(git_commit.date)) AS lastCommitDate +UNWIND pathElements AS pathElement + WITH * + ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent + WITH * + ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory + WHERE pathElement <> fileName +RETURN repository AS gitRepositoryName + ,directory AS directoryPath + ,split(directory, '/')[-1] AS directoryName + ,parent AS directoryParentPath + ,split(parent, '/')[-1] AS directoryParentName + ,size(split(directory, '/')) AS directoryPathLength + ,count(DISTINCT fileRelativePath) AS fileCount + ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate + ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate + ,sum(commitCount) AS commitCount + ,sum(authorCount) AS authorCount + ,max(lastCommitDate) AS lastCommitDate + ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit + ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation + ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification + // Debugging + //,collect(DISTINCT fileRelativePath)[0..4] AS relativePathExamples + //,collect(DISTINCT fileName)[0..4] AS fileNameExamples diff --git a/cypher/Validation/ValidateGitHistory.cypher b/cypher/Validation/ValidateGitHistory.cypher new file mode 
100644 index 000000000..d2ead1b9b --- /dev/null +++ b/cypher/Validation/ValidateGitHistory.cypher @@ -0,0 +1,5 @@ +// Check if there is at least one Git:Commit pointing to a Git:Change containing a Git:File + + MATCH (commit:Git:Commit)-[:CONTAINS_CHANGE]->(change:Git:Change)-->(file:Git:File) +RETURN commit.sha AS commitSha + LIMIT 1 \ No newline at end of file diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb new file mode 100644 index 000000000..4e760a7c0 --- /dev/null +++ b/jupyter/GitHistoryGeneral.ipynb @@ -0,0 +1,798 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# git log/history\n", + "
\n", + "\n", + "### References\n", + "- [Visualizing Code: Polyglot Notebooks Repository (YouTube)](https://youtu.be/ipOpToPS-PY?si=3doePt2cp-LgEUmt)\n", + "- [gitstractor (GitHub)](https://github.com/IntegerMan/gitstractor)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "pd.options.mode.copy_on_write = True\n", + "\n", + "from neo4j import GraphDatabase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c57aadf9", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from plotly import graph_objects as plotly_graph_objects\n", + "from plotly.subplots import make_subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740e64d9", + "metadata": {}, + "outputs": [], + "source": [ + "# To be able to distinguish between command line execution and Jupyter notebook execution\n", + "# we need to check if the environment variable NBCONVERT is set.\n", + "# The command line execution is required to take care of setting NBCONVERT.\n", + "def is_command_line_execution():\n", + " return 'NBCONVERT' in os.environ\n", + "\n", + "default_renderer = None\n", + "\n", + "if is_command_line_execution():\n", + " print(\"Command line execution (CLI mode): Yes\")\n", + " default_renderer = 'svg' # SVG is the default renderer for static (non interactive) pictures for command line execution\n", + "else:\n", + " print(\"Command line execution (CLI mode): No\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def get_cypher_query_from_file(cypher_file_name : str):\n", + " with open(cypher_file_name) as file:\n", + " return ' '.join(file.readlines())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59310f6f", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", + " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + " cypher_query = get_cypher_query_from_file(filename)\n", + " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " records, summary, keys = driver.execute_query(cypher_query)\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c09da482", + "metadata": {}, + "outputs": [], + "source": [ + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + " \"\"\"\n", + " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", + " If all given file names result in empty results, the last (empty) result will be returned.\n", + " By additionally specifying \"limit=\" the \"LIMIT\" keyword will appended to query so that only the first results get returned.\n", + " \"\"\" \n", + " result=pd.DataFrame()\n", + " for filename in filenames:\n", + " result=query_cypher_to_data_frame(filename, limit)\n", + " if not result.empty:\n", + " return result\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a56670c9", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006b9dc8", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6323e85e", + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas DataFrame Display Configuration\n", + "pd.set_option('display.max_colwidth', 300)" + ] + }, + { + "cell_type": "markdown", + "id": "fe17f2aa", + "metadata": {}, + "source": [ + "## Git History - Directory Commit Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f37df7c", + "metadata": {}, + "outputs": [], + "source": [ + "# The first part provides functions that provide basic functionality for the following parts." 
+ ] + }, + { + "cell_type": "markdown", + "id": "01da524e", + "metadata": {}, + "source": [ + "### Treemap Layout Functions and Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "841967e5", + "metadata": {}, + "outputs": [], + "source": [ + "# Base settings for Plotly Treemap\n", + "\n", + "plotly_treemap_base_settings = dict(\n", + " color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", + " path=['gitRepositoryName', 'directoryParentName', 'directoryName'],\n", + " maxdepth=-1\n", + ")\n", + "plotly_treemap_traces_base_settings = dict(\n", + " root_color=\"lightgrey\",\n", + " textinfo=\"label+value\",\n", + " marker=dict(cornerradius=5),\n", + ")\n", + "plotly_treemap_layout_base_settings = dict(\n", + " margin=dict(t=50, l=15, r=15, b=15),\n", + ")\n", + "plotly_treemap_figure_base_settings = dict(\n", + " renderer=\"svg\" if is_command_line_execution() else None,\n", + " width=1000,\n", + " height=550\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95066d93", + "metadata": {}, + "outputs": [], + "source": [ + "# Common settings for commit statistics of git file directories with Plotly Treemap\n", + "\n", + "plotly_treemap_commit_statistics_settings = dict(\n", + " **plotly_treemap_base_settings,\n", + " custom_data=['commitCount', 'authorCount', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification' , 'directoryPath'],\n", + ")\n", + "# Provide alternative color scale for diverging values\n", + "plotly_treemap_commit_statistics_settings_with_diverging_color_scale = plotly_treemap_commit_statistics_settings.copy()\n", + "plotly_treemap_commit_statistics_settings_with_diverging_color_scale.update({'color_continuous_scale':'RdBu'})\n", + "plotly_treemap_commit_statistics_settings_with_reverse_color_scale = plotly_treemap_commit_statistics_settings.copy()\n", + "plotly_treemap_commit_statistics_settings_with_reverse_color_scale.update({'color_continuous_scale':'Hot'})\n", + "\n", + "plotly_treemap_traces_commit_statistics_settings = dict(\n", + " **plotly_treemap_traces_base_settings,\n", + " hovertemplate='%{label}
<br>Commits: %{customdata[0]}<br>Authors: %{customdata[1]}<br>Last Commit: %{customdata[2]} (%{customdata[3]} days ago)<br>Last Created: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Modified: %{customdata[6]} (%{customdata[7]} days ago)<br>
Path: %{customdata[8]}',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8cc624a", + "metadata": {}, + "outputs": [], + "source": [ + "def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame):\n", + " \"\"\"\n", + " Creates a Plotly Treemap with the given settings and data frame.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " return :plotly_graph_objects.Treemap : The prepared Plotly Treemap\n", + " \"\"\"\n", + " return plotly_graph_objects.Treemap(\n", + " labels=data_frame['directoryName'],\n", + " parents=data_frame['directoryParentPath'],\n", + " ids=data_frame['directoryPath'],\n", + " customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", + " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Last Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>
Path: %{customdata[9]}',\n", + " maxdepth=-1,\n", + " root_color=\"lightgrey\",\n", + " marker=dict(cornerradius=5),\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "acacc415", + "metadata": {}, + "source": [ + "### Data Preparation Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83077395", + "metadata": {}, + "outputs": [], + "source": [ + "def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n", + " \"\"\"\n", + " Limits the values of the given column in the input data frame to the given quantile.\n", + " The values are not filtered out but set to the limited (integer quantile value).\n", + " input_data_frame : pd.DataFrame : The input data frame\n", + " column_name : str : The name of the column to limit\n", + " quantile : float : The quantile to limit the values to (default: 0.95)\n", + " return : pd.DataFrame : The modified dataframe with the added column (column_name + '_limited')\n", + " \"\"\"\n", + " data_frame=input_data_frame.copy()\n", + " column_values = data_frame[column_name]\n", + " column_limit = column_values.quantile(quantile)\n", + " data_frame[column_name + '_limited'] = np.where(column_values > column_limit, column_limit, column_values)\n", + " return data_frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f9060d", + "metadata": {}, + "outputs": [], + "source": [ + "def add_rank_column(input_data_frame : pd.DataFrame, column_name : str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Adds a rank column (\"dense\" mode) to the input data frame based on the given column name.\n", + " input_data_frame : pd.DataFrame : The input data frame\n", + " column_name : str : The name of the column to rank\n", + " return : pd.DataFrame : The modified dataframe with the added rank column\n", + " \"\"\"\n", + " data_frame=input_data_frame.copy()\n", + " data_frame[column_name + '_rank'] = data_frame[column_name].rank(ascending=True, method='dense')\n", + " return data_frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "009a7222", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_out_non_existing_parent_ids(data_frame: pd.DataFrame, parent_column: str, id_column: str):\n", + " \"\"\"\n", + " Filters out all rows with a parent ID where there is no entry in the ID column.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " parent_column : str : The name of the parent column\n", + " id_column : str : The name of the ID column\n", + " return : pd.DataFrame : The filtered data frame\n", + " \"\"\"\n", + " list_of_ids = data_frame[id_column].tolist() + ['']\n", + " # For Debugging\n", + " problems = data_frame[~data_frame[parent_column].isin(list_of_ids)]\n", + " if problems.empty:\n", + " display(\"No problems with non-existing parent IDs found.\")\n", + " else:\n", + " print('\\033[31mFiltered out rows with non-existing parent IDs. 
See the entries in the table below.\\033[0m')\n", + " display(problems)\n", + " return data_frame[data_frame[parent_column].isin(list_of_ids)]\n", + "\n", + "def replace_empty_parent_by_repository_name(data_frame: pd.DataFrame, column_name: str, repository_column_name: str = ''):\n", + " \"\"\"\n", + " Replaces the value 'root' in the given column by the repository name.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " column_name : str : The name of the column\n", + " gitRepositoryName : str : The name of the column that contains the value to be used instead of an empty root\n", + " return : pd.DataFrame : The modified data frame\n", + " \"\"\"\n", + " repository_names = data_frame[repository_column_name]\n", + " data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n", + "\n", + " return data_frame\n", + "\n", + "def prepare_treemap_commit_statistics_data(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " data_frame : pd.DataFrame : The input data frame\n", + " return : pd.DataFrame : The data frame prepared for treemap visualization\n", + " \"\"\"\n", + " prepared_data = data_frame\n", + " prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'directoryPath')\n", + " prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'directoryPath')\n", + " prepared_data = replace_empty_parent_by_repository_name(prepared_data, 'directoryParentPath', 'gitRepositoryName')\n", + " return prepared_data" + ] + }, + { + "cell_type": "markdown", + "id": "0b717f80", + "metadata": {}, + "source": [ + "### Function to split file path levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6581ec23", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Still needed?\n", + "\n", + "def fill_array_to_length(length: int, fill_value=''):\n", + " \"\"\"\n", + " Fills the input array with the given fill value to the given length.\n", + " array : list : The input array\n", + " length : int : The length to fill the array to\n", + " fill_value : any : The value to fill the array with (default: '')\n", + " return : list : The filled array\n", + " \"\"\"\n", + " def fill_array(array: list):\n", + " return array + [fill_value] * (length - len(array))\n", + " return fill_array\n", + "\n", + "def add_file_path_levels(input_dataframe: pd.DataFrame, file_path_column: str, delimiter: str = '/'):\n", + " \"\"\"\n", + " Adds hierarchical levels to a DataFrame based on a file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " delimiter : str : The delimiter used to split the file path (default: '/')\n", + " return : pd.DataFrame : The DataFrame with added hierarchical levels\n", + " \"\"\"\n", + "\n", + " # Get longest path length in the DataFrame\n", + " max_path_length = input_dataframe[file_path_column].str.count(delimiter).max() + 1\n", + "\n", + " # Split the file path column into multiple columns based on the delimiter and align the array to the right so that there are no null leaf nodes with obj.ffill()\n", + " dataframe_split = input_dataframe[file_path_column].str.split(delimiter).apply(fill_array_to_length(max_path_length)).apply(pd.Series)\n", + "\n", + " # Prefix each column in df_split with 'level'\n", + " dataframe_split.columns = [f'level_{i+1}' for i in dataframe_split.columns]\n", + "\n", + " # Join df with df_split\n", + " return 
input_dataframe.copy().join(dataframe_split), dataframe_split.columns.tolist()" + ] + }, + { + "cell_type": "markdown", + "id": "2d0df211", + "metadata": {}, + "source": [ + "### Data Preview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c9de7c5", + "metadata": {}, + "outputs": [], + "source": [ + "git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher\")\n", + "git_file_directories_with_commit_statistics = prepare_treemap_commit_statistics_data(git_file_directories_with_commit_statistics)\n", + "\n", + "# Show a preview of the first 20 directories with the highest file count\n", + "display(\"Data Preview ------------------\")\n", + "git_file_directories_with_commit_statistics.sort_values(by=\"fileCount\", ascending=False).head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "80338c9c", + "metadata": {}, + "source": [ + "### Null Checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f95993e", + "metadata": {}, + "outputs": [], + "source": [ + "# Null values in the DataFrame\n", + "git_file_directories_with_commit_statistics.isnull().sum() " + ] + }, + { + "cell_type": "markdown", + "id": "5262bebf", + "metadata": {}, + "source": [ + "### Check for multiple root directories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10076683", + "metadata": {}, + "outputs": [], + "source": [ + "# Take the dataframe \"git_file_directories_with_commit_statistics\" and find values (=directories) in column \"directoryPath\", that have multiple parents (column \"directoryParentPath\").\n", + "\n", + "# Find directories with multiple parents\n", + "directories_with_multiple_parents = git_file_directories_with_commit_statistics.groupby('directoryPath').filter(lambda x: len(x) > 1)\n", + "directories_with_multiple_parents" + ] + }, + { + "cell_type": "markdown", + "id": "1096811a", + "metadata": {}, + "source": [ + "### TODO solve recursive missing parent directories issues\n", + "`WHERE NOT git_file.relativePath STARTS WITH 'docs/deadlines-guide/modules/ROOT/pages'`" + ] + }, + { + "cell_type": "markdown", + "id": "ccc11f52", + "metadata": {}, + "source": [ + "### Directories by file count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19d108bb", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n", + " values = git_file_directories_with_commit_statistics['fileCount'],\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Directories and their file count'\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "markdown", + "id": "e98ca7b1", + "metadata": {}, + "source": [ + "### Number of commits per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b05c773", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_count_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, \"commitCount\", 0.96)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_count_per_directory),\n", + " values = git_commit_count_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " 
colors=git_commit_count_per_directory['commitCount_limited'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Commits\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Number of git commits',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "markdown", + "id": "def69b07", + "metadata": {}, + "source": [ + "### Number of distinct authors per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baeb97f5", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_authors_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, \"authorCount\", 0.96)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_authors_per_directory),\n", + " values = git_commit_authors_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_authors_per_directory['authorCount_limited'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Authors\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Number of distinct commit authors',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0ed919b0", + "metadata": {}, + "source": [ + "### Days since last commit per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6929154", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_commit_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastCommit\")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", + " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last commit',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "720aa99e", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", + " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last commit',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4ebf96f7", + "metadata": {}, + "source": [ + "### Days since last file creation per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0de46c2b", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastCreation\")\n", + 
"\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", + " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last file creation',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3da7a4f6", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", + " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last file creation',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "markdown", + "id": "e34c46d5", + "metadata": {}, + "source": [ + "### Days since last file modification per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "423fdb2c", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastModification\")\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", + " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Days\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Days since last file modification',\n", + ")\n", + "figure.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c33849", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", + " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " marker=dict(\n", + " cornerradius=5, \n", + " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'], \n", + " colorscale='Hot_r',\n", + " colorbar=dict(title=\"Rank\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Rank of days since last file modification',\n", + ")\n", + "figure.show()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateGitHistory", + "kernelspec": { + 
"display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Git History Charts with Neo4j" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/jupyter/environment.yml b/jupyter/environment.yml index fdfb57c90..e0ce499d9 100644 --- a/jupyter/environment.yml +++ b/jupyter/environment.yml @@ -19,5 +19,6 @@ dependencies: - wordcloud=1.9.* - monotonic=1.* - plotly=6.0.* + - python-kaleido=0.2.* # To render plotly plots. Static image export for web-based visualization libraries. - pip: - neo4j==5.23.* \ No newline at end of file diff --git a/scripts/executeJupyterNotebook.sh b/scripts/executeJupyterNotebook.sh index 0551b198d..476d1e5e6 100755 --- a/scripts/executeJupyterNotebook.sh +++ b/scripts/executeJupyterNotebook.sh @@ -88,8 +88,9 @@ source "${SCRIPTS_DIR}/activateCondaEnvironment.sh" jupyter --version || exit 1 # Execute the Jupyter Notebook and write it to the output file name +# The environment variable NBCONVERT is needed to be able to detect a command line execution in the Jupyter Notebook. echo "executeJupyterNotebook: Executing Jupyter Notebook ${jupyter_notebook_output_file_name}..." -jupyter nbconvert --to notebook \ +NBCONVERT=true jupyter nbconvert --to notebook \ --execute "${jupyter_notebook_file}" \ --output "$jupyter_notebook_output_file_name" \ --output-dir="./" \ @@ -107,7 +108,8 @@ mv -f "${jupyter_notebook_markdown_file}.nostyle" "${jupyter_notebook_markdown_f echo "executeJupyterNotebook: Successfully created Markdown ${jupyter_notebook_markdown_file}.." # Convert the Jupyter Notebook to PDF +# The environment variable NBCONVERT is needed to be able to detect a command line execution in the Jupyter Notebook. if [ -n "${ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION}" ]; then - jupyter nbconvert --to webpdf --no-input --allow-chromium-download --disable-chromium-sandbox "$jupyter_notebook_output_file" + NBCONVERT=true jupyter nbconvert --to webpdf --no-input --allow-chromium-download --disable-chromium-sandbox "$jupyter_notebook_output_file" echo "executeJupyterNotebook: Successfully created PDF ${jupyter_notebook_output_file}." 
fi \ No newline at end of file From bd3159d558fc04adbacc9e1419bfd44c92d90053 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 10 Mar 2025 07:57:40 +0100 Subject: [PATCH 5/9] Use simple git history query enhanced in Python --- .../GitLog/List_git_files_directories.cypher | 29 -- ..._directories_with_commit_statistics.cypher | 66 --- ...it_statistics_no_joined_directories.cypher | 49 -- ...es_with_commit_statistics_by_author.cypher | 24 + cypher/Validation/ValidateGitHistory.cypher | 3 +- jupyter/GitHistoryGeneral.ipynb | 457 ++++++++++-------- 6 files changed, 279 insertions(+), 349 deletions(-) delete mode 100644 cypher/GitLog/List_git_files_directories.cypher delete mode 100644 cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher delete mode 100644 cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher create mode 100644 cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher diff --git a/cypher/GitLog/List_git_files_directories.cypher b/cypher/GitLog/List_git_files_directories.cypher deleted file mode 100644 index be62779a1..000000000 --- a/cypher/GitLog/List_git_files_directories.cypher +++ /dev/null @@ -1,29 +0,0 @@ -// List git file directories and the number of files they contain - - MATCH (git_file:File&Git&!Repository) - WITH * - ,git_file.relativePath AS gitFileName - ,reverse(split(reverse(git_file.relativePath),'/')[0]) AS gitFileNameWithoutPath - ,(git_file:Directory) AS isDirectory - WITH * - ,rtrim(split(gitFileName, gitFileNameWithoutPath)[0], '/') AS gitDirectoryPath - WITH gitDirectoryPath - ,coalesce(nullif(split(gitDirectoryPath, '/')[-2],''), 'root') AS directoryParentName - ,coalesce(nullif(split(gitDirectoryPath, '/')[-1],''), 'root') AS directoryName - ,size(split(gitDirectoryPath, '/')) AS pathLength - ,count(DISTINCT gitFileName) AS fileCount -// Debugging -// ,collect(git_file)[0..4] AS gitFileExamples -// ,collect(gitFileName) AS gitFileNameExamples -// ,collect(gitFileNameWithoutPath) AS gitFileNameWithoutPathExamples - WHERE fileCount > 1 -RETURN gitDirectoryPath - ,directoryParentName - ,directoryName - ,pathLength - ,fileCount -// Debugging -// ,gitFileExamples -// ,gitFileNameExamples -// ,gitFileNameWithoutPathExamples - ORDER BY gitDirectoryPath ASC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher b/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher deleted file mode 100644 index 522c50978..000000000 --- a/cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher +++ /dev/null @@ -1,66 +0,0 @@ -// List git file directories and their statistics - - MATCH (git_file:File&Git&!Repository) - WHERE git_file.deletedAt IS NULL // filter out deleted files - ORDER BY git_file.relativePath - WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch - ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch - ,collect(git_file) AS git_files -UNWIND git_files AS git_file - WITH * - ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp - ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp - WITH *, split(git_file.relativePath, '/') AS pathElements - WITH *, pathElements[-1] AS fileName - MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) - MATCH 
(git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file) - WITH pathElements - ,fileCreatedAtTimestamp - ,fileLastModificationAtTimestamp - ,fileName - ,git_file.relativePath AS fileRelativePath - ,max(git_repository.name) AS repository - ,max(git_commit.sha) AS maxCommitSha - ,COUNT(DISTINCT git_commit.sha) AS commitCount - ,COUNT(DISTINCT git_commit.author) AS authorCount - ,date(max(git_commit.date)) AS lastCommitDate -UNWIND pathElements AS pathElement - WITH * - ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent - WITH * - ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory - WHERE pathElement <> fileName - WITH repository AS gitRepositoryName - ,directory AS directoryPath - ,split(directory, '/')[-1] AS directoryName - ,parent AS directoryParentPath - ,split(parent, '/')[-1] AS directoryParentName - ,size(split(directory, '/')) AS directoryPathLength - ,count(DISTINCT fileRelativePath) AS fileCount - ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate - ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate - ,sum(commitCount) AS commitCount - ,sum(authorCount) AS authorCount - ,max(maxCommitSha) AS maxCommitSha - ,max(lastCommitDate) AS lastCommitDate - ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit - ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation - ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification -// The final results are grouped by the statistic values like file count,... -RETURN gitRepositoryName - ,fileCount - ,lastCreationDate - ,lastModificationDate - ,commitCount - ,authorCount - ,maxCommitSha - ,lastCommitDate - ,daysSinceLastCommit - ,daysSinceLastCreation - ,daysSinceLastModification - ,collect(directoryPath)[-1] AS directoryPath - ,apoc.text.join(collect(directoryName), '/') AS directoryName - ,collect(directoryParentPath)[0] AS directoryParentPath - ,collect(directoryParentName)[0] AS directoryParentName - ,max(directoryPathLength) AS directoryPathLength - ,count(DISTINCT directoryPath) AS combinedDirectoriesCount \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher b/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher deleted file mode 100644 index d2939dbf9..000000000 --- a/cypher/GitLog/List_git_files_directories_with_commit_statistics_no_joined_directories.cypher +++ /dev/null @@ -1,49 +0,0 @@ -// List git file directories and their statistics - - MATCH (git_file:File&Git&!Repository) - WHERE git_file.deletedAt IS NULL // filter out deleted files - ORDER BY git_file.relativePath - WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch - ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch - ,collect(git_file) AS git_files -UNWIND git_files AS git_file - WITH * - ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp - ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp - WITH *, split(git_file.relativePath, '/') AS pathElements - WITH *, pathElements[-1] AS fileName - MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) - MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file) - WITH pathElements - 
,fileCreatedAtTimestamp - ,fileLastModificationAtTimestamp - ,fileName - ,git_file.relativePath AS fileRelativePath - ,max(git_repository.name) AS repository - ,COUNT(DISTINCT git_commit.sha) AS commitCount - ,COUNT(DISTINCT git_commit.author) AS authorCount - ,date(max(git_commit.date)) AS lastCommitDate -UNWIND pathElements AS pathElement - WITH * - ,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent - WITH * - ,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory - WHERE pathElement <> fileName -RETURN repository AS gitRepositoryName - ,directory AS directoryPath - ,split(directory, '/')[-1] AS directoryName - ,parent AS directoryParentPath - ,split(parent, '/')[-1] AS directoryParentName - ,size(split(directory, '/')) AS directoryPathLength - ,count(DISTINCT fileRelativePath) AS fileCount - ,max(date(fileCreatedAtTimestamp) ) AS lastCreationDate - ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate - ,sum(commitCount) AS commitCount - ,sum(authorCount) AS authorCount - ,max(lastCommitDate) AS lastCommitDate - ,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLastCommit - ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation - ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification - // Debugging - //,collect(DISTINCT fileRelativePath)[0..4] AS relativePathExamples - //,collect(DISTINCT fileName)[0..4] AS fileNameExamples diff --git a/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher new file mode 100644 index 000000000..34d4e18fc --- /dev/null +++ b/cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher @@ -0,0 +1,24 @@ +// List git files with commit statistics + + MATCH (git_file:File&Git&!Repository) + WHERE git_file.deletedAt IS NULL // filter out deleted files + WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch + ,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch + ,collect(git_file) AS git_files +UNWIND git_files AS git_file + WITH * + ,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp + ,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp + MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) + MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-->(old_files_included:Git&File&!Repository)-[:HAS_NEW_NAME*0..3]->(git_file) +RETURN git_repository.name + '/' + git_file.relativePath AS filePath + ,split(git_commit.author, ' <')[0] AS author + ,count(DISTINCT git_commit.sha) AS commitCount + ,date(max(git_commit.date)) AS lastCommitDate + ,max(date(fileCreatedAtTimestamp)) AS lastCreationDate + ,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate + ,duration.inDays(date(max(git_commit.date)), date()).days AS daysSinceLastCommit + ,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation + ,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification + ,max(git_commit.sha) AS maxCommitSha +ORDER BY filePath ASCENDING, commitCount DESCENDING \ No newline at end of file diff --git a/cypher/Validation/ValidateGitHistory.cypher b/cypher/Validation/ValidateGitHistory.cypher index 
d2ead1b9b..2ca463310 100644 --- a/cypher/Validation/ValidateGitHistory.cypher +++ b/cypher/Validation/ValidateGitHistory.cypher @@ -1,5 +1,6 @@ -// Check if there is at least one Git:Commit pointing to a Git:Change containing a Git:File +// Check if there is at least one Git:Commit pointing to a Git:Change containing a Git:File from a Git:Repository MATCH (commit:Git:Commit)-[:CONTAINS_CHANGE]->(change:Git:Change)-->(file:Git:File) + MATCH (repository:Git:Repository)-[:HAS_FILE]->(file) RETURN commit.sha AS commitSha LIMIT 1 \ No newline at end of file diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 4e760a7c0..4696d4436 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -23,10 +23,9 @@ "outputs": [], "source": [ "import os\n", + "import numpy as np\n", "import pandas as pd\n", - "pd.options.mode.copy_on_write = True\n", - "\n", - "from neo4j import GraphDatabase" + "#pd.options.mode.copy_on_write = True" ] }, { @@ -36,9 +35,8 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from plotly import graph_objects as plotly_graph_objects\n", - "from plotly.subplots import make_subplots" + "from neo4j import GraphDatabase\n", + "from plotly import graph_objects as plotly_graph_objects" ] }, { @@ -51,6 +49,11 @@ "# To be able to distinguish between command line execution and Jupyter notebook execution\n", "# we need to check if the environment variable NBCONVERT is set.\n", "# The command line execution is required to take care of setting NBCONVERT.\n", + "\n", + "# Note: Even if it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)\n", + "# for command line executed notebooks (via nbconvert),\n", + "# it would require to execute the notebook twice: Once including interactivity and once for static Markdown and PDF.\n", + "# Therefore, command line executed notebooks (nbconvert) will contain static graphics (here using svg).\n", "def is_command_line_execution():\n", " return 'NBCONVERT' in os.environ\n", "\n", @@ -97,10 +100,15 @@ "metadata": {}, "outputs": [], "source": [ - "def query_cypher_to_data_frame(filename : str, limit: int = 10_000):\n", - " cypher_query_template = \"{query}\\nLIMIT {row_limit}\"\n", + "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", + " \"\"\"\n", + " Execute the Cypher query of the given file and returns the result.\n", + " filename : str : The name of the file containing the Cypher query\n", + " limit : int : The optional limit of rows to optimize the query. 
Default = -1 = no limit\n", + " \"\"\"\n", " cypher_query = get_cypher_query_from_file(filename)\n", - " cypher_query = cypher_query_template.format(query = cypher_query, row_limit = limit)\n", + " if limit > 0:\n", + " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] @@ -112,7 +120,7 @@ "metadata": {}, "outputs": [], "source": [ - "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = 10_000):\n", + "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", " If all given file names result in empty results, the last (empty) result will be returned.\n", @@ -164,7 +172,7 @@ "outputs": [], "source": [ "# Pandas DataFrame Display Configuration\n", - "pd.set_option('display.max_colwidth', 300)" + "pd.set_option('display.max_colwidth', 500)" ] }, { @@ -202,48 +210,22 @@ "source": [ "# Base settings for Plotly Treemap\n", "\n", - "plotly_treemap_base_settings = dict(\n", - " color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", - " path=['gitRepositoryName', 'directoryParentName', 'directoryName'],\n", - " maxdepth=-1\n", - ")\n", - "plotly_treemap_traces_base_settings = dict(\n", - " root_color=\"lightgrey\",\n", - " textinfo=\"label+value\",\n", - " marker=dict(cornerradius=5),\n", - ")\n", "plotly_treemap_layout_base_settings = dict(\n", " margin=dict(t=50, l=15, r=15, b=15),\n", ")\n", - "plotly_treemap_figure_base_settings = dict(\n", + "plotly_treemap_figure_show_settings = dict(\n", " renderer=\"svg\" if is_command_line_execution() else None,\n", " width=1000,\n", - " height=550\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95066d93", - "metadata": {}, - "outputs": [], - "source": [ - "# Common settings for commit statistics of git file directories with Plotly Treemap\n", + " height=800\n", + ")\n", "\n", - "plotly_treemap_commit_statistics_settings = dict(\n", - " **plotly_treemap_base_settings,\n", - " custom_data=['commitCount', 'authorCount', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification' , 'directoryPath'],\n", + "plotly_treemap_marker_base_style = dict(\n", + " cornerradius=5, \n", ")\n", - "# Provide alternative color scale for diverging values\n", - "plotly_treemap_commit_statistics_settings_with_diverging_color_scale = plotly_treemap_commit_statistics_settings.copy()\n", - "plotly_treemap_commit_statistics_settings_with_diverging_color_scale.update({'color_continuous_scale':'RdBu'})\n", - "plotly_treemap_commit_statistics_settings_with_reverse_color_scale = plotly_treemap_commit_statistics_settings.copy()\n", - "plotly_treemap_commit_statistics_settings_with_reverse_color_scale.update({'color_continuous_scale':'Hot'})\n", - "\n", - "plotly_treemap_traces_commit_statistics_settings = dict(\n", - " **plotly_treemap_traces_base_settings,\n", - " hovertemplate='%{label}
<br>Commits: %{customdata[0]}<br>Authors: %{customdata[1]}<br>Last Commit: %{customdata[2]} (%{customdata[3]} days ago)<br>Last Created: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Modified: %{customdata[6]} (%{customdata[7]} days ago)<br>
Path: %{customdata[8]}',\n", + "\n", + "plotly_treemap_marker_base_colorscale = dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colorscale='Hot_r', # Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", ")" ] }, @@ -268,7 +250,7 @@ " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Last Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>
Path: %{customdata[9]}',\n", " maxdepth=-1,\n", " root_color=\"lightgrey\",\n", - " marker=dict(cornerradius=5),\n", + " marker=dict(**plotly_treemap_marker_base_style),\n", " )" ] }, @@ -277,7 +259,7 @@ "id": "acacc415", "metadata": {}, "source": [ - "### Data Preparation Functions" + "### Visualization Data Preparation Functions" ] }, { @@ -322,178 +304,219 @@ " return data_frame" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "009a7222", - "metadata": {}, - "outputs": [], - "source": [ - "def filter_out_non_existing_parent_ids(data_frame: pd.DataFrame, parent_column: str, id_column: str):\n", - " \"\"\"\n", - " Filters out all rows with a parent ID where there is no entry in the ID column.\n", - " data_frame : pd.DataFrame : The input data frame\n", - " parent_column : str : The name of the parent column\n", - " id_column : str : The name of the ID column\n", - " return : pd.DataFrame : The filtered data frame\n", - " \"\"\"\n", - " list_of_ids = data_frame[id_column].tolist() + ['']\n", - " # For Debugging\n", - " problems = data_frame[~data_frame[parent_column].isin(list_of_ids)]\n", - " if problems.empty:\n", - " display(\"No problems with non-existing parent IDs found.\")\n", - " else:\n", - " print('\\033[31mFiltered out rows with non-existing parent IDs. See the entries in the table below.\\033[0m')\n", - " display(problems)\n", - " return data_frame[data_frame[parent_column].isin(list_of_ids)]\n", - "\n", - "def replace_empty_parent_by_repository_name(data_frame: pd.DataFrame, column_name: str, repository_column_name: str = ''):\n", - " \"\"\"\n", - " Replaces the value 'root' in the given column by the repository name.\n", - " data_frame : pd.DataFrame : The input data frame\n", - " column_name : str : The name of the column\n", - " gitRepositoryName : str : The name of the column that contains the value to be used instead of an empty root\n", - " return : pd.DataFrame : The modified data frame\n", - " \"\"\"\n", - " repository_names = data_frame[repository_column_name]\n", - " data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n", - "\n", - " return data_frame\n", - "\n", - "def prepare_treemap_commit_statistics_data(data_frame: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " data_frame : pd.DataFrame : The input data frame\n", - " return : pd.DataFrame : The data frame prepared for treemap visualization\n", - " \"\"\"\n", - " prepared_data = data_frame\n", - " prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'directoryPath')\n", - " prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'directoryPath')\n", - " prepared_data = replace_empty_parent_by_repository_name(prepared_data, 'directoryParentPath', 'gitRepositoryName')\n", - " return prepared_data" - ] - }, { "cell_type": "markdown", - "id": "0b717f80", + "id": "da109679", "metadata": {}, "source": [ - "### Function to split file path levels" + "### File Data Preparation Functions" ] }, { "cell_type": "code", "execution_count": null, - "id": "6581ec23", + "id": "299b06ea", "metadata": {}, "outputs": [], "source": [ - "# TODO Still needed?\n", + "def remove_last_file_path_element(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Removes the last element of the file path so that only the directory names retain.\n", + " file_path_elements : list : The list of levels to remove\n", + " return : list : The list of the directories\n", + " \"\"\"\n", + " return 
file_path_elements[:-1] if len(file_path_elements) > 1 else ['']\n", "\n", - "def fill_array_to_length(length: int, fill_value=''):\n", + "def convert_path_elements_to_directories(file_path_elements: list) -> list:\n", " \"\"\"\n", - " Fills the input array with the given fill value to the given length.\n", - " array : list : The input array\n", - " length : int : The length to fill the array to\n", - " fill_value : any : The value to fill the array with (default: '')\n", - " return : list : The filled array\n", + " Converts the file path elements into directories.\n", + " file_path_elements : list : The list of levels to convert\n", + " return : list : The list of directories\n", " \"\"\"\n", - " def fill_array(array: list):\n", - " return array + [fill_value] * (length - len(array))\n", - " return fill_array\n", + " directories = remove_last_file_path_element(file_path_elements)\n", + " return ['/'.join(directories[:i+1]) for i in range(len(directories))]\n", "\n", - "def add_file_path_levels(input_dataframe: pd.DataFrame, file_path_column: str, delimiter: str = '/'):\n", + "def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath'):\n", " \"\"\"\n", - " Adds hierarchical levels to a DataFrame based on a file path column.\n", + " Adds a directory column to the input DataFrame based on the file path column.\n", " input_dataframe : pd.DataFrame : The input DataFrame\n", " file_path_column : str : The name of the file path column\n", - " delimiter : str : The delimiter used to split the file path (default: '/')\n", - " return : pd.DataFrame : The DataFrame with added hierarchical levels\n", + " directory_column : str : The name of the directory column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", " \"\"\"\n", + " if directory_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " input_dataframe.insert(0, directory_column, input_dataframe[file_path_column].str.split('/').apply(convert_path_elements_to_directories))\n", + " input_dataframe = input_dataframe.explode(directory_column)\n", + " return input_dataframe\n", "\n", - " # Get longest path length in the DataFrame\n", - " max_path_length = input_dataframe[file_path_column].str.count(delimiter).max() + 1\n", + "def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName'):\n", + " \"\"\"\n", + " Adds a directory name column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_name_column : str : The name of the directory name column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory name column\n", + " \"\"\"\n", + " if directory_name_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_name_column, splitted_directories.apply(lambda x: (x[-1])))\n", + " return input_dataframe\n", "\n", - " # Split the file path column into multiple columns based on the delimiter and align the array to the right so that there are no null leaf nodes with obj.ffill()\n", - " dataframe_split = 
input_dataframe[file_path_column].str.split(delimiter).apply(fill_array_to_length(max_path_length)).apply(pd.Series)\n", + "def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath'):\n", + " \"\"\"\n", + " Adds a directory parent column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_parent_column : str : The name of the directory parent column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory parent column\n", + " \"\"\"\n", + " if directory_parent_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # Remove last path element from directory_column to get the directory_parent_column\n", + " splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n", + " \n", + " # Clear parent (set to empty string) when it equal to the directory\n", + " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n", + " return input_dataframe\n", "\n", - " # Prefix each column in df_split with 'level'\n", - " dataframe_split.columns = [f'level_{i+1}' for i in dataframe_split.columns]\n", + "def second_entry(values: pd.Series):\n", + " \"\"\"\n", + " Returns the second entry of a list of values.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The second entry\n", + " \"\"\"\n", + " return values.iloc[1] if len(values) > 1 else None\n", "\n", - " # Join df with df_split\n", - " return input_dataframe.copy().join(dataframe_split), dataframe_split.columns.tolist()" + "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n", + " \"\"\"\n", + " Return the file count from an array of array of file paths.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : int : The number of files\n", + " \"\"\"\n", + " return len(np.unique(np.concatenate(values.to_list())))" ] }, { "cell_type": "markdown", - "id": "2d0df211", + "id": "09aeae9b", "metadata": {}, "source": [ - "### Data Preview" + "### File Data Preparation " ] }, { "cell_type": "code", "execution_count": null, - "id": "9c9de7c5", + "id": "682d8aa9", "metadata": {}, "outputs": [], "source": [ - "git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher\")\n", - "git_file_directories_with_commit_statistics = prepare_treemap_commit_statistics_data(git_file_directories_with_commit_statistics)\n", + "git_files_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher\")\n", "\n", - "# Show a preview of the first 20 directories with the highest file count\n", - "display(\"Data Preview ------------------\")\n", - "git_file_directories_with_commit_statistics.sort_values(by=\"fileCount\", ascending=False).head(10)" - ] - }, - { - "cell_type": "markdown", - "id": "80338c9c", - "metadata": {}, - "source": [ - "### Null Checks" - ] - }, - { - "cell_type": "code", - "execution_count": null, 
- "id": "6f95993e", - "metadata": {}, - "outputs": [], - "source": [ - "# Null values in the DataFrame\n", - "git_file_directories_with_commit_statistics.isnull().sum() " + "# Debug\n", + "# display(\"1. query result ---------------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n", + "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n", + "\n", + "# Debug\n", + "# display(\"2. added directoryPath --------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Define how common non-grouped columns will be aggregated.\n", + "# Hint: maxCommitSha might not seem very useful, but it actually helps by group similar directories in the final step\n", + "common_named_aggregation = dict(\n", + " commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", + " daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n", + " daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n", + " daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n", + " lastCommitDate=pd.NamedAgg(column=\"lastCommitDate\", aggfunc=\"max\"),\n", + " lastCreationDate=pd.NamedAgg(column=\"lastCreationDate\", aggfunc=\"max\"),\n", + " lastModificationDate=pd.NamedAgg(column=\"lastModificationDate\", aggfunc=\"max\"),\n", + " maxCommitSha=pd.NamedAgg(column=\"maxCommitSha\", aggfunc=\"max\"),\n", + ")\n", + "\n", + "# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n", + " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", + " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", + " **common_named_aggregation\n", + ")\n", + "\n", + "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n", + "# The author with the most commits will then be listed first for each directory.\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", + "\n", + "# Debug\n", + "# display(\"3. 
grouped by 'directoryPath' and 'author' -----\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n", + "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n", + " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", + " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n", + " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n", + " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n", + " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n", + " **common_named_aggregation\n", + ")\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", + "\n", + "# Debug\n", + "# display(\"4. grouped by 'directoryPath' ----------------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n", + "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n", + "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n", + "\n", + "# Debug\n", + "# display(\"5. added parent and name columns ------------\")\n", + "# display(git_files_with_commit_statistics)\n", + "\n", + "# Group finally by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n", + "all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path).aggregate(\n", + " directoryName=pd.NamedAgg(column=\"directoryName\", aggfunc=lambda names: '/'.join(names)),\n", + " directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n", + " directoryPath=pd.NamedAgg(column=\"directoryPath\", aggfunc=\"last\"),\n", + ")\n", + "# Reorder the column positions so that the directory path is again the first column. \n", + "all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path\n", + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]\n", + "\n", + "# Debug\n", + "# display(\"6. 
grouped by all except for directory path, name and parent columns (max) ----------------------\")\n", + "# display(git_files_with_commit_statistics)" ] }, { "cell_type": "markdown", - "id": "5262bebf", + "id": "114f8d4b", "metadata": {}, "source": [ - "### Check for multiple root directories" + "### Data Preview" ] }, { "cell_type": "code", "execution_count": null, - "id": "10076683", + "id": "dc0c2d06", "metadata": {}, "outputs": [], "source": [ - "# Take the dataframe \"git_file_directories_with_commit_statistics\" and find values (=directories) in column \"directoryPath\", that have multiple parents (column \"directoryParentPath\").\n", - "\n", - "# Find directories with multiple parents\n", - "directories_with_multiple_parents = git_file_directories_with_commit_statistics.groupby('directoryPath').filter(lambda x: len(x) > 1)\n", - "directories_with_multiple_parents" - ] - }, - { - "cell_type": "markdown", - "id": "1096811a", - "metadata": {}, - "source": [ - "### TODO solve recursive missing parent directories issues\n", - "`WHERE NOT git_file.relativePath STARTS WITH 'docs/deadlines-guide/modules/ROOT/pages'`" + "git_files_with_commit_statistics.head(30)" ] }, { @@ -507,19 +530,29 @@ { "cell_type": "code", "execution_count": null, - "id": "19d108bb", + "id": "bc0dc138", "metadata": {}, "outputs": [], "source": [ "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", - " create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n", - " values = git_file_directories_with_commit_statistics['fileCount'],\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " values = git_files_with_commit_statistics['fileCount'],\n", "))\n", "figure.update_layout(\n", " **plotly_treemap_layout_base_settings,\n", " title='Directories and their file count'\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb399f44", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO Directories by main author" ] }, { @@ -537,15 +570,14 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_count_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, \"commitCount\", 0.96)\n", + "git_commit_count_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"commitCount\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_count_per_directory),\n", " values = git_commit_count_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", + " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_count_per_directory['commitCount_limited'], \n", - " colorscale='Hot_r',\n", " colorbar=dict(title=\"Commits\"),\n", " ),\n", "))\n", @@ -553,7 +585,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Number of git commits',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -571,15 +603,14 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_authors_per_directory = add_quantile_limited_column(git_file_directories_with_commit_statistics, \"authorCount\", 0.96)\n", + "git_commit_authors_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.96)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " 
create_treemap_commit_statistics_settings(git_commit_authors_per_directory),\n", " values = git_commit_authors_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", + " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_authors_per_directory['authorCount_limited'], \n", - " colorscale='Hot_r',\n", " colorbar=dict(title=\"Authors\"),\n", " ),\n", "))\n", @@ -587,7 +618,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Number of distinct commit authors',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -605,15 +636,14 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_commit_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastCommit\")\n", + "git_commit_days_since_last_commit_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCommit\")\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", - " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit'], \n", - " colorscale='Hot_r',\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_limited'], \n", " colorbar=dict(title=\"Days\"),\n", " ),\n", "))\n", @@ -622,7 +652,15 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last commit',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "a06f6d20", + "metadata": {}, + "source": [ + "### Days since last commit per directory (ranked)" ] }, { @@ -636,9 +674,8 @@ " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", + " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'], \n", - " colorscale='Hot_r',\n", " colorbar=dict(title=\"Rank\"),\n", " ),\n", "))\n", @@ -647,7 +684,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Rank of days since last commit',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -665,15 +702,14 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastCreation\")\n", + "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCreation\")\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", - " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation'], \n", - " colorscale='Hot_r',\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_limited'], \n", " colorbar=dict(title=\"Days\"),\n", " ),\n", "))\n", @@ -681,13 
+717,21 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last file creation',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "772eab2a", + "metadata": {}, + "source": [ + "### Days since last file creation per directory (ranked)" ] }, { "cell_type": "code", "execution_count": null, - "id": "3da7a4f6", + "id": "83d918ee", "metadata": {}, "outputs": [], "source": [ @@ -695,9 +739,8 @@ " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", + " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'], \n", - " colorscale='Hot_r',\n", " colorbar=dict(title=\"Rank\"),\n", " ),\n", "))\n", @@ -705,7 +748,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Rank of days since last file creation',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -723,15 +766,14 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_file_directories_with_commit_statistics, \"daysSinceLastModification\")\n", + "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastModification\")\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", - " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification'], \n", - " colorscale='Hot_r',\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_limited'], \n", " colorbar=dict(title=\"Days\"),\n", " ),\n", "))\n", @@ -739,7 +781,15 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last file modification',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "7abc96e4", + "metadata": {}, + "source": [ + "### Days since last file modification per directory (ranked)" ] }, { @@ -753,9 +803,8 @@ " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", " marker=dict(\n", - " cornerradius=5, \n", + " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'], \n", - " colorscale='Hot_r',\n", " colorbar=dict(title=\"Rank\"),\n", " ),\n", "))\n", @@ -763,7 +812,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Rank of days since last file modification',\n", ")\n", - "figure.show()" + "figure.show(**plotly_treemap_figure_show_settings)" ] } ], From 7a3e1319ae6f3fd4f262d78386570f72d540491d Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:29:43 +0100 Subject: [PATCH 6/9] Add main and second git author treemap plots --- jupyter/GitHistoryGeneral.ipynb | 232 
++++++++++++++++++++++++++++---- 1 file changed, 206 insertions(+), 26 deletions(-) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 4696d4436..8e91bd785 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -36,7 +36,8 @@ "outputs": [], "source": [ "from neo4j import GraphDatabase\n", - "from plotly import graph_objects as plotly_graph_objects" + "from plotly import graph_objects as plotly_graph_objects\n", + "from plotly.express import colors as plotly_colors" ] }, { @@ -246,8 +247,8 @@ " labels=data_frame['directoryName'],\n", " parents=data_frame['directoryParentPath'],\n", " ids=data_frame['directoryPath'],\n", - " customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", - " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Last Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>Path: %{customdata[9]}',\n", +        "            customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", +        "            hovertemplate='%{label}<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>
Path: %{customdata[11]}',\n", " maxdepth=-1,\n", " root_color=\"lightgrey\",\n", " marker=dict(**plotly_treemap_marker_base_style),\n", @@ -421,6 +422,16 @@ "source": [ "git_files_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher\")\n", "\n", + "# Get all authors, their commit count and based on it their rank in a separate dataframe.\n", + "# This will then be needed to visualize the (main) author for each directory.\n", + "git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n", + " authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", + " ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n", + "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=True, method='dense').astype(int)\n", + "\n", + "# Debug\n", + "# display(git_file_authors)\n", + "\n", "# Debug\n", "# display(\"1. query result ---------------------\")\n", "# display(git_files_with_commit_statistics)\n", @@ -524,7 +535,7 @@ "id": "ccc11f52", "metadata": {}, "source": [ - "### Directories by file count" + "### Number of files per directory" ] }, { @@ -545,16 +556,6 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb399f44", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO Directories by main author" - ] - }, { "cell_type": "markdown", "id": "e98ca7b1", @@ -574,7 +575,8 @@ "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_count_per_directory),\n", - " values = git_commit_count_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_count_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_count_per_directory['commitCount_limited'], \n", @@ -603,11 +605,12 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_authors_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.96)\n", + "git_commit_authors_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_authors_per_directory),\n", - " values = git_commit_authors_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_authors_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_authors_per_directory['authorCount_limited'], \n", @@ -621,6 +624,171 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "5dbceaef", + "metadata": {}, + "source": [ + "### Main author per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29069753", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO delete unused code" + ] + }, + { + "cell_type": "raw", + "id": "7ccca44e", + "metadata": {}, + "source": [ + "# TODO experiment again with plotly express\n", + "\n", + "import plotly.express as plotly_express\n", + "\n", + "plotly_treemap_color_settings = dict(\n", + " color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", + " color_discrete_sequence=plotly_express.colors.qualitative.Vivid,\n", + ")\n", + "plotly_treemap_commit_statistics_custom_data= dict(\n", + " custom_data=['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath'],\n", + ")\n", + "plotly_treemap_traces_base_settings = dict(\n", + " root_color=\"lightgrey\",\n", + " textinfo=\"label+value\",\n", + " marker=dict(cornerradius=5),\n", + ")\n", + "plotly_treemap_traces_commit_statistics_settings = dict(\n", + " **plotly_treemap_traces_base_settings,\n", + " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Main Author: %{customdata[3]}<br>Last Commit: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Created: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Modified: %{customdata[8]} (%{customdata[9]} days ago)<br>
Path: %{customdata[10]}',\n", + ")\n", + "plotly_treemap_layout_base_settings = dict(\n", + " margin=dict(t=50, l=15, r=15, b=15),\n", + ")\n", + "\n", + "# Extract unique authors for category orders\n", + "#unique_authors = git_files_with_commit_statistics['mainAuthor'].unique()\n", + "\n", + "figure = plotly_express.treemap(\n", + " git_files_with_commit_statistics,\n", + " **plotly_treemap_color_settings,\n", + " **plotly_treemap_commit_statistics_custom_data,\n", + " ids='directoryPath',\n", + " names='directoryName',\n", + " parents='directoryParentPath',\n", + " # Without values, much more squares are shown which gives a much better overview\n", + " # values='fileCount', \n", + " color='mainAuthor',\n", + " title='Directories and their main author (discrete coloring, no legend?)',\n", + ")\n", + "figure.update_traces(\n", + " **plotly_treemap_traces_commit_statistics_settings,\n", + ")\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " # coloraxis_colorbar=dict(title=\"Author\"),\n", + " legend_title_text='Main Author',\n", + " showlegend=True,\n", + " legend_visible=True,\n", + ") \n", + "\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "259f7278", + "metadata": {}, + "outputs": [], + "source": [ + "def create_git_authors_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n", + " \"\"\"\n", + " Creates a plotly graph_objects.Treemap marker object for git author plots.\n", + " main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n", + " author_rank_data_frame : pd.DataFrame : The DataFrame that contains the git authors, their commit count and based on that their rank.\n", + " author_column_name : str : The name of the (aggregated) author column for coloring the plot\n", + " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", + " \"\"\"\n", + " data_frame_with_authors=pd.merge(\n", + " main_data_frame, \n", + " author_rank_data_frame, \n", + " left_on=author_column_name, \n", + " right_on=\"author\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + " )\n", + " #display(data_frame_with_author_ranks)\n", + "\n", + " data_frame_with_author_ranks=data_frame_with_authors['authorCommitCountRank']\n", + "\n", + " return dict(\n", + " cornerradius=5, \n", + " colors=data_frame_with_author_ranks,\n", + " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", + " colorbar=dict(\n", + " title=\"Rank\",\n", + " tickmode=\"array\",\n", + " ticktext=data_frame_with_authors[author_column_name],\n", + " tickvals=data_frame_with_author_ranks,\n", + " tickfont_size=8\n", + " ),\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e97c0d87", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"mainAuthor\")\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Main author (highest number of commits)'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "349a1d03", + "metadata": {}, + "source": [ + "### Second author per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29484f84", + "metadata": {}, + "outputs": [], + "source": [ + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"secondAuthor\")\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Second author (second highest number of commits)'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "0ed919b0", @@ -636,11 +804,12 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_commit_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCommit\")\n", + "git_commit_days_since_last_commit_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastCommit\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", - " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " #values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_limited'], \n", @@ -670,9 +839,12 @@ "metadata": {}, "outputs": [], "source": [ + "git_commit_days_since_last_commit_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCommit\")\n", + "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_commit_per_directory),\n", - " values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_commit_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_commit_per_directory['daysSinceLastCommit_rank'], \n", @@ -702,11 +874,12 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCreation\")\n", + "git_commit_days_since_last_file_creation_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastCreation\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", - " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_limited'], \n", @@ -735,9 +908,12 @@ "metadata": {}, "outputs": [], "source": [ + "git_commit_days_since_last_file_creation_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastCreation\")\n", + "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_creation_per_directory),\n", - " values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_creation_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_creation_per_directory['daysSinceLastCreation_rank'], \n", @@ -766,11 +942,12 @@ "metadata": {}, "outputs": [], "source": [ - "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastModification\")\n", + "git_commit_days_since_last_file_modification_per_directory = add_quantile_limited_column(git_files_with_commit_statistics, \"daysSinceLastModification\", 0.98)\n", "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", - " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_limited'], \n", @@ -799,9 +976,12 @@ "metadata": {}, "outputs": [], "source": [ + "git_commit_days_since_last_file_modification_per_directory = add_rank_column(git_files_with_commit_statistics, \"daysSinceLastModification\")\n", + "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_commit_days_since_last_file_modification_per_directory),\n", - " values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_days_since_last_file_modification_per_directory['fileCount'],\n", " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_days_since_last_file_modification_per_directory['daysSinceLastModification_rank'], \n", From 14062080712ed54b079170c68ee5dc1482a0058c Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:14:55 +0100 Subject: [PATCH 7/9] Add wordcloud to git history --- jupyter/GitHistoryGeneral.ipynb | 145 +++++++++++++------------------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 8e91bd785..72ad6d602 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -91,16 +91,9 @@ "source": [ "def get_cypher_query_from_file(cypher_file_name : str):\n", " with open(cypher_file_name) as file:\n", - " return ' '.join(file.readlines())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59310f6f", - "metadata": {}, - "outputs": [], - "source": [ + " return ' '.join(file.readlines())\n", + "\n", + "\n", "def query_cypher_to_data_frame(filename : str, limit: int = -1):\n", " \"\"\"\n", " Execute the Cypher query of the given file and returns the result.\n", @@ -111,16 +104,9 @@ " if limit > 0:\n", " cypher_query = \"{query}\\nLIMIT {row_limit}\".format(query = cypher_query, row_limit = limit)\n", " records, summary, keys = driver.execute_query(cypher_query)\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c09da482", - "metadata": {}, - "outputs": [], - "source": [ + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", "def query_first_non_empty_cypher_to_data_frame(*filenames : str, limit: int = -1):\n", " \"\"\"\n", " Executes the Cypher queries of the given files and returns the first result that is not empty.\n", @@ -632,74 +618,6 @@ "### Main author per directory" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "29069753", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO delete unused code" - ] - }, - { - "cell_type": "raw", - "id": "7ccca44e", - "metadata": {}, - "source": [ - "# TODO experiment again with plotly express\n", - "\n", - "import plotly.express as plotly_express\n", - "\n", - "plotly_treemap_color_settings = dict(\n", - " color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", - " 
color_discrete_sequence=plotly_express.colors.qualitative.Vivid,\n", - ")\n", - "plotly_treemap_commit_statistics_custom_data= dict(\n", - " custom_data=['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath'],\n", - ")\n", - "plotly_treemap_traces_base_settings = dict(\n", - " root_color=\"lightgrey\",\n", - " textinfo=\"label+value\",\n", - " marker=dict(cornerradius=5),\n", - ")\n", - "plotly_treemap_traces_commit_statistics_settings = dict(\n", - " **plotly_treemap_traces_base_settings,\n", - " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Main Author: %{customdata[3]}<br>Last Commit: %{customdata[4]} (%{customdata[5]} days ago)<br>Last Created: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Modified: %{customdata[8]} (%{customdata[9]} days ago)<br>
Path: %{customdata[10]}',\n", - ")\n", - "plotly_treemap_layout_base_settings = dict(\n", - " margin=dict(t=50, l=15, r=15, b=15),\n", - ")\n", - "\n", - "# Extract unique authors for category orders\n", - "#unique_authors = git_files_with_commit_statistics['mainAuthor'].unique()\n", - "\n", - "figure = plotly_express.treemap(\n", - " git_files_with_commit_statistics,\n", - " **plotly_treemap_color_settings,\n", - " **plotly_treemap_commit_statistics_custom_data,\n", - " ids='directoryPath',\n", - " names='directoryName',\n", - " parents='directoryParentPath',\n", - " # Without values, much more squares are shown which gives a much better overview\n", - " # values='fileCount', \n", - " color='mainAuthor',\n", - " title='Directories and their main author (discrete coloring, no legend?)',\n", - ")\n", - "figure.update_traces(\n", - " **plotly_treemap_traces_commit_statistics_settings,\n", - ")\n", - "figure.update_layout(\n", - " **plotly_treemap_layout_base_settings,\n", - " # coloraxis_colorbar=dict(title=\"Author\"),\n", - " legend_title_text='Main Author',\n", - " showlegend=True,\n", - " legend_visible=True,\n", - ") \n", - "\n", - "figure.show(**plotly_treemap_figure_show_settings)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -994,6 +912,57 @@ ")\n", "figure.show(**plotly_treemap_figure_show_settings)" ] + }, + { + "cell_type": "markdown", + "id": "14e87aff", + "metadata": {}, + "source": [ + "## WordCloud of git authors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2f68f02", + "metadata": {}, + "outputs": [], + "source": [ + "# Query data from graph database\n", + "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n", + "# Debug \n", + "# display(git_author_words_with_frequency.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d83ce5f4", + "metadata": {}, + "outputs": [], + "source": [ + "from wordcloud import WordCloud\n", + "import matplotlib.pyplot as plot\n", + "\n", + "if not git_author_words_with_frequency.empty:\n", + " # Expects the first column of the DataFrame to contain the words/text and the second column to contain the count/frequency.\n", + " words_with_frequency_dict=git_author_words_with_frequency.set_index(git_author_words_with_frequency.columns[0]).to_dict()[git_author_words_with_frequency.columns[1]]\n", + " wordcloud = WordCloud(\n", + " width=800, \n", + " height=800,\n", + " max_words=600, \n", + " collocations=False,\n", + " background_color='white', \n", + " colormap='viridis'\n", + " ).generate_from_frequencies(words_with_frequency_dict)\n", + "\n", + " # Plot the word cloud\n", + " plot.figure(figsize=(15,15))\n", + " plot.imshow(wordcloud, interpolation='bilinear')\n", + " plot.axis(\"off\")\n", + " plot.title('Wordcloud of git authors')\n", + " plot.show()" + ] } ], "metadata": { From 1a1832f5132f950d6eb685c6f6d2b1cf18ef5b11 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Thu, 20 Mar 2025 08:10:16 +0100 Subject: [PATCH 8/9] Add file count per commit distribution chart --- ...t_git_files_per_commit_distribution.cypher | 6 ++ jupyter/GitHistoryGeneral.ipynb | 88 ++++++++++++++++++- 2 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 cypher/GitLog/List_git_files_per_commit_distribution.cypher diff --git a/cypher/GitLog/List_git_files_per_commit_distribution.cypher b/cypher/GitLog/List_git_files_per_commit_distribution.cypher new file mode 100644 
index 000000000..a15fb2994 --- /dev/null +++ b/cypher/GitLog/List_git_files_per_commit_distribution.cypher @@ -0,0 +1,6 @@ +// List how many git commits changed one file, how mandy changed two files, .... + +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change)-[]->(git_file:Git:File) + WITH git_commit, count(DISTINCT git_file.relativePath) AS filesPerCommit +RETURN filesPerCommit, count(DISTINCT git_commit.sha) AS commitCount +ORDER BY filesPerCommit ASC \ No newline at end of file diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 72ad6d602..c03ad9ef0 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -197,9 +197,15 @@ "source": [ "# Base settings for Plotly Treemap\n", "\n", - "plotly_treemap_layout_base_settings = dict(\n", + "plotly_main_layout_base_settings = dict(\n", " margin=dict(t=50, l=15, r=15, b=15),\n", ")\n", + "plotly_treemap_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_bar_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", "plotly_treemap_figure_show_settings = dict(\n", " renderer=\"svg\" if is_command_line_execution() else None,\n", " width=1000,\n", @@ -512,6 +518,16 @@ "id": "dc0c2d06", "metadata": {}, "outputs": [], + "source": [ + "git_files_with_commit_statistics.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053b448d", + "metadata": {}, + "outputs": [], "source": [ "git_files_with_commit_statistics.head(30)" ] @@ -913,6 +929,72 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "d8c6ccee", + "metadata": {}, + "source": [ + "## Filecount per commit\n", + "\n", + "Shows how many commits had changed one file, how many had changed two files, and so on.\n", + "The chart is limited to 30 lines for improved readability.\n", + "The data preview also includes overall statistics including the number of commits that are filtered out in the chart." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ed53b6e5", + "metadata": {}, + "source": [ + "### Preview data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5526e458", + "metadata": {}, + "outputs": [], + "source": [ + "git_file_count_per_commit = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_per_commit_distribution.cypher\")\n", + "\n", + "print(\"Sum of commits that changed more than 30 files (each) = \" + str(git_file_count_per_commit[git_file_count_per_commit['filesPerCommit'] > 30]['commitCount'].sum()))\n", + "print(\"Max changed files with one commit = \" + str(git_file_count_per_commit['filesPerCommit'].max()))\n", + "display(git_file_count_per_commit.describe())\n", + "display(git_file_count_per_commit.head(30))" + ] + }, + { + "cell_type": "markdown", + "id": "dcea826e", + "metadata": {}, + "source": [ + "### Bar chart with the number of files per commit distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e9dbc57", + "metadata": {}, + "outputs": [], + "source": [ + "if git_file_count_per_commit.empty:\n", + " print(\"No data to plot\")\n", + "else:\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Bar(\n", + " x=git_file_count_per_commit['filesPerCommit'].head(30), \n", + " y=git_file_count_per_commit['commitCount'].head(30)),\n", + " )\n", + " figure.update_layout(\n", + " **plotly_bar_layout_base_settings,\n", + " title='Changed files per commit',\n", + " xaxis_title='file count',\n", + " yaxis_title='commit count'\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "14e87aff", @@ -930,8 +1012,8 @@ "source": [ "# Query data from graph database\n", "git_author_words_with_frequency = query_cypher_to_data_frame(\"../cypher/Overview/Words_for_git_author_Wordcloud_with_frequency.cypher\")\n", - "# Debug \n", - "# display(git_author_words_with_frequency.head(10))" + "\n", + "git_author_words_with_frequency.sort_values(by='frequency', ascending=False).reset_index(drop=True).head(10)" ] }, { From 0e7c6453cf46dae4b26d0997345be4b2f247ef77 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 21 Mar 2025 08:21:57 +0100 Subject: [PATCH 9/9] Add most frequent file extension treemap --- jupyter/GitHistoryGeneral.ipynb | 216 ++++++++++++++++++++++++-------- 1 file changed, 166 insertions(+), 50 deletions(-) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index c03ad9ef0..fc4a1665a 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -239,14 +239,46 @@ " labels=data_frame['directoryName'],\n", " parents=data_frame['directoryParentPath'],\n", " ids=data_frame['directoryPath'],\n", - " customdata=data_frame[['fileCount', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", - " hovertemplate='%{label}
<br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[3]}, %{customdata[4]},.. (%{customdata[2]})<br>Last Commit: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Created: %{customdata[7]} (%{customdata[8]} days ago)<br>Last Modified: %{customdata[9]} (%{customdata[10]} days ago)<br>Path: %{customdata[11]}',\n", +        "            customdata=data_frame[['fileCount', 'mostFrequentFileExtension', 'commitCount', 'authorCount', 'mainAuthor', 'secondAuthor','lastCommitDate', 'daysSinceLastCommit', 'lastCreationDate', 'daysSinceLastCreation', 'lastModificationDate', 'daysSinceLastModification', 'directoryPath']],\n", +        "            hovertemplate='%{label}<br>Files: %{customdata[0]} (%{customdata[1]})<br>Commits: %{customdata[2]}<br>Authors: %{customdata[4]}, %{customdata[5]},.. (%{customdata[3]})<br>Last Commit: %{customdata[6]} (%{customdata[7]} days ago)<br>Last Created: %{customdata[8]} (%{customdata[9]} days ago)<br>Last Modified: %{customdata[10]} (%{customdata[11]} days ago)<br>
Path: %{customdata[12]}',\n", " maxdepth=-1,\n", " root_color=\"lightgrey\",\n", " marker=dict(**plotly_treemap_marker_base_style),\n", " )" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "641fa05c", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rank_colorbar_for_graph_objects_treemap_marker(data_frame: pd.DataFrame, name_column: str, rank_column: str):\n", + " \"\"\"\n", + " Creates a plotly graph_objects.Treemap marker object for a colorbar representing ranked names.\n", + " data_frame : pd.DataFrame : The DataFrame that contains the name and the count column\n", + " name_column : str : The name of the column containing the ranking \n", + " rank_column : str : The name of the column containing the ranking \n", + " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", + " \"\"\"\n", + " # The rank is inverted so that the first rank is shown on the top of the colorbar.\n", + " inverse_ranked = data_frame[rank_column].max() + 1 - data_frame[rank_column]\n", + "\n", + " return dict(\n", + " cornerradius=5, \n", + " colors=inverse_ranked,\n", + " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", + " colorbar=dict(\n", + " title=\"Rank\",\n", + " tickmode=\"array\",\n", + " ticktext=data_frame[name_column],\n", + " tickvals=inverse_ranked,\n", + " tickfont_size=10\n", + " ),\n", + " )" + ] + }, { "cell_type": "markdown", "id": "acacc415", @@ -312,6 +344,41 @@ "metadata": {}, "outputs": [], "source": [ + "def get_last_entry(values: pd.Series):\n", + " \"\"\"\n", + " Get the last element of an array and converts therefore an array to a single element\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The last entry\n", + " \"\"\"\n", + " return values[-1]\n", + "\n", + "\n", + "def add_file_extension_column(input_dataframe: pd.DataFrame, file_path_column: str, file_extension_column: str = 'fileExtension'):\n", + " \"\"\"\n", + " Adds a fileExtension column to the input DataFrame based on the file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " file_extension_column : str : The name of the file extension column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", + " \"\"\"\n", + " if file_extension_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # What is the correct extension in the following cases?\n", + " # - /main/resources/META-INF/services/org.axonframework.messaging.annotation.HandlerEnhancerDefinition\n", + " # - MyReactComponent.test.tsx\n", + " # Currently, it would be\n", + " # - HandlerEnhancerDefinition\n", + " # - tsx\n", + " # which is not perfect but good enough to start with.#\n", + " \n", + " file_path_column_position = input_dataframe.columns.get_loc(file_path_column)\n", + " file_extensions=input_dataframe[file_path_column].str.split('/').map(get_last_entry)\n", + " file_extensions=file_extensions.str.split('.').map(get_last_entry)\n", + " input_dataframe.insert(file_path_column_position + 1, file_extension_column, file_extensions)\n", + " return input_dataframe\n", + "\n", "def remove_last_file_path_element(file_path_elements: list) -> list:\n", " \"\"\"\n", " Removes the last element of the file path so that only the directory names retain.\n", @@ -378,6 
+445,16 @@ " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n", " return input_dataframe\n", "\n", + "\n", + "def collect_as_array(values: pd.Series):\n", + " \"\"\"\n", + " Just collect all values (no operation, \"noop\")\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : any : The second entry\n", + " \"\"\"\n", + " return np.asanyarray(values.to_list())\n", + "\n", "def second_entry(values: pd.Series):\n", " \"\"\"\n", " Returns the second entry of a list of values.\n", @@ -394,7 +471,22 @@ " values : Series : The pandas Series of values\n", " return : int : The number of files\n", " \"\"\"\n", - " return len(np.unique(np.concatenate(values.to_list())))" + " return len(np.unique(np.concatenate(values.to_list())))\n", + "\n", + "\n", + "def get_most_frequent_entry(input_values: pd.Series):\n", + " \"\"\"\n", + " Flattens the array of arrays and return the most frequent entry .\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " input_values : Series : The pandas Series of values\n", + " return : str : The most frequent entry\n", + " \"\"\"\n", + " # flatten the array of arrays \n", + " values = np.concatenate(input_values.to_list())\n", + " # find frequency of each value\n", + " values, counts = np.unique(values, return_counts=True)\n", + " #display all values with highest frequencies\n", + " return values[counts.argmax()]" ] }, { @@ -419,7 +511,7 @@ "git_file_authors=git_files_with_commit_statistics[['author', 'commitCount']].groupby('author').aggregate(\n", " authorCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", " ).sort_values(by='authorCommitCount', ascending=False).reset_index()\n", - "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=True, method='dense').astype(int)\n", + "git_file_authors['authorCommitCountRank'] = git_file_authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)\n", "\n", "# Debug\n", "# display(git_file_authors)\n", @@ -428,11 +520,21 @@ "# display(\"1. query result ---------------------\")\n", "# display(git_files_with_commit_statistics)\n", "\n", + "# Add new column 'fileExtension' for every 'filePath'\n", + "git_files_with_commit_statistics = add_file_extension_column(git_files_with_commit_statistics, 'filePath', 'fileExtension')\n", + "\n", + "# Create a separate dataframe with all unique extensions, the number of their occurrences and the rank derived from it.\n", + "git_file_extensions=git_files_with_commit_statistics['fileExtension'].value_counts().rename_axis('fileExtension').reset_index(name='fileExtensionCount')\n", + "git_file_extensions['fileExtensionCountRank'] = git_file_extensions['fileExtensionCount'].rank(ascending=False, method='dense').astype(int)\n", + "\n", + "# Debug\n", + "# display(git_file_extensions)\n", + "\n", "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n", "git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n", "\n", "# Debug\n", - "# display(\"2. added directoryPath --------------\")\n", + "# display(\"2. 
added directoryPath and fileExtension --------------\")\n", "# display(git_files_with_commit_statistics)\n", "\n", "# Define how common non-grouped columns will be aggregated.\n", @@ -452,6 +554,7 @@ "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n", " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", + " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n", " **common_named_aggregation\n", ")\n", "\n", @@ -469,6 +572,7 @@ "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n", " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n", + " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n", " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n", " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n", " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n", @@ -558,6 +662,42 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "e93d944a", + "metadata": {}, + "source": [ + "### Most frequent file extension per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0147c747", + "metadata": {}, + "outputs": [], + "source": [ + "git_files_with_commit_statistics_and_file_extension_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_extensions, \n", + " left_on='mostFrequentFileExtension', \n", + " right_on=\"fileExtension\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", + " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", + " # values = git_files_with_commit_statistics['fileCount'],\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_file_extension_rank, 'fileExtension', 'fileExtensionCountRank')\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Most frequent file extension per directory'\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "e98ca7b1", @@ -634,47 +774,6 @@ "### Main author per directory" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "259f7278", - "metadata": {}, - "outputs": [], - "source": [ - "def create_git_authors_graph_objects_treemap_marker(main_data_frame: pd.DataFrame, author_rank_data_frame: pd.DataFrame, author_column_name: str):\n", - " \"\"\"\n", - " Creates a plotly graph_objects.Treemap marker object for git author plots.\n", - " main_data_frame : pd.DataFrame : The DataFrame that contains the git directories and their commit statistics\n", - " author_rank_data_frame : pd.DataFrame : The DataFrame that contains the git authors, their commit count and based on that their rank.\n", - " author_column_name : str : The name of the (aggregated) author column for coloring the plot\n", - " return : plotly_graph_objects.treemap.Marker : The created Marker object\n", - " \"\"\"\n", - " data_frame_with_authors=pd.merge(\n", - " main_data_frame, \n", - " author_rank_data_frame, \n", - " left_on=author_column_name, \n", - " right_on=\"author\",\n", - " how=\"left\",\n", - " validate=\"m:1\"\n", - " )\n", - " #display(data_frame_with_author_ranks)\n", - "\n", - " data_frame_with_author_ranks=data_frame_with_authors['authorCommitCountRank']\n", - "\n", - " return dict(\n", - " cornerradius=5, \n", - " colors=data_frame_with_author_ranks,\n", - " colorscale=plotly_colors.qualitative.G10, #favorites: plotly_colors.qualitative.G10, Blackbody, ice, haline, hot\n", - " colorbar=dict(\n", - " title=\"Rank\",\n", - " tickmode=\"array\",\n", - " ticktext=data_frame_with_authors[author_column_name],\n", - " tickvals=data_frame_with_author_ranks,\n", - " tickfont_size=8\n", - " ),\n", - " )\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -682,15 +781,23 @@ "metadata": {}, "outputs": [], "source": [ + "git_files_with_commit_statistics_and_main_author_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_authors, \n", + " left_on='mainAuthor', \n", + " right_on=\"author\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", " # Without values, much more squares are shown which gives a much better overview. 
The drawback is that the fileCount isn't visible.\n", " # values = git_files_with_commit_statistics['fileCount'],\n", - " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"mainAuthor\")\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_main_author_rank, 'mainAuthor', 'authorCommitCountRank')\n", "))\n", "figure.update_layout(\n", " **plotly_treemap_layout_base_settings,\n", - " title='Main author (highest number of commits)'\n", + " title='Main authors with highest number of commits'\n", ")\n", "figure.show(**plotly_treemap_figure_show_settings)" ] @@ -710,15 +817,24 @@ "metadata": {}, "outputs": [], "source": [ + "git_files_with_commit_statistics_and_second_author_rank = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " git_file_authors, \n", + " left_on='secondAuthor', \n", + " right_on=\"author\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "\n", "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", " create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", " # values = git_files_with_commit_statistics['fileCount'],\n", - " marker=create_git_authors_graph_objects_treemap_marker(git_files_with_commit_statistics, git_file_authors, \"secondAuthor\")\n", + " marker=create_rank_colorbar_for_graph_objects_treemap_marker(git_files_with_commit_statistics_and_second_author_rank, 'secondAuthor', 'authorCommitCountRank')\n", "))\n", "figure.update_layout(\n", " **plotly_treemap_layout_base_settings,\n", - " title='Second author (second highest number of commits)'\n", + " title='Second author with the second highest number of commits'\n", ")\n", "figure.show(**plotly_treemap_figure_show_settings)" ]
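
Since the patches above only show the notebook cells in diff form, here is a minimal, self-contained sketch of the rank-based colorbar technique that the main author, second author, and file extension treemaps rely on. The directory and author data below are made up purely for illustration; only the merge/rank/colorbar pattern mirrors the notebook code.

```python
# Illustrative sketch only (not part of the patch): synthetic data standing in for the
# notebook's git_files_with_commit_statistics and git_file_authors DataFrames.
import pandas as pd
from plotly import graph_objects as plotly_graph_objects
from plotly.express import colors as plotly_colors

# Hypothetical per-directory statistics (path, parent, name and the aggregated main author).
directories = pd.DataFrame({
    'directoryPath':       ['src', 'src/core', 'src/ui'],
    'directoryParentPath': ['',    'src',      'src'],
    'directoryName':       ['src', 'core',     'ui'],
    'mainAuthor':          ['Alice', 'Bob',    'Alice'],
})

# Rank authors by their overall commit count (rank 1 = most commits).
authors = pd.DataFrame({'author': ['Alice', 'Bob'], 'authorCommitCount': [120, 45]})
authors['authorCommitCountRank'] = authors['authorCommitCount'].rank(ascending=False, method='dense').astype(int)

# Merge the rank of each directory's main author onto the directory statistics.
merged = pd.merge(directories, authors, left_on='mainAuthor', right_on='author', how='left', validate='m:1')

# Invert the rank so the top-ranked author appears at the top of the colorbar,
# and label the colorbar ticks with author names instead of numeric ranks.
inverse_rank = merged['authorCommitCountRank'].max() + 1 - merged['authorCommitCountRank']
marker = dict(
    cornerradius=5,
    colors=inverse_rank,
    colorscale=plotly_colors.qualitative.G10,
    colorbar=dict(title='Rank', tickmode='array', ticktext=merged['mainAuthor'],
                  tickvals=inverse_rank, tickfont_size=10),
)

figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    labels=merged['directoryName'],
    parents=merged['directoryParentPath'],
    ids=merged['directoryPath'],
    marker=marker,
))
figure.update_layout(title='Main author per directory (synthetic example)',
                     margin=dict(t=50, l=15, r=15, b=15))
figure.show()
```

The design choice here is the same as in the notebook cells: a qualitative palette passed as `colorscale` plus a tick-labelled colorbar turns a continuous color axis into a de-facto categorical legend, which `plotly.graph_objects.Treemap` does not provide out of the box.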