|
423 | 423 | " return input_dataframe.copy().join(dataframe_split), dataframe_split.columns.tolist()"
|
424 | 424 | ]
|
425 | 425 | },
|
| 426 | + { |
| 427 | + "cell_type": "markdown", |
| 428 | + "id": "da109679", |
| 429 | + "metadata": {}, |
| 430 | + "source": [ |
| 431 | + "### File Data Preparation Functions" |
| 432 | + ] |
| 433 | + }, |
| 434 | + { |
| 435 | + "cell_type": "code", |
| 436 | + "execution_count": null, |
| 437 | + "id": "299b06ea", |
| 438 | + "metadata": {}, |
| 439 | + "outputs": [], |
| 440 | + "source": [ |
def remove_last_file_path_element(file_path_elements: list) -> list:
    """
    Drops the last element of a split file path so that only the directory elements remain.
    A path with at most one element has no directory and leads to a list with one empty string.
    file_path_elements : list : The elements of the split file path
    return : list : The directory elements of the path
    """
    if len(file_path_elements) <= 1:
        # Nothing but the last element (or nothing at all): represent "no directory" as ''.
        return ['']
    return file_path_elements[:-1]
| 448 | + "\n", |
def convert_path_elements_to_directories(file_path_elements: list) -> list:
    """
    Builds the paths of all directory levels that lead to a file.
    The last path element is dropped first, then every prefix of the remaining
    elements is joined with '/' (e.g. ['a', 'b', 'c.txt'] leads to ['a', 'a/b']).
    file_path_elements : list : The elements of the split file path
    return : list : The directory paths from the outermost to the innermost level
    """
    directory_elements = remove_last_file_path_element(file_path_elements)
    directory_paths = []
    for depth in range(1, len(directory_elements) + 1):
        directory_paths.append('/'.join(directory_elements[:depth]))
    return directory_paths
| 457 | + "\n", |
def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath') -> pd.DataFrame:
    """
    Adds a directory column to a copy of the input DataFrame based on the file path column.
    Every row is repeated once per directory level of its file path, so the row index
    of the result contains duplicates. The input DataFrame itself is not modified.
    input_dataframe : pd.DataFrame : The input DataFrame
    file_path_column : str : The name of the file path column
    directory_column : str : The name of the directory column to be added
    return : pd.DataFrame : The DataFrame with added directory column (the unchanged input if the column already exists)
    """
    if directory_column in input_dataframe.columns:
        return input_dataframe  # Column already exists

    # Work on a copy: DataFrame.insert operates in place and would otherwise
    # leave a list-valued column behind in the DataFrame of the caller.
    result = input_dataframe.copy()
    directories_per_file = result[file_path_column].str.split('/').apply(convert_path_elements_to_directories)
    result.insert(0, directory_column, directories_per_file)

    # One row per (file, directory level) instead of one list of directories per file.
    return result.explode(directory_column)
| 472 | + "\n", |
def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName') -> pd.DataFrame:
    """
    Adds a directory name column to a copy of the input DataFrame based on the directory column.
    The directory name is the last '/'-separated element of the directory path.
    The input DataFrame itself is not modified.
    input_dataframe : pd.DataFrame : The input DataFrame
    directory_column : str : The name of the directory column
    directory_name_column : str : The name of the directory name column to be added
    return : pd.DataFrame : The DataFrame with added directory name column (the unchanged input if the column already exists)
    """
    if directory_name_column in input_dataframe.columns:
        return input_dataframe  # Column already exists

    # Work on a copy: DataFrame.insert operates in place on the DataFrame of the caller.
    result = input_dataframe.copy()

    # Split once from the right and take the last part. The .str accessor passes
    # missing values through instead of failing on them like a plain lambda would.
    directory_names = result[directory_column].str.rsplit('/', n=1).str[-1]
    result.insert(1, directory_name_column, directory_names)
    return result
| 487 | + "\n", |
def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath') -> pd.DataFrame:
    """
    Adds a directory parent column to a copy of the input DataFrame based on the directory column.
    The parent is the directory path without its last '/'-separated element.
    Directories without a '/' get an empty string as parent.
    The input DataFrame itself is not modified.
    input_dataframe : pd.DataFrame : The input DataFrame
    directory_column : str : The name of the directory column
    directory_parent_column : str : The name of the directory parent column to be added
    return : pd.DataFrame : The DataFrame with added directory parent column (the unchanged input if the column already exists)
    """
    if directory_parent_column in input_dataframe.columns:
        return input_dataframe  # Column already exists

    # Work on a copy: DataFrame.insert operates in place on the DataFrame of the caller.
    result = input_dataframe.copy()
    directories = result[directory_column]

    # Remove the last path element from the directory to get its parent directory.
    parent_directories = directories.str.rsplit('/', n=1).str[0]

    # Without any '/' the split returns the directory itself, which means there is no parent.
    # Clear the parent (set it to an empty string) when it equals the directory.
    parent_directories = parent_directories.mask(parent_directories == directories, '')

    result.insert(1, directory_parent_column, parent_directories)
    return result
| 506 | + "\n", |
def second_entry(values: pd.Series):
    """
    Picks the entry at the second position of the given values.
    Meant to be used as an aggregation function for dataframe grouping.
    values : Series : The pandas Series of values
    return : any : The second entry or None if there are fewer than two entries
    """
    if len(values) < 2:
        return None
    # Positional access: the index labels of the Series are irrelevant here.
    return values.iloc[1]
| 515 | + "\n", |
def get_file_count_from_aggregated_file_paths(values: pd.Series) -> int:
    """
    Returns the number of distinct file paths contained in a Series of file path arrays.
    Meant to be used as an aggregation function for dataframe grouping.
    Entries that are a single value instead of an array count as one file path.
    values : Series : The pandas Series of file path arrays
    return : int : The number of distinct files (0 for an empty Series)
    """
    # Normalise every entry to a one-dimensional array so that scalar entries
    # can be concatenated together with real arrays.
    file_path_arrays = [np.atleast_1d(entry) for entry in values.to_list()]
    if not file_path_arrays:
        # np.concatenate raises on an empty list, so report "no files" directly.
        return 0
    return len(np.unique(np.concatenate(file_path_arrays)))
| 524 | + ] |
| 525 | + }, |
| 526 | + { |
| 527 | + "cell_type": "markdown", |
| 528 | + "id": "09aeae9b", |
| 529 | + "metadata": {}, |
| 530 | + "source": [ |
| 531 | + "### File Data Preparation " |
| 532 | + ] |
| 533 | + }, |
| 534 | + { |
| 535 | + "cell_type": "code", |
| 536 | + "execution_count": null, |
| 537 | + "id": "682d8aa9", |
| 538 | + "metadata": {}, |
| 539 | + "outputs": [], |
| 540 | + "source": [ |
# Every stage gets its own name so that intermediate results stay inspectable.
# Only the final result is stored in 'git_files_with_commit_statistics', which is what later cells use.
git_files_by_author = query_cypher_to_data_frame("../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher")

display("1. query result ---------------------")
display(git_files_by_author)

# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'
git_files_by_directory_level = add_directory_column(git_files_by_author, 'filePath', 'directoryPath')

display("2. added directoryPath --------------")
display(git_files_by_directory_level)

# Define how common non-grouped columns will be aggregated.
common_named_aggregation = dict(
    commitCount=pd.NamedAgg(column="commitCount", aggfunc="sum"),
    daysSinceLastCommit=pd.NamedAgg(column="daysSinceLastCommit", aggfunc="min"),
    daysSinceLastCreation=pd.NamedAgg(column="daysSinceLastCreation", aggfunc="min"),
    daysSinceLastModification=pd.NamedAgg(column="daysSinceLastModification", aggfunc="min"),
    lastCommitDate=pd.NamedAgg(column="lastCommitDate", aggfunc="max"),
    lastCreationDate=pd.NamedAgg(column="lastCreationDate", aggfunc="max"),
    lastModificationDate=pd.NamedAgg(column="lastModificationDate", aggfunc="max"),
    maxCommitSha=pd.NamedAgg(column="maxCommitSha", aggfunc="max"),
)

# Group the git files by their directory and author and collect the distinct file paths of each group.
# Then sort by the name of the directory ascending and the number of commits descending,
# so that the author with the most commits is listed first for each directory.
git_directories_by_author = (
    git_files_by_directory_level
    .groupby(['directoryPath', 'author'])
    .aggregate(
        filePaths=pd.NamedAgg(column="filePath", aggfunc=np.unique),
        **common_named_aggregation
    )
    .sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])
    .reset_index()
)

display("3. grouped by 'directoryPath' and 'author' -----")
display(git_directories_by_author)

# Group the entries again now only by their directory path to get the number of files (across all levels),
# the aggregated number of authors, the main author and the second author.
# 'mainAuthor' and 'secondAuthor' rely on the commit count ordering established above.
git_directories = (
    git_directories_by_author
    .groupby('directoryPath')
    .aggregate(
        fileCount=pd.NamedAgg(column="filePaths", aggfunc=get_file_count_from_aggregated_file_paths),
        authorCount=pd.NamedAgg(column="author", aggfunc="nunique"),
        mainAuthor=pd.NamedAgg(column="author", aggfunc="first"),
        secondAuthor=pd.NamedAgg(column="author", aggfunc=second_entry),
        **common_named_aggregation
    )
    .reset_index()
)

display("4. grouped by 'directoryPath' ----------------------")
display(git_directories)

# Add the name of the directory (last '/' separated element) and the parent directory path to the table.
git_files_with_commit_statistics = add_directory_name_column(git_directories)
git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)

display("5. added parent and name columns ------------")
display(git_files_with_commit_statistics)
| 603 | + ] |
| 604 | + }, |
| 605 | + { |
| 606 | + "cell_type": "markdown", |
| 607 | + "id": "114f8d4b", |
| 608 | + "metadata": {}, |
| 609 | + "source": [ |
| 610 | + "### File Data Preview" |
| 611 | + ] |
| 612 | + }, |
| 613 | + { |
| 614 | + "cell_type": "code", |
| 615 | + "execution_count": null, |
| 616 | + "id": "dc0c2d06", |
| 617 | + "metadata": {}, |
| 618 | + "outputs": [], |
| 619 | + "source": [ |
| 620 | + "git_files_with_commit_statistics.head(20)" |
| 621 | + ] |
| 622 | + }, |
426 | 623 | {
|
427 | 624 | "cell_type": "markdown",
|
428 | 625 | "id": "2d0df211",
|
|
504 | 701 | "### Directories by file count"
|
505 | 702 | ]
|
506 | 703 | },
|
| 704 | + { |
| 705 | + "cell_type": "code", |
| 706 | + "execution_count": null, |
| 707 | + "id": "bc0dc138", |
| 708 | + "metadata": {}, |
| 709 | + "outputs": [], |
| 710 | + "source": [ |
# Treemap of all directories sized by the number of files they contain,
# based on the directory data prepared from the git file commit statistics.
figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
    create_treemap_commit_statistics_settings(git_files_with_commit_statistics),
    values = git_files_with_commit_statistics['fileCount'],
))
figure.update_layout(
    **plotly_treemap_layout_base_settings,
    title='Directories and their file count'
)
figure.show()
| 722 | + ] |
| 723 | + }, |
507 | 724 | {
|
508 | 725 | "cell_type": "code",
|
509 | 726 | "execution_count": null,
|
|
512 | 729 | "outputs": [],
|
513 | 730 | "source": [
|
514 | 731 | "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
|
| 732 | + " # create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n", |
| 733 | + " # values = git_files_with_commit_statistics['fileCount'],\n", |
515 | 734 | " create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
|
516 | 735 | " values = git_file_directories_with_commit_statistics['fileCount'],\n",
|
517 | 736 | "))\n",
|
|
0 commit comments