|
493 | 493 | " \"\"\"\n",
|
494 | 494 | " return values.iloc[1] if len(values) > 1 else None\n",
|
495 | 495 | "\n",
|
496 |
| - "def get_file_count_from_aggregated_file_paths(values: pd.Series):\n", |
| 496 | + "def get_flattened_unique_values(values: pd.Series):\n", |
497 | 497 | " \"\"\"\n",
|
498 |
| - " Return the file count from an array of array of file paths.\n", |
| 498 | + " Return an array of unique string values from an array of arrays of strings.\n", |
| 499 | + " Meant to be used as an aggregation function for dataframe grouping.\n", |
| 500 | + " values : Series : The pandas Series of values\n", |
| 501 | + " return : ndarray : The sorted unique values of all flattened arrays\n", |
| 502 | + " \"\"\"\n", |
| 503 | + " return np.unique(np.concatenate(values.to_list()))\n", |
| 504 | + "\n", |
| 505 | + "def count_unique_aggregated_values(values: pd.Series):\n", |
| 506 | + " \"\"\"\n", |
| 507 | + " Return the number of unique values from an array of arrays of strings.\n", |
499 | 508 | " Meant to be used as an aggregation function for dataframe grouping.\n",
|
500 | 509 | " values : Series : The pandas Series of values\n",
|
501 | 510 | " return : int : The number of unique values\n",
|
|
573 | 582 | "# Define how common non-grouped columns will be aggregated.\n",
|
574 | 583 | "# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n",
|
575 | 584 | "common_named_aggregation = dict(\n",
|
576 |
| - " commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", |
577 | 585 | " daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
|
578 | 586 | " daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
|
579 | 587 | " daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
|
|
588 | 596 | " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
|
589 | 597 | " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
|
590 | 598 | " fileExtensions=pd.NamedAgg(column=\"fileExtension\", aggfunc=collect_as_array),\n",
|
| 599 | + " commitHashes=pd.NamedAgg(column=\"commitHashes\", aggfunc=get_flattened_unique_values),\n", |
| 600 | + " intermediateCommitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=\"count\"),\n", |
591 | 601 | " **common_named_aggregation\n",
|
592 | 602 | ")\n",
|
593 | 603 | "\n",
|
594 | 604 | "# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
|
595 | 605 | "# The author with the most commits will then be listed first for each directory.\n",
|
596 |
| - "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n", |
| 606 | + "git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'intermediateCommitCount'], ascending=[True, False])\n", |
597 | 607 | "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
|
598 | 608 | "\n",
|
599 | 609 | "# Debug\n",
|
|
603 | 613 | "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
|
604 | 614 | "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
|
605 | 615 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
|
606 |
| - " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", |
| 616 | + " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n", |
607 | 617 | " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
|
608 | 618 | " mostFrequentFileExtension=pd.NamedAgg(column=\"fileExtensions\", aggfunc=get_most_frequent_entry),\n",
|
609 | 619 | " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
|
610 | 620 | " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
|
611 | 621 | " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
|
| 622 | + " commitCount=pd.NamedAgg(column=\"commitHashes\", aggfunc=count_unique_aggregated_values),\n", |
612 | 623 | " **common_named_aggregation\n",
|
613 | 624 | ")\n",
|
614 | 625 | "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
|
|
669 | 680 | "git_files_with_commit_statistics.head(30)"
|
670 | 681 | ]
|
671 | 682 | },
|
| 683 | + { |
| 684 | + "cell_type": "code", |
| 685 | + "execution_count": null, |
| 686 | + "id": "53fcd8b2", |
| 687 | + "metadata": {}, |
| 688 | + "outputs": [], |
| 689 | + "source": [ |
| 690 | + "# Print prepared data frame to CSV file\n", |
| 691 | + "# git_files_with_commit_statistics.to_csv('git_files_with_commit_statistics.csv', index=False)" |
| 692 | + ] |
| 693 | + }, |
672 | 694 | {
|
673 | 695 | "cell_type": "markdown",
|
674 | 696 | "id": "ccc11f52",
|
|
0 commit comments