Skip to content

Commit 67cfcea

Browse files
committed
fixup! Add git history file overview treemap
1 parent 1e962e5 commit 67cfcea

File tree

2 files changed

+243
-0
lines changed

2 files changed

+243
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// List git files with commit statistics
2+
3+
MATCH (git_file:File&Git&!Repository)
4+
WHERE git_file.deletedAt IS NULL // filter out deleted files
5+
WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch
6+
,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch
7+
,collect(git_file) AS git_files
8+
UNWIND git_files AS git_file
9+
WITH *
10+
,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp
11+
,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, git_file.createdAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp
12+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
13+
MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file)
14+
RETURN git_repository.name + '/' + git_file.relativePath AS filePath
15+
,split(git_commit.author, ' <')[0] AS author
16+
,count(DISTINCT git_commit.sha) AS commitCount
17+
,date(max(git_commit.date)) AS lastCommitDate
18+
,max(date(fileCreatedAtTimestamp)) AS lastCreationDate
19+
,max(date(fileLastModificationAtTimestamp)) AS lastModificationDate
20+
,duration.inDays(date(max(git_commit.date)), date()).days AS daysSinceLastCommit
21+
,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLastCreation
22+
,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLastModification
23+
,max(git_commit.sha) AS maxCommitSha
24+
ORDER BY filePath ASCENDING, commitCount DESCENDING

jupyter/GitHistoryGeneral.ipynb

+219
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,203 @@
423423
" return input_dataframe.copy().join(dataframe_split), dataframe_split.columns.tolist()"
424424
]
425425
},
426+
{
427+
"cell_type": "markdown",
428+
"id": "da109679",
429+
"metadata": {},
430+
"source": [
431+
"### File Data Preparation Functions"
432+
]
433+
},
434+
{
435+
"cell_type": "code",
436+
"execution_count": null,
437+
"id": "299b06ea",
438+
"metadata": {},
439+
"outputs": [],
440+
"source": [
441+
"def remove_last_file_path_element(file_path_elements: list) -> list:\n",
442+
" \"\"\"\n",
443+
" Removes the last element of the file path so that only the directory names remain.\n",
444+
" file_path_elements : list : The list of levels to remove\n",
445+
" return : list : The list of the directories\n",
446+
" \"\"\"\n",
447+
" return file_path_elements[:-1] if len(file_path_elements) > 1 else ['']\n",
448+
"\n",
449+
"def convert_path_elements_to_directories(file_path_elements: list) -> list:\n",
450+
" \"\"\"\n",
451+
" Converts the file path elements into directories.\n",
452+
" file_path_elements : list : The list of levels to convert\n",
453+
" return : list : The list of directories\n",
454+
" \"\"\"\n",
455+
" directories = remove_last_file_path_element(file_path_elements)\n",
456+
" return ['/'.join(directories[:i+1]) for i in range(len(directories))]\n",
457+
"\n",
458+
"def add_directory_column(input_dataframe: pd.DataFrame, file_path_column: str, directory_column: str = 'directoryPath'):\n",
459+
" \"\"\"\n",
460+
" Adds a directory column to the input DataFrame based on the file path column.\n",
461+
" input_dataframe : pd.DataFrame : The input DataFrame\n",
462+
" file_path_column : str : The name of the file path column\n",
463+
" directory_column : str : The name of the directory column to be added\n",
464+
" return : pd.DataFrame : The DataFrame with added directory column\n",
465+
" \"\"\"\n",
466+
" if directory_column in input_dataframe.columns:\n",
467+
" return input_dataframe # Column already exists\n",
468+
" \n",
469+
" input_dataframe.insert(0, directory_column, input_dataframe[file_path_column].str.split('/').apply(convert_path_elements_to_directories))\n",
470+
" input_dataframe = input_dataframe.explode(directory_column)\n",
471+
" return input_dataframe\n",
472+
"\n",
473+
"def add_directory_name_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_name_column: str = 'directoryName'):\n",
474+
" \"\"\"\n",
475+
" Adds a directory name column to the input DataFrame based on the directory column.\n",
476+
" input_dataframe : pd.DataFrame : The input DataFrame\n",
477+
" directory_column : str : The name of the directory column\n",
478+
" directory_name_column : str : The name of the directory name column to be added\n",
479+
" return : pd.DataFrame : The DataFrame with added directory name column\n",
480+
" \"\"\"\n",
481+
" if directory_name_column in input_dataframe.columns:\n",
482+
" return input_dataframe # Column already exists\n",
483+
" \n",
484+
" splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n",
485+
" input_dataframe.insert(1, directory_name_column, splitted_directories.apply(lambda x: (x[-1])))\n",
486+
" return input_dataframe\n",
487+
"\n",
488+
"def add_parent_directory_column(input_dataframe: pd.DataFrame, directory_column: str = 'directoryPath', directory_parent_column: str = 'directoryParentPath'):\n",
489+
" \"\"\"\n",
490+
" Adds a directory parent column to the input DataFrame based on the directory column.\n",
491+
" input_dataframe : pd.DataFrame : The input DataFrame\n",
492+
" directory_column : str : The name of the directory column\n",
493+
" directory_parent_column : str : The name of the directory parent column to be added\n",
494+
" return : pd.DataFrame : The DataFrame with added directory parent column\n",
495+
" \"\"\"\n",
496+
" if directory_parent_column in input_dataframe.columns:\n",
497+
" return input_dataframe # Column already exists\n",
498+
" \n",
499+
" # Remove last path element from directory_column to get the directory_parent_column\n",
500+
" splitted_directories = input_dataframe[directory_column].str.rsplit('/', n=1)\n",
501+
" input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n",
502+
" \n",
503+
" # Clear parent (set to empty string) when it is equal to the directory\n",
504+
" input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[directory_column], directory_parent_column] = ''\n",
505+
" return input_dataframe\n",
506+
"\n",
507+
"def second_entry(values: pd.Series):\n",
508+
" \"\"\"\n",
509+
" Returns the second entry of a list of values.\n",
510+
" Meant to be used as an aggregation function for dataframe grouping.\n",
511+
" values : Series : The pandas Series of values\n",
512+
" return : any : The second entry\n",
513+
" \"\"\"\n",
514+
" return values.iloc[1] if len(values) > 1 else None\n",
515+
"\n",
516+
"def get_file_count_from_aggregated_file_paths(values: pd.Series):\n",
517+
" \"\"\"\n",
518+
" Return the file count from an array of arrays of file paths.\n",
519+
" Meant to be used as an aggregation function for dataframe grouping.\n",
520+
" values : Series : The pandas Series of values\n",
521+
" return : int : The number of files\n",
522+
" \"\"\"\n",
523+
" return len(np.unique(np.concatenate(values.to_list())))"
524+
]
525+
},
526+
{
527+
"cell_type": "markdown",
528+
"id": "09aeae9b",
529+
"metadata": {},
530+
"source": [
531+
"### File Data Preparation "
532+
]
533+
},
534+
{
535+
"cell_type": "code",
536+
"execution_count": null,
537+
"id": "682d8aa9",
538+
"metadata": {},
539+
"outputs": [],
540+
"source": [
541+
"git_files_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_with_commit_statistics_by_author.cypher\")\n",
542+
"\n",
543+
"display(\"1. query result ---------------------\")\n",
544+
"display(git_files_with_commit_statistics)\n",
545+
"\n",
546+
"# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n",
547+
"git_files_with_commit_statistics = add_directory_column(git_files_with_commit_statistics, 'filePath', 'directoryPath')\n",
548+
"\n",
549+
"display(\"2. added directoryPath --------------\")\n",
550+
"display(git_files_with_commit_statistics)\n",
551+
"\n",
552+
"# Define how common non-grouped columns will be aggregated.\n",
553+
"common_named_aggregation = dict(\n",
554+
" commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
555+
" daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
556+
" daysSinceLastCreation=pd.NamedAgg(column=\"daysSinceLastCreation\", aggfunc=\"min\"),\n",
557+
" daysSinceLastModification=pd.NamedAgg(column=\"daysSinceLastModification\", aggfunc=\"min\"),\n",
558+
" lastCommitDate=pd.NamedAgg(column=\"lastCommitDate\", aggfunc=\"max\"),\n",
559+
" lastCreationDate=pd.NamedAgg(column=\"lastCreationDate\", aggfunc=\"max\"),\n",
560+
" lastModificationDate=pd.NamedAgg(column=\"lastModificationDate\", aggfunc=\"max\"),\n",
561+
" maxCommitSha=pd.NamedAgg(column=\"maxCommitSha\", aggfunc=\"max\"),\n",
562+
")\n",
563+
"\n",
564+
"# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n",
565+
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
566+
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
567+
" **common_named_aggregation\n",
568+
")\n",
569+
"\n",
570+
"# Sort the grouped and aggregated entries by the name of the directory ascending and the number of commits descending.\n",
571+
"# The author with the most commits will then be listed first for each directory.\n",
572+
"git_files_with_commit_statistics = git_files_with_commit_statistics.sort_values(by=['directoryPath', 'commitCount'], ascending=[True, False])\n",
573+
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
574+
"\n",
575+
"display(\"3. grouped by 'directoryPath' and 'author' -----\")\n",
576+
"display(git_files_with_commit_statistics)\n",
577+
"\n",
578+
"# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
579+
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
580+
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
581+
" # fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=lambda x: len(np.unique(np.concatenate(x.to_list())))),\n",
582+
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
583+
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
584+
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
585+
" **common_named_aggregation\n",
586+
")\n",
587+
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
588+
"\n",
589+
"display(\"4. grouped by 'directoryPath' ----------------------\")\n",
590+
"display(git_files_with_commit_statistics)\n",
591+
"\n",
592+
"# git_files_with_commit_statistics['fileCount'] = (git_files_with_commit_statistics['fileCount'] / git_files_with_commit_statistics['authorCount']).astype(int)\n",
593+
"\n",
594+
"# display(\"4b. fixed file count ----------------------\")\n",
595+
"# display(git_files_with_commit_statistics)\n",
596+
"\n",
597+
"# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
598+
"git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics)\n",
599+
"git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)\n",
600+
"\n",
601+
"display(\"5. added parent and name columns ------------\")\n",
602+
"display(git_files_with_commit_statistics)"
603+
]
604+
},
605+
{
606+
"cell_type": "markdown",
607+
"id": "114f8d4b",
608+
"metadata": {},
609+
"source": [
610+
"### File Data Preview"
611+
]
612+
},
613+
{
614+
"cell_type": "code",
615+
"execution_count": null,
616+
"id": "dc0c2d06",
617+
"metadata": {},
618+
"outputs": [],
619+
"source": [
620+
"git_files_with_commit_statistics.head(20)"
621+
]
622+
},
426623
{
427624
"cell_type": "markdown",
428625
"id": "2d0df211",
@@ -504,6 +701,26 @@
504701
"### Directories by file count"
505702
]
506703
},
704+
{
705+
"cell_type": "code",
706+
"execution_count": null,
707+
"id": "bc0dc138",
708+
"metadata": {},
709+
"outputs": [],
710+
"source": [
711+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
712+
" create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
713+
" values = git_files_with_commit_statistics['fileCount'],\n",
714+
" # create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
715+
" # values = git_file_directories_with_commit_statistics['fileCount'],\n",
716+
"))\n",
717+
"figure.update_layout(\n",
718+
" **plotly_treemap_layout_base_settings,\n",
719+
" title='Directories and their file count'\n",
720+
")\n",
721+
"figure.show()"
722+
]
723+
},
507724
{
508725
"cell_type": "code",
509726
"execution_count": null,
@@ -512,6 +729,8 @@
512729
"outputs": [],
513730
"source": [
514731
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
732+
" # create_treemap_commit_statistics_settings(git_files_with_commit_statistics),\n",
733+
" # values = git_files_with_commit_statistics['fileCount'],\n",
515734
" create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
516735
" values = git_file_directories_with_commit_statistics['fileCount'],\n",
517736
"))\n",

0 commit comments

Comments
 (0)