Skip to content

Commit a1d953a

Browse files
committed
fixup! Add git history file overview treemap
1 parent 394237c commit a1d953a

File tree

2 files changed

+181
-106
lines changed

2 files changed

+181
-106
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,51 @@
1-
// List git file directories and the number of files they contain
1+
// List git file directories and their statistics
22

33
MATCH (git_file:File&Git&!Repository)
4+
WHERE git_file.deletedAt IS NULL // filter out deleted files
5+
ORDER BY git_file.relativePath
46
WITH percentileDisc(git_file.createdAtEpoch, 0.5) AS medianCreatedAtEpoch
57
,percentileDisc(git_file.lastModificationAtEpoch, 0.5) AS medianLastModificationAtEpoch
6-
,collect(git_file) AS git_files
8+
,collect(git_file) AS git_files
79
UNWIND git_files AS git_file
810
WITH *
9-
,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS createdAtTimestamp
10-
,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS lastModificationAtTimestamp
11-
WHERE git_file.deletedAt IS NULL
12-
MATCH (git_file:File&Git&!Repository)
11+
,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS fileCreatedAtTimestamp
12+
,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS fileLastModificationAtTimestamp
1313
WITH *, split(git_file.relativePath, '/') AS pathElements
1414
WITH *, pathElements[-1] AS fileName
15-
WITH *
16-
,datetime.fromepochMillis(coalesce(git_file.createdAtEpoch, medianCreatedAtEpoch)) AS createdAtTimestamp
17-
,datetime.fromepochMillis(coalesce(git_file.lastModificationAtEpoch, medianLastModificationAtEpoch)) AS lastModificationAtTimestamp
18-
OPTIONAL MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
19-
OPTIONAL MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file)
20-
WITH *, sign(COUNT { (git_commit)-[:HAS_PARENT]-(:Commit) }) AS parentCommit
21-
ORDER BY git_commit.date DESC
15+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
16+
MATCH (git_commit:Git&Commit)-[:CONTAINS_CHANGE]->(git_change:Git&Change)-[]->(git_file)
17+
WITH pathElements
18+
,fileCreatedAtTimestamp
19+
,fileLastModificationAtTimestamp
20+
,fileName
21+
,git_file.relativePath AS fileRelativePath
22+
,max(git_repository.name) AS repository
23+
,COUNT(DISTINCT git_commit.sha) AS commitCount
24+
,COUNT(DISTINCT git_commit.author) AS authorCount
25+
,date(max(git_commit.date)) AS lastCommitDate
2226
UNWIND pathElements AS pathElement
2327
WITH *
24-
,coalesce(nullif(split(git_file.relativePath, '/' + pathElement)[0], git_file.relativePath), '') AS parent
28+
,coalesce(nullif(split(fileRelativePath, '/' + pathElement)[0], fileRelativePath), '') AS parent
2529
WITH *
2630
,coalesce(nullif(parent,'') + '/', '') + pathElement AS directory
27-
,fileName
2831
WHERE pathElement <> fileName
29-
WITH git_repository.name AS gitRepositoryName
30-
,directory AS gitDirectoryPath
31-
,parent AS directoryParentPath
32-
,split(parent, '/')[-1] AS directoryParentPathName
33-
,parent AS directoryParentName // TODO was directoryParentPathName
34-
,split(directory, '/')[-1] AS directoryPathName
35-
,directory AS directoryName // TODO was directoryPathName
36-
,size(split(directory, '/')) AS directoryPathLength
37-
,count(DISTINCT git_file.relativePath) AS fileCount
38-
,count(DISTINCT git_commit.sha) AS commitCount
39-
,sum(parentCommit) AS mergeCommitCount
40-
,count(DISTINCT git_commit.author) AS authorCount
41-
,date(max(git_commit.date)) AS latestCommitDate
42-
,max(date(createdAtTimestamp) ) AS latestCreationDate
43-
,max(date(lastModificationAtTimestamp)) AS latestModificationDate
44-
,duration.inDays(date(max(git_commit.date)), date()).days AS daysSinceLatestCommit
45-
,duration.inDays(max(createdAtTimestamp), datetime()).days AS daysSinceLatestCreation
46-
,duration.inDays(max(lastModificationAtTimestamp), datetime()).days AS daysSinceLatestModification
47-
,collect(DISTINCT git_file.relativePath)[0..9] AS relativePathExamples
48-
,collect(DISTINCT fileName)[0..9] AS fileNameExamples
49-
WHERE fileCount > 1 // Filter out single files and directories with only one file
50-
RETURN gitRepositoryName
51-
,gitDirectoryPath
52-
,directoryParentName
53-
,directoryParentPathName // TODO rename or delete?
54-
,directoryName
55-
,directoryPathName // TODO rename or delete?
56-
,directoryPathLength
57-
,fileCount
58-
,commitCount
59-
,mergeCommitCount
60-
,authorCount
61-
,latestCommitDate
62-
,latestCreationDate
63-
,latestModificationDate
64-
,daysSinceLatestCommit
65-
,daysSinceLatestCreation
66-
,daysSinceLatestModification
32+
RETURN repository AS gitRepositoryName
33+
,directory AS gitDirectoryPath
34+
,parent AS directoryParentPath
35+
,split(parent, '/')[-1] AS directoryParentPathName
36+
,parent AS directoryParentName // TODO was directoryParentPathName
37+
,split(directory, '/')[-1] AS directoryPathName
38+
,directory AS directoryName // TODO was directoryPathName
39+
,size(split(directory, '/')) AS directoryPathLength
40+
,count(DISTINCT fileRelativePath) AS fileCount
41+
,max(date(fileCreatedAtTimestamp) ) AS latestCreationDate
42+
,max(date(fileLastModificationAtTimestamp)) AS latestModificationDate
43+
,sum(commitCount) AS commitCount
44+
,sum(authorCount) AS authorCount
45+
,max(lastCommitDate) AS latestCommitDate
46+
,duration.inDays(max(lastCommitDate), date()).days AS daysSinceLatestCommit
47+
,duration.inDays(max(fileCreatedAtTimestamp), datetime()).days AS daysSinceLatestCreation
48+
,duration.inDays(max(fileLastModificationAtTimestamp), datetime()).days AS daysSinceLatestModification
49+
// Debugging
50+
//,collect(DISTINCT fileRelativePath)[0..4] AS relativePathExamples
51+
//,collect(DISTINCT fileName)[0..4] AS fileNameExamples

jupyter/GitHistoryGeneral.ipynb

+143-53
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
"source": [
2525
"import os\n",
2626
"import pandas as pd\n",
27+
"pd.options.mode.copy_on_write = True\n",
28+
"\n",
2729
"from neo4j import GraphDatabase"
2830
]
2931
},
@@ -37,7 +39,7 @@
3739
"import matplotlib.pyplot as plot\n",
3840
"import numpy as np\n",
3941
"import plotly.express as plotly_express\n",
40-
"import plotly.io as plotly_io"
42+
"from plotly import graph_objects as plotly_graph_objects"
4143
]
4244
},
4345
{
@@ -185,26 +187,11 @@
185187
]
186188
},
187189
{
188-
"cell_type": "code",
189-
"execution_count": null,
190-
"id": "83077395",
190+
"cell_type": "markdown",
191+
"id": "01da524e",
191192
"metadata": {},
192-
"outputs": [],
193193
"source": [
194-
"def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n",
195-
" \"\"\"\n",
196-
" Limits the values of the given column in the input data frame to the given quantile.\n",
197-
" The values are not filtered out but set to the limited (integer quantile value).\n",
198-
" input_data_frame : pd.DataFrame : The input data frame\n",
199-
" column_name : str : The name of the column to limit\n",
200-
" quantile : float : The quantile to limit the values to (default: 0.95)\n",
201-
" return : pd.DataFrame : The modified dataframe with the added column (column_name + '_limited')\n",
202-
" \"\"\"\n",
203-
" data_frame=input_data_frame.copy()\n",
204-
" column_values = data_frame[column_name]\n",
205-
" column_limit = column_values.quantile(quantile)\n",
206-
" data_frame[column_name + '_limited'] = np.where(column_values > column_limit, column_limit, column_values)\n",
207-
" return data_frame"
194+
"### Treemap Layout Functions and Constants"
208195
]
209196
},
210197
{
@@ -218,7 +205,8 @@
218205
"\n",
219206
"plotly_treemap_base_settings = dict(\n",
220207
" color_continuous_scale='Hot_r', # Hot_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n",
221-
" path=[plotly_express.Constant(\"root\"), 'gitRepositoryName', 'directoryParentName', 'directoryName'],\n",
208+
" # path=[plotly_express.Constant(\"root\"), 'gitRepositoryName', 'directoryParentName', 'directoryName'],\n",
209+
" path=['gitRepositoryName', 'directoryParentName', 'directoryName'],\n",
222210
" maxdepth=-1\n",
223211
")\n",
224212
"plotly_treemap_traces_base_settings = dict(\n",
@@ -231,7 +219,7 @@
231219
")\n",
232220
"plotly_treemap_figure_base_settings = dict(\n",
233221
" renderer=\"svg\" if is_command_line_execution() else None,\n",
234-
" width=1000,\n",
222+
" width=2000, #1000\n",
235223
" height=550\n",
236224
")"
237225
]
@@ -262,47 +250,110 @@
262250
]
263251
},
264252
{
265-
"cell_type": "markdown",
266-
"id": "acacc415",
253+
"cell_type": "code",
254+
"execution_count": null,
255+
"id": "b8cc624a",
267256
"metadata": {},
257+
"outputs": [],
268258
"source": [
269-
"### Data Preview"
259+
"def create_treemap_commit_statistics_settings(data_frame: pd.DataFrame):\n",
260+
" \"\"\"\n",
261+
" Creates a Plotly Treemap with the given settings and data frame.\n",
262+
" data_frame : pd.DataFrame : The input data frame\n",
263+
" return : plotly_graph_objects.Treemap : The prepared Plotly Treemap\n",
264+
" \"\"\"\n",
265+
" return plotly_graph_objects.Treemap(\n",
266+
" labels=treemap_data['directoryPathName'],\n",
267+
" parents=treemap_data['directoryParentPath'],\n",
268+
" ids=treemap_data['gitDirectoryPath'],\n",
269+
" customdata=treemap_data[['fileCount', 'commitCount', 'authorCount', 'latestCommitDate', 'daysSinceLatestCommit', 'latestCreationDate', 'daysSinceLatestCreation', 'latestModificationDate', 'daysSinceLatestModification', 'gitDirectoryPath']],\n",
270+
" hovertemplate='<b>%{label}</b><br>Files: %{customdata[0]}<br>Commits: %{customdata[1]}<br>Authors: %{customdata[2]}<br>Latest Commit: %{customdata[3]} (%{customdata[4]} days ago)<br>Last Created: %{customdata[5]} (%{customdata[6]} days ago)<br>Last Modified: %{customdata[7]} (%{customdata[8]} days ago)<br>Path: %{customdata[9]}',\n",
271+
" maxdepth=-1,\n",
272+
" root_color=\"lightgrey\",\n",
273+
" marker=dict(cornerradius=5),\n",
274+
" )"
270275
]
271276
},
272277
{
273-
"cell_type": "code",
274-
"execution_count": null,
275-
"id": "9c9de7c5",
278+
"cell_type": "markdown",
279+
"id": "acacc415",
276280
"metadata": {},
277-
"outputs": [],
278281
"source": [
279-
"git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\n",
280-
" \"../cypher/GitLog/List_git_files_directories_with_commit_statistics.cypher\", limit=70).sort_values(by=\"fileCount\", ascending=False)\n",
281-
"git_file_directories_with_commit_statistics.sort_values(by=\"commitCount\", ascending=False).head(20)"
282+
"### Data Preparation Functions"
282283
]
283284
},
284285
{
285-
"cell_type": "markdown",
286-
"id": "80338c9c",
286+
"cell_type": "code",
287+
"execution_count": null,
288+
"id": "83077395",
287289
"metadata": {},
290+
"outputs": [],
288291
"source": [
289-
"### Null Checks"
292+
"def add_quantile_limited_column(input_data_frame : pd.DataFrame, column_name : str, quantile : float = 0.95) -> pd.DataFrame:\n",
293+
" \"\"\"\n",
294+
" Limits the values of the given column in the input data frame to the given quantile.\n",
295+
" Values above the quantile are not filtered out but capped at the quantile value.\n",
296+
" input_data_frame : pd.DataFrame : The input data frame\n",
297+
" column_name : str : The name of the column to limit\n",
298+
" quantile : float : The quantile to limit the values to (default: 0.95)\n",
299+
" return : pd.DataFrame : The modified dataframe with the added column (column_name + '_limited')\n",
300+
" \"\"\"\n",
301+
" data_frame=input_data_frame.copy()\n",
302+
" column_values = data_frame[column_name]\n",
303+
" column_limit = column_values.quantile(quantile)\n",
304+
" data_frame[column_name + '_limited'] = np.where(column_values > column_limit, column_limit, column_values)\n",
305+
" return data_frame"
290306
]
291307
},
292308
{
293309
"cell_type": "code",
294310
"execution_count": null,
295-
"id": "6f95993e",
311+
"id": "009a7222",
296312
"metadata": {},
297313
"outputs": [],
298314
"source": [
299-
"# Null values in the DataFrame\n",
300-
"git_file_directories_with_commit_statistics.isnull().sum() "
315+
"def filter_out_non_existing_parent_ids(data_frame: pd.DataFrame, parent_column: str, id_column: str):\n",
316+
" \"\"\"\n",
317+
" Filters out all rows whose parent ID has no matching entry in the ID column (an empty parent ID is always kept).\n",
318+
" data_frame : pd.DataFrame : The input data frame\n",
319+
" parent_column : str : The name of the parent column\n",
320+
" id_column : str : The name of the ID column\n",
321+
" return : pd.DataFrame : The filtered data frame\n",
322+
" \"\"\"\n",
323+
" list_of_ids = data_frame[id_column].tolist() + ['']\n",
324+
" # For Debugging\n",
325+
" problems = data_frame[~data_frame[parent_column].isin(list_of_ids)]\n",
326+
" display(\"Filtered out rows with non-existing parent IDs:\")\n",
327+
" display(problems)\n",
328+
" return data_frame[data_frame[parent_column].isin(list_of_ids)]\n",
329+
"\n",
330+
"def replace_empty_parent_by_repository_name(data_frame: pd.DataFrame, column_name: str, repository_column_name: str = ''):\n",
331+
" \"\"\"\n",
332+
" Replaces empty values ('') in the given column by the repository name of the same row.\n",
333+
" data_frame : pd.DataFrame : The input data frame\n",
334+
" column_name : str : The name of the column\n",
335+
" repository_column_name : str : The name of the column that contains the value to be used instead of an empty parent\n",
336+
" return : pd.DataFrame : The modified data frame\n",
337+
" \"\"\"\n",
338+
" repository_names = data_frame[repository_column_name]\n",
339+
" data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n",
340+
"\n",
341+
" return data_frame\n",
342+
"\n",
343+
"def prepare_treemap_commit_statistics_data(data_frame: pd.DataFrame) -> pd.DataFrame:\n",
344+
" \"\"\"\n",
345+
" data_frame : pd.DataFrame : The input data frame\n",
346+
" return : pd.DataFrame : The data frame prepared for treemap visualization\n",
347+
" \"\"\"\n",
348+
" prepared_data = data_frame\n",
349+
" prepared_data = filter_out_non_existing_parent_ids(prepared_data, 'directoryParentPath', 'gitDirectoryPath')\n",
350+
" prepared_data = replace_empty_parent_by_repository_name(prepared_data, 'directoryParentPath', 'gitRepositoryName')\n",
351+
" return prepared_data"
301352
]
302353
},
303354
{
304355
"cell_type": "markdown",
305-
"id": "f02b5d42",
356+
"id": "0b717f80",
306357
"metadata": {},
307358
"source": [
308359
"### Function to split file path levels"
@@ -311,10 +362,12 @@
311362
{
312363
"cell_type": "code",
313364
"execution_count": null,
314-
"id": "906f2ab6",
365+
"id": "6581ec23",
315366
"metadata": {},
316367
"outputs": [],
317368
"source": [
369+
"# TODO Still needed?\n",
370+
"\n",
318371
"def fill_array_to_length(length: int, fill_value=''):\n",
319372
" \"\"\"\n",
320373
" Fills the input array with the given fill value to the given length.\n",
@@ -351,33 +404,70 @@
351404
},
352405
{
353406
"cell_type": "markdown",
354-
"id": "ece07655",
407+
"id": "2d0df211",
355408
"metadata": {},
356409
"source": [
357-
"### Directories by file count"
410+
"### Data Preview"
358411
]
359412
},
360413
{
361414
"cell_type": "code",
362415
"execution_count": null,
363-
"id": "8a3725c0",
416+
"id": "9c9de7c5",
364417
"metadata": {},
365418
"outputs": [],
366419
"source": [
367-
"git_file_directories_with_commit_statistics_with_levels, level_column_names = add_file_path_levels(git_file_directories_with_commit_statistics, 'gitDirectoryPath', delimiter='/')\n",
368-
"#display(git_file_directories_with_commit_statistics_with_levels.head(13))\n",
420+
"git_file_directories_with_commit_statistics = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_directories_with_commit_statistics_new.cypher\")\n",
421+
"git_file_directories_with_commit_statistics = prepare_treemap_commit_statistics_data(git_file_directories_with_commit_statistics)\n",
369422
"\n",
370-
"figure = plotly_express.treemap(\n",
371-
" git_file_directories_with_commit_statistics_with_levels, \n",
372-
" # path=['gitRepositoryName', 'directoryParentName', 'directoryName'], \n",
373-
" path=['gitRepositoryName'] + level_column_names,\n",
374-
" hover_data=['gitDirectoryPath', 'commitCount', 'authorCount', 'latestCommitDate', 'daysSinceLatestCommit', 'latestCreationDate', 'daysSinceLatestCreation', 'latestModificationDate', 'daysSinceLatestModification'],\n",
375-
" values='fileCount', \n",
423+
"# Show a preview of the 10 directories with the highest file count\n",
424+
"git_file_directories_with_commit_statistics.sort_values(by=\"fileCount\", ascending=False).head(10)"
425+
]
426+
},
427+
{
428+
"cell_type": "markdown",
429+
"id": "80338c9c",
430+
"metadata": {},
431+
"source": [
432+
"### Null Checks"
433+
]
434+
},
435+
{
436+
"cell_type": "code",
437+
"execution_count": null,
438+
"id": "6f95993e",
439+
"metadata": {},
440+
"outputs": [],
441+
"source": [
442+
"# Null values in the DataFrame\n",
443+
"git_file_directories_with_commit_statistics.isnull().sum() "
444+
]
445+
},
446+
{
447+
"cell_type": "markdown",
448+
"id": "ccc11f52",
449+
"metadata": {},
450+
"source": [
451+
"### Directories by file count"
452+
]
453+
},
454+
{
455+
"cell_type": "code",
456+
"execution_count": null,
457+
"id": "19d108bb",
458+
"metadata": {},
459+
"outputs": [],
460+
"source": [
461+
"figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
462+
" create_treemap_commit_statistics_settings(git_file_directories_with_commit_statistics),\n",
463+
" values = git_file_directories_with_commit_statistics['fileCount'],\n",
464+
" #marker=dict(cornerradius=5, colors=git_file_directories_with_commit_statistics['daysSinceLatestCommit'], colorscale='Hot_r'),\n",
465+
"))\n",
466+
"figure.update_layout(\n",
467+
" **plotly_treemap_layout_base_settings,\n",
376468
" title='Directories and their file count'\n",
377469
")\n",
378-
"figure.update_traces(**plotly_treemap_traces_base_settings)\n",
379-
"figure.update_layout(**plotly_treemap_layout_base_settings)\n",
380-
"figure.show(**plotly_treemap_figure_base_settings)"
470+
"figure.show()"
381471
]
382472
},
383473
{

0 commit comments

Comments
 (0)