Commit b109746

Add hyperparameter tuning to node embeddings

Parent: 839aa5b

1 file changed: jupyter/NodeEmbeddingsJava.ipynb (+227 -7 lines)
@@ -62,6 +62,17 @@
     "from neo4j import GraphDatabase"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29b00ea6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Main Colormap\n",
+    "main_color_map = 'nipy_spectral'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -171,7 +182,7 @@
     "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
     "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
     "\n",
-    "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n",
+    "def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n",
     "    \"\"\"\n",
     "    Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
     "    runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
@@ -200,8 +211,11 @@
     "        empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
     "        return empty_result\n",
     "\n",
-    "    existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
-    "    embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
+    "    if ignore_existing:\n",
+    "        embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n",
+    "    else: \n",
+    "        existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+    "        embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
     "    display(embeddings.head()) # Display the first entries of the table\n",
     "    return embeddings"
    ]
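
For context on this change: with the new `ignore_existing` flag defaulting to `True`, `create_node_embeddings` now always recomputes the embeddings instead of first reusing previously calculated ones, which matters when the same pipeline is re-run with different hyperparameters. A minimal sketch of that dispatch follows; the two `query_*` helpers are defined elsewhere in the notebook, so the stubs below are purely hypothetical stand-ins:

```python
import pandas as pd

# Hypothetical stand-ins for the notebook's query helpers (not part of this commit):
def query_cypher_to_data_frame(file_name: str, parameters_: dict) -> pd.DataFrame:
    raise NotImplementedError  # runs the Cypher query in file_name, returns a DataFrame

def query_first_non_empty_cypher_to_data_frame(*file_names: str, parameters: dict) -> pd.DataFrame:
    raise NotImplementedError  # returns the result of the first query that yields rows

def load_embeddings(cypher_file_name: str, parameters: dict,
                    ignore_existing: bool = True) -> pd.DataFrame:
    if ignore_existing:
        # Recompute unconditionally, e.g. while tuning hyperparameters.
        return query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)
    # Otherwise prefer embeddings that were already calculated and written back.
    existing_query = "../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher"
    return query_first_non_empty_cypher_to_data_frame(existing_query, cypher_file_name,
                                                      parameters=parameters)
```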
@@ -256,6 +270,7 @@
     "        \"codeUnit\": embeddings.codeUnitName,\n",
     "        \"artifact\": embeddings.projectName,\n",
     "        \"communityId\": embeddings.communityId,\n",
+    "        \"clusteringTunedHDBSCANLabel\": embeddings.clusteringTunedHDBSCANLabel,\n",
     "        \"centrality\": embeddings.centrality,\n",
     "        \"x\": [value[0] for value in two_dimension_node_embeddings],\n",
     "        \"y\": [value[1] for value in two_dimension_node_embeddings]\n",
@@ -273,15 +288,16 @@
    "outputs": [],
    "source": [
     "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
-    "    if embeddings.empty:\n",
+    "    if node_embeddings_for_visualization.empty:\n",
     "        print(\"No projected data to plot available\")\n",
     "        return\n",
     "\n",
     "    plot.scatter(\n",
     "        x=node_embeddings_for_visualization.x,\n",
     "        y=node_embeddings_for_visualization.y,\n",
     "        s=node_embeddings_for_visualization.centrality * 300,\n",
-    "        c=node_embeddings_for_visualization.communityId,\n",
+    "        # c=node_embeddings_for_visualization.communityId,\n",
+    "        c=node_embeddings_for_visualization.clusteringTunedHDBSCANLabel,\n",
     "        cmap=main_color_map,\n",
     "    )\n",
     "    plot.title(title)\n",
@@ -363,7 +379,208 @@
     "    \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
     "    \"dependencies_projection_embedding_dimension\":\"32\"\n",
     "}\n",
-    "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
+    "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84642495",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy.typing as numpy_typing\n",
+    "\n",
+    "class TunedClusteringResult:\n",
+    "    def __init__(self, labels : list, probabilities : list):\n",
+    "        self.labels = labels\n",
+    "        self.probabilities = probabilities\n",
+    "        self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n",
+    "        self.noise_count = np.sum(labels == -1)\n",
+    "        self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0\n",
+    "    def __repr__(self):\n",
+    "        return f\"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )\"\n",
+    "\n",
+    "def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n",
+    "    \"\"\"\n",
+    "    Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+    "    The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+    "    The result is a list of cluster ids for each node embedding.\n",
+    "    \"\"\"\n",
+    "    from sklearn.model_selection import GridSearchCV\n",
+    "    from sklearn.cluster import HDBSCAN\n",
+    "    from sklearn.metrics import adjusted_rand_score\n",
+    "    import numpy as np\n",
+    "\n",
+    "    # specify parameters and distributions to sample from\n",
+    "    hyper_parameter_distributions = {\n",
+    "        \"min_samples\": [2, 3, 4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+    "        \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+    "        \"cluster_selection_method\": [\"eom\", \"leaf\"],\n",
+    "        \"metric\": [\"euclidean\", \"manhattan\"],\n",
+    "    }\n",
+    "    \n",
+    "    def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n",
+    "        \"\"\"\n",
+    "        Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n",
+    "        Input:\n",
+    "        - community_references: The true labels of the communities for the data points.\n",
+    "        Output:\n",
+    "        - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n",
+    "        \"\"\"\n",
+    "        def ari_score_with_penalty(estimator, embeddings):\n",
+    "            clustering_result = estimator.fit_predict(embeddings)\n",
+    "            \n",
+    "            if np.unique(clustering_result[clustering_result != -1]).size < 2:\n",
+    "                return -1 # Return worst score if only one cluster is found or all points are noise\n",
+    "            \n",
+    "            # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n",
+    "            noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n",
+    "\n",
+    "            if noise_ratio > 0.50:\n",
+    "                return -1 # Return worst score if more than 50% percent of the points are unlabeled noise\n",
+    "\n",
+    "            ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n",
+    "\n",
+    "            # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n",
+    "            penalty = 1.0 - noise_ratio \n",
+    "            \n",
+    "            return ari * penalty\n",
+    "        return ari_score_with_penalty\n",
+    "\n",
+    "\n",
+    "    # Use custom CV that feeds all data to each fold (no slicing)\n",
+    "    all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n",
+    "\n",
+    "    tuned_hdbscan = GridSearchCV(\n",
+    "        estimator=HDBSCAN(),\n",
+    "        refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n",
+    "        param_grid=hyper_parameter_distributions,\n",
+    "        n_jobs=4,\n",
+    "        scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n",
+    "        cv=all_data_without_slicing_cross_validator,\n",
+    "        verbose=1\n",
+    "    )\n",
+    "\n",
+    "    tuned_hdbscan.fit(embeddings)\n",
+    "\n",
+    "    #print(\"Best adjusted rand score with noise penalty:\", tuned_hdbscan.best_score_)\n",
+    "    print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n",
+    "\n",
+    "    # Run the clustering again with the best parameters\n",
+    "    cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, allow_single_cluster=False)\n",
+    "    best_model = cluster_algorithm.fit(embeddings)\n",
+    "\n",
+    "    results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n",
+    "    print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e1f0227",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy.typing as numpy_typing\n",
+    "\n",
+    "class CommunityComparingScores:\n",
+    "    def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n",
+    "        self.adjusted_rand_index = adjusted_rand_index\n",
+    "        self.normalized_mutual_information = normalized_mutual_information\n",
+    "        self.scores = {\n",
+    "            \"Adjusted Rand Index\": adjusted_rand_index,\n",
+    "            \"Normalized Mutual Information\": normalized_mutual_information\n",
+    "        }\n",
+    "    def __repr__(self):\n",
+    "        return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n",
+    "\n",
+    "def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n",
+    "    \"\"\"\n",
+    "    Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n",
+    "    The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n",
+    "    \"\"\"\n",
+    "    from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
+    "\n",
+    "    # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n",
+    "    mask = cluster_labels != -1\n",
+    "    ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n",
+    "    nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n",
+    "\n",
+    "    return CommunityComparingScores(ari, nmi)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c4e8821",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Adds the clustering results to the embeddings DataFrame.\n",
+    "    \"\"\"\n",
+    "    embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n",
+    "    embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n",
+    "    return embeddings\n",
+    "\n",
+    "def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Returns the clustering results distribution for the given clustering name.\n",
+    "    \"\"\"\n",
+    "    return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n",
+    "        probability=('clustering' + clustering_name + 'Probability', 'mean'),\n",
+    "        count=('codeUnitName', 'count'),\n",
+    "        communityIds=('communityId', lambda x: list(set(x))),\n",
+    "        codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n",
+    "    ).reset_index().sort_values(by='count', ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c27ec0ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+    "    The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+    "    The result is the input DataFrame with the clustering results added.\n",
+    "    \"\"\"\n",
+    "    # Apply the tuned HDBSCAN clustering algorithm\n",
+    "    embeddings_values = np.array(embeddings.embedding.tolist())\n",
+    "    community_reference_ids = np.array(embeddings.communityId.tolist())\n",
+    "    \n",
+    "    clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n",
+    "    print(clustering_result)\n",
+    "    \n",
+    "    community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n",
+    "    print(community_comparing_scores)\n",
+    "\n",
+    "    # Add the clustering results to the embeddings DataFrame\n",
+    "    embeddings = add_clustering_results_to_embeddings(embeddings, clustering_result, \"TunedHDBSCAN\")\n",
+    "    \n",
+    "    # Get the clustering results distribution\n",
+    "    clustering_results_distribution = get_clustering_results_distribution(embeddings, \"TunedHDBSCAN\")\n",
+    "    \n",
+    "    # Display the clustering results distribution\n",
+    "    display(clustering_results_distribution)\n",
+    "    \n",
+    "    return embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b42ed2a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)"
    ]
   },
   {
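
The heart of the new tuning cell is running scikit-learn's GridSearchCV over a clusterer, which has no train/test split in the usual sense. Two tricks make this work: a single "fold" that contains all indices as both train and test set, so every parameter combination is scored on the whole dataset, and a custom `scorer(estimator, X)` callable that re-clusters and compares the labels to a reference partition via the Adjusted Rand Index. Below is a self-contained sketch of the same pattern on synthetic data; `make_blobs` and all variable names are illustrative, not from the notebook, and it assumes scikit-learn >= 1.3, which ships `sklearn.cluster.HDBSCAN`:

```python
import numpy as np
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV

# Synthetic stand-in for node embeddings plus a reference partition.
features, reference_labels = make_blobs(n_samples=300, centers=4, random_state=42)

def ari_scorer(estimator, data):
    # With no y passed to fit(), GridSearchCV calls the scorer as scorer(estimator, X).
    # reference_labels aligns with data because the single fold spans the full dataset.
    labels = estimator.fit_predict(data)
    non_noise = labels != -1
    if np.unique(labels[non_noise]).size < 2:
        return -1.0  # degenerate result: everything is noise or a single cluster
    noise_ratio = np.mean(labels == -1)
    # Score the non-noise points against the reference, damped by the noise share.
    ari = adjusted_rand_score(reference_labels[non_noise], labels[non_noise])
    return ari * (1.0 - noise_ratio)

# One "fold" whose train and test sets are both the full index range.
all_indices = np.arange(len(features))
single_full_fold = [(all_indices, all_indices)]

search = GridSearchCV(
    estimator=HDBSCAN(),
    param_grid={"min_cluster_size": [5, 10, 20], "cluster_selection_method": ["eom", "leaf"]},
    scoring=ari_scorer,
    cv=single_full_fold,
    refit=False,  # HDBSCAN has no predict(), so skip refitting the best estimator
)
search.fit(features)
print(search.best_params_, search.best_score_)
```

Because `refit=False`, the search only keeps `best_params_`; that is why the notebook cell refits HDBSCAN once more with those parameters to obtain the final labels and probabilities.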
@@ -429,9 +646,11 @@
     "    \"dependencies_projection_node\": \"Package\",\n",
     "    \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
     "    \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
-    "    \"dependencies_projection_embedding_dimension\":\"64\"\n",
+    "    \"dependencies_projection_embedding_dimension\":\"128\"\n",
     "}\n",
     "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
+    "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
+    "\n",
     "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
     "plot_2d_node_embeddings(\n",
     "    node_embeddings_for_visualization, \n",
@@ -462,6 +681,7 @@
     "    \"dependencies_projection_embedding_dimension\":\"32\"\n",
     "}\n",
     "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
+    "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
     "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
     "plot_2d_node_embeddings(\n",
     "    node_embeddings_for_visualization, \n",

0 commit comments

Comments
 (0)