|
62 | 62 | "from neo4j import GraphDatabase"
|
63 | 63 | ]
|
64 | 64 | },
|
| 65 | + { |
| 66 | + "cell_type": "code", |
| 67 | + "execution_count": null, |
| 68 | + "id": "29b00ea6", |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [], |
| 71 | + "source": [ |
| 72 | + "# Main Colormap\n", |
| 73 | + "main_color_map = 'nipy_spectral'" |
| 74 | + ] |
| 75 | + }, |
65 | 76 | {
|
66 | 77 | "cell_type": "code",
|
67 | 78 | "execution_count": null,
|
|
171 | 182 | "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
|
172 | 183 | "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
|
173 | 184 | "\n",
|
174 | | - "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n", |
| 185 | + "def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n", |
175 | 186 | " \"\"\"\n",
|
176 | 187 | " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
|
177 | 188 | " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
|
|
200 | 211 | " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
|
201 | 212 | " return empty_result\n",
|
202 | 213 | "\n",
|
203 | | - " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
204 | | - " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", |
| 214 | + " if ignore_existing:\n", |
| 215 | + " embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n", |
| 216 | + " else: \n", |
| 217 | + " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
| 218 | + " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", |
205 | 219 | " display(embeddings.head()) # Display the first entries of the table\n",
|
206 | 220 | " return embeddings"
|
207 | 221 | ]
|
|
256 | 270 | " \"codeUnit\": embeddings.codeUnitName,\n",
|
257 | 271 | " \"artifact\": embeddings.projectName,\n",
|
258 | 272 | " \"communityId\": embeddings.communityId,\n",
|
| 273 | + " \"clusteringTunedHDBSCANLabel\": embeddings.clusteringTunedHDBSCANLabel,\n", |
259 | 274 | " \"centrality\": embeddings.centrality,\n",
|
260 | 275 | " \"x\": [value[0] for value in two_dimension_node_embeddings],\n",
|
261 | 276 | " \"y\": [value[1] for value in two_dimension_node_embeddings]\n",
|
|
273 | 288 | "outputs": [],
|
274 | 289 | "source": [
|
275 | 290 | "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
|
276 | | - " if embeddings.empty:\n", |
| 291 | + " if node_embeddings_for_visualization.empty:\n", |
277 | 292 | " print(\"No projected data to plot available\")\n",
|
278 | 293 | " return\n",
|
279 | 294 | "\n",
|
280 | 295 | " plot.scatter(\n",
|
281 | 296 | " x=node_embeddings_for_visualization.x,\n",
|
282 | 297 | " y=node_embeddings_for_visualization.y,\n",
|
283 | 298 | " s=node_embeddings_for_visualization.centrality * 300,\n",
|
284 | | - " c=node_embeddings_for_visualization.communityId,\n", |
| 299 | + " # c=node_embeddings_for_visualization.communityId,\n", |
| 300 | + " c=node_embeddings_for_visualization.clusteringTunedHDBSCANLabel,\n", |
285 | 301 | " cmap=main_color_map,\n",
|
286 | 302 | " )\n",
|
287 | 303 | " plot.title(title)\n",
|
|
363 | 379 | " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
|
364 | 380 | " \"dependencies_projection_embedding_dimension\":\"32\"\n",
|
365 | 381 | "}\n",
|
366 | | - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n" |
| 382 | + "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)" |
| 383 | + ] |
| 384 | + }, |
| 385 | + { |
| 386 | + "cell_type": "code", |
| 387 | + "execution_count": null, |
| 388 | + "id": "84642495", |
| 389 | + "metadata": {}, |
| 390 | + "outputs": [], |
| 391 | + "source": [ |
| 392 | + "import numpy.typing as numpy_typing\n", |
| 393 | + "\n", |
| 394 | + "class TunedClusteringResult:\n", |
| 395 | + " def __init__(self, labels : list, probabilities : list):\n", |
| 396 | + " self.labels = labels\n", |
| 397 | + " self.probabilities = probabilities\n", |
| 398 | + " self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n", |
| 399 | + " self.noise_count = np.sum(labels == -1)\n", |
| 400 | + " self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0\n", |
| 401 | + " def __repr__(self):\n", |
| 402 | + " return f\"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )\"\n", |
| 403 | + "\n", |
| 404 | + "def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n", |
| 405 | + " \"\"\"\n", |
| 406 | + " Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", |
| 407 | + " The parameters are tuned to get results similar to the ones of the community detection algorithm.\n", |
| 408 | + " The result is a list of cluster ids for each node embedding.\n", |
| 409 | + " \"\"\"\n", |
| 410 | + " from sklearn.model_selection import GridSearchCV\n", |
| 411 | + " from sklearn.cluster import HDBSCAN\n", |
| 412 | + " from sklearn.metrics import adjusted_rand_score\n", |
| 413 | + " import numpy as np\n", |
| 414 | + "\n", |
| 415 | + " # specify parameters and distributions to sample from\n", |
| 416 | + " hyper_parameter_distributions = {\n", |
| 417 | + " \"min_samples\": [2, 3, 4, 5, 6, 7, 10, 20, 30, 50, 100],\n", |
| 418 | + " \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n", |
| 419 | + " \"cluster_selection_method\": [\"eom\", \"leaf\"],\n", |
| 420 | + " \"metric\": [\"euclidean\", \"manhattan\"],\n", |
| 421 | + " }\n", |
| 422 | + " \n", |
| 423 | + " def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n", |
| 424 | + " \"\"\"\n", |
| 425 | + " Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n", |
| 426 | + " Input:\n", |
| 427 | + " - community_references: The true labels of the communities for the data points.\n", |
| 428 | + " Output:\n", |
| 429 | + " - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n", |
| 430 | + " \"\"\"\n", |
| 431 | + " def ari_score_with_penalty(estimator, embeddings):\n", |
| 432 | + " clustering_result = estimator.fit_predict(embeddings)\n", |
| 433 | + " \n", |
| 434 | + " if np.unique(clustering_result[clustering_result != -1]).size < 2:\n", |
| 435 | + " return -1 # Return worst score if only one cluster is found or all points are noise\n", |
| 436 | + " \n", |
| 437 | + " # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n", |
| 438 | + " noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n", |
| 439 | + "\n", |
| 440 | + " if noise_ratio > 0.50:\n", |
| 441 | + " return -1 # Return worst score if more than 50% percent of the points are unlabeled noise\n", |
| 442 | + "\n", |
| 443 | + " ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n", |
| 444 | + "\n", |
| 445 | + " # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n", |
| 446 | + " penalty = 1.0 - noise_ratio \n", |
| 447 | + " \n", |
| 448 | + " return ari * penalty\n", |
| 449 | + " return ari_score_with_penalty\n", |
| 450 | + "\n", |
| 451 | + "\n", |
| 452 | + " # Use custom CV that feeds all data to each fold (no slicing)\n", |
| 453 | + " all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n", |
| 454 | + "\n", |
| 455 | + " tuned_hdbscan = GridSearchCV(\n", |
| 456 | + " estimator=HDBSCAN(),\n", |
| 457 | + " refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n", |
| 458 | + " param_grid=hyper_parameter_distributions,\n", |
| 459 | + " n_jobs=4,\n", |
| 460 | + " scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n", |
| 461 | + " cv=all_data_without_slicing_cross_validator,\n", |
| 462 | + " verbose=1\n", |
| 463 | + " )\n", |
| 464 | + "\n", |
| 465 | + " tuned_hdbscan.fit(embeddings)\n", |
| 466 | + "\n", |
| 467 | + " #print(\"Best adjusted rand score with noise penalty:\", tuned_hdbscan.best_score_)\n", |
| 468 | + " print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n", |
| 469 | + "\n", |
| 470 | + " # Run the clustering again with the best parameters\n", |
| 471 | + " cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, allow_single_cluster=False)\n", |
| 472 | + " best_model = cluster_algorithm.fit(embeddings)\n", |
| 473 | + "\n", |
| 474 | + " results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n", |
| 475 | + " print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n", |
| 476 | + " return results" |
| 477 | + ] |
| 478 | + }, |
| 479 | + { |
| 480 | + "cell_type": "code", |
| 481 | + "execution_count": null, |
| 482 | + "id": "8e1f0227", |
| 483 | + "metadata": {}, |
| 484 | + "outputs": [], |
| 485 | + "source": [ |
| 486 | + "import numpy.typing as numpy_typing\n", |
| 487 | + "\n", |
| 488 | + "class CommunityComparingScores:\n", |
| 489 | + " def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n", |
| 490 | + " self.adjusted_rand_index = adjusted_rand_index\n", |
| 491 | + " self.normalized_mutual_information = normalized_mutual_information\n", |
| 492 | + " self.scores = {\n", |
| 493 | + " \"Adjusted Rand Index\": adjusted_rand_index,\n", |
| 494 | + " \"Normalized Mutual Information\": normalized_mutual_information\n", |
| 495 | + " }\n", |
| 496 | + " def __repr__(self):\n", |
| 497 | + " return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n", |
| 498 | + "\n", |
| 499 | + "def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n", |
| 500 | + " \"\"\"\n", |
| 501 | + " Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n", |
| 502 | + " The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n", |
| 503 | + " \"\"\"\n", |
| 504 | + " from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n", |
| 505 | + "\n", |
| 506 | + " # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n", |
| 507 | + " mask = cluster_labels != -1\n", |
| 508 | + " ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n", |
| 509 | + " nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n", |
| 510 | + "\n", |
| 511 | + " return CommunityComparingScores(ari, nmi)" |
| 512 | + ] |
| 513 | + }, |
| 514 | + { |
| 515 | + "cell_type": "code", |
| 516 | + "execution_count": null, |
| 517 | + "id": "3c4e8821", |
| 518 | + "metadata": {}, |
| 519 | + "outputs": [], |
| 520 | + "source": [ |
| 521 | + "def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n", |
| 522 | + " \"\"\"\n", |
| 523 | + " Adds the clustering results to the embeddings DataFrame.\n", |
| 524 | + " \"\"\"\n", |
| 525 | + " embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n", |
| 526 | + " embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n", |
| 527 | + " return embeddings\n", |
| 528 | + "\n", |
| 529 | + "def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n", |
| 530 | + " \"\"\"\n", |
| 531 | + " Returns the clustering results distribution for the given clustering name.\n", |
| 532 | + " \"\"\"\n", |
| 533 | + " return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n", |
| 534 | + " probability=('clustering' + clustering_name + 'Probability', 'mean'),\n", |
| 535 | + " count=('codeUnitName', 'count'),\n", |
| 536 | + " communityIds=('communityId', lambda x: list(set(x))),\n", |
| 537 | + " codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n", |
| 538 | + " ).reset_index().sort_values(by='count', ascending=False)" |
| 539 | + ] |
| 540 | + }, |
| 541 | + { |
| 542 | + "cell_type": "code", |
| 543 | + "execution_count": null, |
| 544 | + "id": "c27ec0ec", |
| 545 | + "metadata": {}, |
| 546 | + "outputs": [], |
| 547 | + "source": [ |
| 548 | + "def add_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame) -> pd.DataFrame:\n", |
| 549 | + " \"\"\"\n", |
| 550 | + " Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", |
| 551 | + " The parameters are tuned to get results similar to the ones of the community detection algorithm.\n", |
| 552 | + " The result is the input DataFrame with the clustering results added.\n", |
| 553 | + " \"\"\"\n", |
| 554 | + " # Apply the tuned HDBSCAN clustering algorithm\n", |
| 555 | + " embeddings_values = np.array(embeddings.embedding.tolist())\n", |
| 556 | + " community_reference_ids = np.array(embeddings.communityId.tolist())\n", |
| 557 | + " \n", |
| 558 | + " clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n", |
| 559 | + " print(clustering_result)\n", |
| 560 | + " \n", |
| 561 | + " community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n", |
| 562 | + " print(community_comparing_scores)\n", |
| 563 | + "\n", |
| 564 | + " # Add the clustering results to the embeddings DataFrame\n", |
| 565 | + " embeddings = add_clustering_results_to_embeddings(embeddings, clustering_result, \"TunedHDBSCAN\")\n", |
| 566 | + " \n", |
| 567 | + " # Get the clustering results distribution\n", |
| 568 | + " clustering_results_distribution = get_clustering_results_distribution(embeddings, \"TunedHDBSCAN\")\n", |
| 569 | + " \n", |
| 570 | + " # Display the clustering results distribution\n", |
| 571 | + " display(clustering_results_distribution)\n", |
| 572 | + " \n", |
| 573 | + " return embeddings" |
| 574 | + ] |
| 575 | + }, |
| 576 | + { |
| 577 | + "cell_type": "code", |
| 578 | + "execution_count": null, |
| 579 | + "id": "0b42ed2a", |
| 580 | + "metadata": {}, |
| 581 | + "outputs": [], |
| 582 | + "source": [ |
| 583 | + "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)" |
367 | 584 | ]
|
368 | 585 | },
|
369 | 586 | {
|
|
429 | 646 | " \"dependencies_projection_node\": \"Package\",\n",
|
430 | 647 | " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
|
431 | 648 | " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
|
432 | | - " \"dependencies_projection_embedding_dimension\":\"64\"\n", |
| 649 | + " \"dependencies_projection_embedding_dimension\":\"128\"\n", |
433 | 650 | "}\n",
|
434 | 651 | "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
|
| 652 | + "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n", |
| 653 | + "\n", |
435 | 654 | "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
|
436 | 655 | "plot_2d_node_embeddings(\n",
|
437 | 656 | " node_embeddings_for_visualization, \n",
|
|
462 | 681 | " \"dependencies_projection_embedding_dimension\":\"32\"\n",
|
463 | 682 | "}\n",
|
464 | 683 | "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
|
| 684 | + "embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n", |
465 | 685 | "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
|
466 | 686 | "plot_2d_node_embeddings(\n",
|
467 | 687 | " node_embeddings_for_visualization, \n",
|
|