CREATE TEMPLATE QUERY GDBMS_ALGO.community.louvain(
    SET<STRING> v_type_set,
    SET<STRING> e_type_set,
    STRING weight_attribute = "weight",
    UINT total_passes_count = 3,
    UINT maximum_iteration = 10,
    UINT total_batch_count = 12,
    STRING result_attribute = "",
    STRING file_path = ""
) SYNTAX V2 {
    /*
    First Author: xuanlei.lin@tigergraph.com
    First Commit Date: 2024-07-16

    Recent Author: xuanlei.lin@tigergraph.com
    Recent Commit Date: 2024-07-16

    Repository:
        https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community

    Description:
        Multi-pass Louvain community detection. Every pass runs a "local moving"
        phase (each node may join the neighbouring community with the best
        positive modularity gain) followed by a "coarsening" phase that records
        the resulting communities as virtual edges:
          - belongs_to(node -> community, layer): membership found in that pass;
          - links_to(community - community, layer -> weight): aggregated weights
            that form the graph processed by the next pass.
        After the last pass the memberships are propagated from the top layer
        back down to the original vertices.

    Parameters:
        v_type_set          Vertex types to include.
        e_type_set          Edge types to traverse.
        weight_attribute    Edge attribute read as a DOUBLE weight.
        total_passes_count  Number of passes (local moving + coarsening).
                            Values below 1 are treated as 1.
        maximum_iteration   Maximum number of local-moving iterations per pass.
        total_batch_count   Number of batches (by internal vertex id modulo this
                            value) used in the first pass from the second
                            iteration on. Values below 1 are treated as 1.
        result_attribute    If not empty, vertex attribute that receives the
                            internal id (getvid) of the final community.
        file_path           If not empty, file that receives one line per vertex.

    NOTE(review): this file was reconstructed from a patch whose template
    arguments (<...>) were lost in extraction. The element types below were
    restored from how each accumulator / attribute is used in this query;
    confirm them against the upstream source before installing.
    */

    TYPEDEF TUPLE <DOUBLE delta_Q_add, VERTEX community> MyTuple;
    SumAccum<DOUBLE> @@m;    // The sum of the weights of all the links in the network.
    MinAccum<VERTEX> @community_id;    // The community ID of the node.
    SumAccum<DOUBLE> @k;    // The sum of the weights of the links incident to the node.
    SumAccum<DOUBLE> @k_in;    // The sum of the weights of the links inside the previous community of the node.
    SumAccum<DOUBLE> @k_self_loop;    // The weight of the self-loop link.
    MapAccum<VERTEX, SumAccum<DOUBLE>> @community_k_in_map;    // Community of the neighbors of the node -> Sum of the weights of the links from the node into that community.
    MapAccum<VERTEX, SumAccum<DOUBLE>> @@community_sum_total_map;    // Community ID C -> Sum of the weights of the links incident to nodes in C.
    SumAccum<DOUBLE> @community_sum_total;    // Sum of the weights of the links incident to nodes in the community of the node.
    MapAccum<VERTEX, SumAccum<DOUBLE>> @@community_sum_in_map;    // Community ID -> Sum of the weights of the links inside the community.
    MapAccum<VERTEX, MapAccum<VERTEX, SumAccum<DOUBLE>>> @@source_target_k_in_map;    // Source community ID -> (Target community ID -> Sum of the weights of the links from the source community to the target community).
    SumAccum<DOUBLE> @delta_Q_remove;    // Delta Q to remove the node from the previous community.
    MaxAccum<MyTuple> @best_move;    // Best move of the node with the highest delta Q to move the isolated node into the new community.
    MaxAccum<DOUBLE> @@min_double;    // Never written; only read to reset @best_move.
    OrAccum @to_change_community;
    SumAccum<INT> @batch_id;
    SumAccum<INT> @vid;
    SumAccum<INT> @@sum;    // Dummy target for the ACCUM clause of the per-pass reset.
    FILE f(file_path);

    // Virtual edges
    CREATE DIRECTED VIRTUAL EDGE belongs_to (FROM *, TO *, layer_set SET<INT>);
    CREATE UNDIRECTED VIRTUAL EDGE links_to (FROM *, TO *, layer_weight_map MAP<INT, DOUBLE>);

    // Parameters are UINT: a value of 0 would cause a modulo by zero
    // (total_batch_count) or make "total_passes_count - 1" underflow / go
    // negative. Work on clamped local copies instead.
    INT batch_count = total_batch_count;
    IF batch_count < 1 THEN
        batch_count = 1;
    END;
    INT passes_count = total_passes_count;
    IF passes_count < 1 THEN
        passes_count = 1;
    END;

    // -------------------- 1. First pass --------------------
    // Initialization
    All_Nodes = {v_type_set};
    // Every vertex starts as its own community. Vertices without any incident
    // edge are never visited by the edge traversals below, so without this
    // seed they would reach the output with an unassigned @community_id.
    All_Nodes = SELECT s
        FROM All_Nodes:s
        POST-ACCUM
            s.@community_id = s;
    Pass_Nodes = SELECT s
        FROM All_Nodes:s -(e_type_set:e)- :t
        ACCUM @@m += e.getAttr(weight_attribute, "DOUBLE") / 2,
            s.@k += e.getAttr(weight_attribute, "DOUBLE"),
            IF s == t THEN    // Self-loop link
                s.@k_self_loop += e.getAttr(weight_attribute, "DOUBLE")
            END
        POST-ACCUM
            s.@community_id = s,
            s.@vid = getvid(s),
            s.@batch_id = s.@vid % batch_count;
    // @@m is only accumulated above and never changes afterwards, so this is
    // the single place where a (near-)zero total weight has to be detected.
    IF @@m < 0.00000000001 THEN
        PRINT "Warning: the sum of the weights in the edges should be greater than zero!";
        RETURN;
    END;

    // Local moving
    INT hop = 0;
    Candidates (ANY) = Pass_Nodes;
    WHILE Candidates.size() > 0 AND hop < maximum_iteration DO
        hop = hop + 1;
        IF hop == 1 THEN    // First iteration
            // All communities are singletons here, so the gain of joining the
            // neighbour t is proportional to w(s,t) - k_s * k_t / (2m). The
            // edge weight must be used (not a constant 1), otherwise weighted
            // graphs are scored incorrectly.
            ChangedNodes = SELECT s
                FROM Candidates:s -(e_type_set:e)- :t
                WHERE s.@community_id != t.@community_id
                ACCUM s.@best_move += MyTuple(e.getAttr(weight_attribute, "DOUBLE") - s.@k * t.@k / (2 * @@m), t.@community_id)
                POST-ACCUM
                    IF s.@best_move.delta_Q_add > 0 THEN    // The gain (delta Q) is positive
                        s.@to_change_community = TRUE
                    END
                HAVING s.@to_change_community == TRUE;
        ELSE    // Remaining iterations
            // Calculate sum_total
            Tmp = SELECT s
                FROM Pass_Nodes:s
                POST-ACCUM
                    @@community_sum_total_map += (s.@community_id -> s.@k);
            Tmp = SELECT s
                FROM Pass_Nodes:s
                POST-ACCUM
                    s.@community_sum_total = @@community_sum_total_map.get(s.@community_id);
            @@community_sum_total_map.clear();
            // Find the best move, one batch of candidates at a time
            ChangedNodes = {};
            FOREACH batch_id IN RANGE[0, batch_count - 1] DO
                // Calculate the delta Q to remove the node from the previous community
                Nodes = SELECT s
                    FROM Candidates:s -(e_type_set:e)- :t
                    WHERE s.@batch_id == batch_id
                    ACCUM IF s.@community_id == t.@community_id THEN
                            s.@k_in += e.getAttr(weight_attribute, "DOUBLE")
                        ELSE
                            s.@community_k_in_map += (t.@community_id -> e.getAttr(weight_attribute, "DOUBLE"))
                        END
                    POST-ACCUM
                        s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m,
                        s.@k_in = 0,
                        s.@best_move = MyTuple(@@min_double, s);    // Reset the delta_Q_add
                // Find the best move
                Nodes = SELECT s
                    FROM Nodes:s -(e_type_set:e)- :t
                    WHERE s.@community_id != t.@community_id
                    ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m,
                        s.@best_move += MyTuple(delta_Q_add, t.@community_id)
                    POST-ACCUM
                        IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN    // The gain (delta Q) is positive
                            s.@to_change_community = TRUE
                        END,
                        s.@community_k_in_map.clear()
                    HAVING s.@to_change_community == TRUE;
                ChangedNodes = ChangedNodes UNION Nodes;
            END;
        END;
        // If two nodes swap, only change the community of one of them.
        // SwapNodes collects the node of each swapping pair that must stay:
        // the one with the clearly smaller gain, or - when the gains are equal
        // within the tolerance - the one with the larger @vid. The strict branch
        // is offset by the tolerance so that exactly one node of a pair is kept.
        SwapNodes = SELECT s
            FROM ChangedNodes:s -(e_type_set:e)- :t
            WHERE s.@best_move.community == t.@community_id
                AND t.@to_change_community == TRUE
                AND t.@best_move.community == s.@community_id
                AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - 0.00000000001
                    OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001
                        AND s.@vid > t.@vid))
            POST-ACCUM
                s.@to_change_community = FALSE;
        ChangedNodes = ChangedNodes MINUS SwapNodes;
        // Place each node of ChangedNodes in the community in which the gain is maximum
        ChangedNodes = SELECT s
            FROM ChangedNodes:s
            POST-ACCUM
                s.@community_id = s.@best_move.community,
                s.@to_change_community = FALSE;
        // Get all neighbours of the changed node that do not belong to the node's new community
        Candidates = SELECT t
            FROM ChangedNodes:s -(e_type_set:e)- :t
            WHERE t.@community_id != s.@community_id;
    END;

    // Coarsening: record memberships and aggregated weights as layer 0
    UINT new_layer = 0;
    @@community_sum_total_map.clear();
    Tmp = SELECT s
        FROM Pass_Nodes:s -(e_type_set:e)- :t
        ACCUM IF s.@community_id == t.@community_id THEN
                @@community_sum_in_map += (s.@community_id -> e.getAttr(weight_attribute, "DOUBLE"))
            END
        POST-ACCUM
            VERTEX cid = s.@community_id,
            INSERT INTO belongs_to VALUES(s, cid, new_layer),
            // s is used as a community ID here: emit the community self-loop
            IF @@community_sum_in_map.containsKey(s) THEN
                INSERT INTO links_to VALUES(s, s, (new_layer -> @@community_sum_in_map.get(s)))
            END;
    @@community_sum_in_map.clear();
    Tmp = SELECT s
        FROM Pass_Nodes:s -(e_type_set:e)- :t
        ACCUM IF s.@community_id != t.@community_id THEN
                @@source_target_k_in_map += (s.@community_id -> (t.@community_id -> e.getAttr(weight_attribute, "DOUBLE")))
            END
        POST-ACCUM
            IF @@source_target_k_in_map.containsKey(s) THEN
                FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO
                    INSERT INTO links_to VALUES(s, target_community, (new_layer -> k_in))
                END
            END;
    @@source_target_k_in_map.clear();

    // -------------------- 2. Remaining passes --------------------
    INT layer = 0;
    WHILE layer < passes_count - 1 DO
        // Reset the per-node state left over from the previous pass.
        // @delta_Q_remove is reset too: it is read by the swap resolution of
        // the first iteration, where every community is a singleton.
        Tmp = SELECT s
            FROM Pass_Nodes:s -(links_to:e)- :t
            ACCUM @@sum += 1
            POST-ACCUM
                s.@k = 0,
                s.@k_in = 0,
                s.@k_self_loop = 0,
                s.@delta_Q_remove = 0,
                s.@best_move = MyTuple(@@min_double, s);
        // Initialization: the nodes of this pass are the communities of the previous layer
        Pass_Nodes = SELECT s
            FROM Pass_Nodes:s -(links_to:e)- :t
            WHERE e.layer_weight_map.containsKey(layer)
            ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
                s.@k += weight,
                IF s == t THEN    // Self-loop link
                    s.@k_self_loop += weight
                END
            POST-ACCUM
                s.@community_id = s;

        // Local moving
        hop = 0;
        Candidates = Pass_Nodes;
        WHILE Candidates.size() > 0 AND hop < maximum_iteration DO
            hop = hop + 1;
            IF hop == 1 THEN    // First iteration
                // Same singleton gain as in the first pass, using the
                // aggregated weight stored for this layer.
                ChangedNodes = SELECT s
                    FROM Candidates:s -(links_to:e)- :t
                    WHERE e.layer_weight_map.containsKey(layer)
                        AND s.@community_id != t.@community_id
                    ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
                        s.@best_move += MyTuple(weight - s.@k * t.@k / (2 * @@m), t.@community_id)
                    POST-ACCUM
                        IF s.@best_move.delta_Q_add > 0 THEN    // The gain (delta Q) is positive
                            s.@to_change_community = TRUE
                        END
                    HAVING s.@to_change_community == TRUE;
            ELSE    // Remaining iterations
                // Calculate sum_total
                Tmp = SELECT s
                    FROM Pass_Nodes:s
                    POST-ACCUM
                        @@community_sum_total_map += (s.@community_id -> s.@k);
                Tmp = SELECT s
                    FROM Pass_Nodes:s
                    POST-ACCUM
                        s.@community_sum_total = @@community_sum_total_map.get(s.@community_id);
                @@community_sum_total_map.clear();
                // Find the best move (no batching in the coarsened passes)
                ChangedNodes = {};
                // Calculate the delta Q to remove the node from the previous community
                Nodes = SELECT s
                    FROM Candidates:s -(links_to:e)- :t
                    WHERE e.layer_weight_map.containsKey(layer)
                    ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
                        IF s.@community_id == t.@community_id THEN
                            s.@k_in += weight
                        ELSE
                            s.@community_k_in_map += (t.@community_id -> weight)
                        END
                    POST-ACCUM
                        s.@delta_Q_remove = 2 * s.@k_self_loop - 2 * s.@k_in + s.@k * (s.@community_sum_total - s.@k) / @@m,
                        s.@k_in = 0,
                        s.@best_move = MyTuple(@@min_double, s);    // Reset the delta_Q_add
                // Find the best move
                Nodes = SELECT s
                    FROM Nodes:s -(links_to:e)- :t
                    WHERE e.layer_weight_map.containsKey(layer)
                        AND s.@community_id != t.@community_id
                    ACCUM DOUBLE delta_Q_add = 2 * s.@community_k_in_map.get(t.@community_id) - s.@k * t.@community_sum_total / @@m,
                        s.@best_move += MyTuple(delta_Q_add, t.@community_id)
                    POST-ACCUM
                        IF s.@delta_Q_remove + s.@best_move.delta_Q_add > 0 THEN    // The gain (delta Q) is positive
                            s.@to_change_community = TRUE
                        END,
                        s.@community_k_in_map.clear()
                    HAVING s.@to_change_community == TRUE;
                ChangedNodes = ChangedNodes UNION Nodes;
            END;
            // If two nodes swap, only change the community of one of them
            // (same rule as in the first pass).
            SwapNodes = SELECT s
                FROM ChangedNodes:s -(links_to:e)- :t
                WHERE e.layer_weight_map.containsKey(layer)
                    AND s.@best_move.community == t.@community_id
                    AND t.@to_change_community == TRUE
                    AND t.@best_move.community == s.@community_id
                    AND (s.@delta_Q_remove + s.@best_move.delta_Q_add < t.@delta_Q_remove + t.@best_move.delta_Q_add - 0.00000000001
                        OR (abs((s.@delta_Q_remove + s.@best_move.delta_Q_add) - (t.@delta_Q_remove + t.@best_move.delta_Q_add)) < 0.00000000001
                            AND s.@vid > t.@vid))
                POST-ACCUM
                    s.@to_change_community = FALSE;
            ChangedNodes = ChangedNodes MINUS SwapNodes;
            // Place each node of ChangedNodes in the community in which the gain is maximum
            ChangedNodes = SELECT s
                FROM ChangedNodes:s
                POST-ACCUM
                    s.@community_id = s.@best_move.community,
                    s.@to_change_community = FALSE;
            // Get all neighbours of the changed node that do not belong to the node's new community
            Candidates = SELECT t
                FROM ChangedNodes:s -(links_to:e)- :t
                WHERE e.layer_weight_map.containsKey(layer)
                    AND t.@community_id != s.@community_id;
        END;

        // Coarsening: record memberships and aggregated weights as layer + 1
        new_layer = layer + 1;
        @@community_sum_total_map.clear();
        Tmp = SELECT s
            FROM Pass_Nodes:s -(links_to:e)- :t
            WHERE e.layer_weight_map.containsKey(layer)
            ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
                IF s.@community_id == t.@community_id THEN
                    @@community_sum_in_map += (s.@community_id -> weight)
                END
            POST-ACCUM
                VERTEX cid = s.@community_id,
                INSERT INTO belongs_to VALUES(s, cid, new_layer),
                IF @@community_sum_in_map.containsKey(s) THEN
                    INSERT INTO links_to VALUES(s, s, (new_layer -> @@community_sum_in_map.get(s)))
                END;
        @@community_sum_in_map.clear();
        Tmp = SELECT s
            FROM Pass_Nodes:s -(links_to:e)- :t
            WHERE e.layer_weight_map.containsKey(layer)
            ACCUM DOUBLE weight = e.layer_weight_map.get(layer),
                IF s.@community_id != t.@community_id THEN
                    @@source_target_k_in_map += (s.@community_id -> (t.@community_id -> weight))
                END
            POST-ACCUM
                IF @@source_target_k_in_map.containsKey(s) THEN
                    FOREACH (target_community, k_in) IN @@source_target_k_in_map.get(s) DO
                        INSERT INTO links_to VALUES(s, target_community, (new_layer -> k_in))
                    END
                END;
        @@source_target_k_in_map.clear();
        layer = layer + 1;
    END;

    // -------------------- 3. Final community and output --------------------
    // Top layer: members of the last pass point directly at their community
    layer = passes_count - 1;
    Nodes = SELECT s
        FROM All_Nodes:s -(belongs_to>:e)- :t
        WHERE layer IN e.layer_set
        ACCUM s.@community_id = t;

    // Other layers: inherit the final community of the parent, top-down
    WHILE Nodes.size() > 0 AND layer > 0 DO
        layer = layer - 1;
        Nodes = SELECT s
            FROM All_Nodes:s -(belongs_to>:e)- :t
            WHERE layer IN e.layer_set
            ACCUM s.@community_id = t.@community_id;
    END;

    // Output results
    Nodes = SELECT s
        FROM All_Nodes:s
        POST-ACCUM
            IF result_attribute != "" THEN
                s.setAttr(result_attribute, getvid(s.@community_id))
            END,
            IF file_path != "" THEN
                // With a single vertex type the type columns are omitted
                IF v_type_set.size() == 1 THEN
                    f.println(s.id, s.@community_id)
                ELSE
                    VERTEX node = s.@community_id,
                    f.println(s.type, s, node.type, node)
                END
            END;
}