diff --git a/GDBMS_ALGO/community/lcc.gsql b/GDBMS_ALGO/community/lcc.gsql index 2354c4ac..9c763cda 100644 --- a/GDBMS_ALGO/community/lcc.gsql +++ b/GDBMS_ALGO/community/lcc.gsql @@ -1,13 +1,19 @@ -CREATE TEMPLATE QUERY GDBMS_ALGO.community.lcc (STRING v_type, STRING e_type,INT top_k=100,BOOL print_results = True, STRING result_attribute = "", - STRING file_path = "", BOOL display_edges = FALSE) SYNTAX V1 { +CREATE TEMPLATE QUERY GDBMS_ALGO.community.lcc ( + SET v_type_set, + SET e_type_set, + UINT top_k = 100, + BOOL print_results = FALSE, + STRING result_attribute = "", + STRING file_path = "", + BOOL display_edges = FALSE +) SYNTAX V1 { /* - First Author: - First Commit Date: - - Recent Author: - Recent Commit Date: + First Author: xuanlei.lin@tigergraph.com + First Commit Date: 2024-07-15 + Recent Author: xuanlei.lin@tigergraph.com + Recent Commit Date: 2024-07-15 Repository: https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community @@ -16,10 +22,9 @@ CREATE TEMPLATE QUERY GDBMS_ALGO.community.lcc (STRING v_type, STRING e_type,INT Production Description: - The Local Clustering Coefficient algorithm computes the local clustering coefficient - for each node in the graph. - lcc = Number_trangles/((n-1)n/2) - Here n is the outdegreeof vertex. + This query computes the Local Clustering Coefficient (LCC) for each node in the graph. + LCC = Number_of_triangles / ((n-1) * n / 2) + where n is the outdegree of the vertex. Publications: NA @@ -28,83 +33,100 @@ CREATE TEMPLATE QUERY GDBMS_ALGO.community.lcc (STRING v_type, STRING e_type,INT https://docs.tigergraph.com/graph-ml/current/community-algorithms/local-clustering-coefficient Parameters: - v_type: - vertex types to traverse + v_type_set: + The set of vertex types to traverse. + e_type_set: + The set of edge types to traverse. + top_k: + Number of top scores to report. print_results: - If True, print JSON output - e_type: - edge types to traverse + If True, print JSON output. 
result_attribute: - INT attribute to store results to - top_k: - report only this many top scores + Attribute to store the results. file_path: - file to write CSV output to + File to write CSV output to. display_edges: - If True, output edges for visualization + If True, output edges for visualization. WARNING: Avoid displaying edges for large datasets. */ - - TYPEDEF TUPLE Vertex_Score; - HeapAccum(top_k, score DESC) @@top_scores_heap; - SumAccum @sum_tri; #number of trangles - SumAccum @sum_lcc; #lcc value - SetAccum @neighbors_set; #neighbors set - OrAccum @or_self_con; #check if the vertex is self-connect - SetAccum @@edge_set; - FILE f (file_path); - # Here we compute the intersection for 2 points on the triangle. - - Start = {v_type}; - Start = SELECT s - FROM Start:s-(e_type)-v_type:t - ACCUM - IF getvid(s) != getvid(t) THEN - t.@neighbors_set += getvid(s) - ELSE - t.@or_self_con+=TRUE - END;# check id the vertex is self-connect - - Start = SELECT s - FROM Start:s-(e_type)-v_type:t - WHERE s.outdegree(e_type)>1 - ACCUM - s.@sum_tri+=COUNT((s.@neighbors_set INTERSECT t.@neighbors_set)) - POST-ACCUM - IF s.@or_self_con AND s.outdegree(e_type)<3 THEN - s.@sum_lcc+=0 - ELSE IF s.@or_self_con AND s.outdegree(e_type)>2 THEN - s.@sum_lcc+= (((s.@sum_tri+1-s.outdegree(e_type)))/((s.outdegree(e_type)-2)*(s.outdegree(e_type)-1))) - ELSE - s.@sum_lcc+= ((s.@sum_tri)/((s.outdegree(e_type)-0)*(s.outdegree(e_type)-1))) + + TYPEDEF TUPLE Vertex_Score; // Define a tuple for storing vertex scores + HeapAccum(top_k, score DESC) @@top_scores_heap; // Heap to store top-k scores + SumAccum @sum_outdegree; // Accumulator for the outdegree of vertices + SetAccum @set_nodes_in_frontier; // Set to store nodes in the frontier + MapAccum @map_node_tri_count; // Map: node in the frontier -> triangle count + SumAccum @sum_tri_count; // Accumulator for the count of triangles + SumAccum @sum_lcc; // Accumulator for the LCC value + SetAccum @@edge_set; // Set of edges for visualization + 
FILE f (file_path); // File to write results to + + // Calculate the outdegree for each vertex + AllNodes = {v_type_set}; + Nodes = SELECT s + FROM AllNodes:s-(e_type_set)-v_type_set:t + WHERE s != t + ACCUM s.@sum_outdegree += 1; + + // Find neighbors and prepare for triangle counting + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) + ACCUM t.@set_nodes_in_frontier += s; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) + ACCUM FOREACH node_in_frontier IN s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier DO + s.@map_node_tri_count += (node_in_frontier -> 1), + // Increment triangle count for s and t + s.@sum_tri_count += 1, + t.@sum_tri_count += 1 END; - - #output - Start = SELECT s - FROM Start:s - # Calculate Closeness Centrality for each vertex - POST-ACCUM - IF result_attribute != "" THEN - s.setAttr(result_attribute, s.@sum_lcc) - END, - IF print_results THEN - @@top_scores_heap += Vertex_Score(s, s.@sum_lcc) - END, - IF file_path != "" THEN - f.println(s, s.@sum_lcc) - END; - - IF file_path != "" THEN - f.println("Vertex_ID", "lcc"); - END; + // Sum up the triangle counts for nodes in the frontier + Tmp = SELECT s + FROM Tmp:s-(e_type_set)-v_type_set:t + WHERE s.@map_node_tri_count.containsKey(t) + ACCUM t.@sum_tri_count += s.@map_node_tri_count.get(t); + + // Calculate the LCC for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + IF s.@sum_outdegree > 1 THEN + s.@sum_lcc = s.@sum_tri_count * 2.0 / (s.@sum_outdegree * (s.@sum_outdegree - 1)) + END; + + // Reset variables + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@map_node_tri_count.clear(), + s.@set_nodes_in_frontier.clear(); + + // Output results + AllNodes = SELECT s + FROM AllNodes:s + POST-ACCUM + IF result_attribute != "" THEN + s.setAttr(result_attribute, s.@sum_lcc) + END, + IF 
print_results THEN + @@top_scores_heap += Vertex_Score(s, s.@sum_lcc) + END, + IF file_path != "" THEN + f.println(s, s.@sum_lcc) + END; + + // Print results if print_results is True IF print_results THEN - PRINT @@top_scores_heap AS top_scores; - IF display_edges THEN - PRINT Start[Start.@sum_lcc]; - Start = SELECT s - FROM Start:s -(e_type:e)-:t - ACCUM @@edge_set += e; - PRINT @@edge_set; - END; + PRINT @@top_scores_heap AS top_scores; + IF display_edges THEN + PRINT AllNodes[AllNodes.@sum_lcc]; + AllNodes = SELECT s + FROM AllNodes:s -(e_type_set:e)-:t + ACCUM @@edge_set += e; + PRINT @@edge_set; + END; END; } diff --git a/GDBMS_ALGO/community/lcc_small_world.gsql b/GDBMS_ALGO/community/lcc_small_world.gsql new file mode 100644 index 00000000..4cda683c --- /dev/null +++ b/GDBMS_ALGO/community/lcc_small_world.gsql @@ -0,0 +1,310 @@ +CREATE TEMPLATE QUERY GDBMS_ALGO.community.lcc_small_world ( + SET v_type_set, + SET e_type_set, + UINT supernode_min_degree = 100000, + UINT threshold = 100000, + UINT top_k = 100, + BOOL print_results = FALSE, + STRING result_attribute = "", + STRING file_path = "", + BOOL display_edges = FALSE +) SYNTAX V1 { + + /* + First Author: xuanlei.lin@tigergraph.com + First Commit Date: 2024-07-15 + + Recent Author: xuanlei.lin@tigergraph.com + Recent Commit Date: 2024-07-18 + + Repository: + https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community + + Maturity: + Production + + Description: + This query computes the Local Clustering Coefficient (LCC) for each node in the graph. + LCC = Number_of_triangles / ((n-1) * n / 2) + where n is the outdegree of the vertex. + This query offers better memory efficiency on small-world graphs compared to the standard LCC. + However, it may take longer to execute. If memory usage is a concern, please use this version. + Otherwise, opt for the standard version. 
+ + Publications: + NA + + TigerGraph Documentation: + https://docs.tigergraph.com/graph-ml/current/community-algorithms/local-clustering-coefficient + + Parameters: + v_type_set: + The set of vertex types to traverse. + e_type_set: + The set of edge types to traverse. + supernode_min_degree: + The minimum degree for a vertex to be considered a supernode. + The default value is 100000. + threshold: + Threshold for choosing initial pivot vertices. Only vertices whose product of indegree + and outdegree is at least this threshold will be considered candidates for the pivot vertex. + The default value is 100000. + top_k: + Number of top scores to report. + print_results: + If True, print JSON output. + result_attribute: + Attribute to store the results. + file_path: + File to write CSV output to. + display_edges: + If True, output edges for visualization. WARNING: Avoid displaying edges for large datasets. + */ + + TYPEDEF TUPLE Vertex_Score; + HeapAccum(top_k, score DESC) @@top_scores_heap; // Heap to store top-k scores + SumAccum @sum_outdegree; // Accumulator for the outdegree + SumAccum @sum_indegree; // Accumulator for the indegree + SumAccum @sum_degree_product; // Accumulator for the product of outdegree and indegree + OrAccum @or_in_frontier; // Flag to check if the vertex is in the frontier + OrAccum @or_lcc_calculated; // Flag to check if the LCC of the vertex has been calculated + OrAccum @or_is_neighbor; // Flag to check if the vertex is a neighbor of nodes in the frontier + SetAccum @set_nodes_in_frontier; // Set of nodes in the frontier + MapAccum @map_node_tri_count; // Map: node in the frontier -> triangle count + SumAccum @sum_tri_count; // Accumulator for the count of triangles + SumAccum @sum_lcc; // Accumulator for LCC value + SetAccum @@edge_set; // Set of edges for visualization + FILE f (file_path); // File to write results to + + // -------------------- 1. 
Initialization -------------------- + // Calculate the product of indegree and outdegree, + // and filter vertices with product no less than the threshold + AllNodes = {v_type_set}; + PivotCandidates = SELECT s + FROM AllNodes:s-(e_type_set)-v_type_set:t + WHERE s != t + ACCUM s.@sum_outdegree += 1, + s.@sum_indegree += 1 + POST-ACCUM (s) + s.@sum_degree_product = s.@sum_indegree * s.@sum_outdegree + HAVING s.@sum_degree_product >= threshold; + + // -------------------- 2. Handle the supernodes -------------------- + // Calculate LCC for supernodes + SuperNodes = SELECT s + FROM PivotCandidates:s + WHERE s.@sum_outdegree >= supernode_min_degree; + WHILE SuperNodes.size() > 0 DO + // Select some supernodes and set them as frontier + Nodes = SELECT s + FROM SuperNodes:s + LIMIT 10; + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier += TRUE; + + // Find neighbors of nodes in the frontier + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE s != t + AND t.@or_lcc_calculated == FALSE // Don't visit nodes whose all triangles have been counted + AND (t.@or_in_frontier == FALSE // Neighbor not in the frontier + OR getvid(s) > getvid(t)) // If neighbor is in the frontier, only consider one direction + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM (t) + t.@or_is_neighbor += TRUE; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) // Traverse only one direction of the undirected edge + AND t.@or_is_neighbor == TRUE + ACCUM FOREACH node_in_frontier IN s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier DO + s.@map_node_tri_count += (node_in_frontier -> 1), + // s and t also increase the triangle count by 1 + s.@sum_tri_count += 1, + t.@sum_tri_count += 1 + END; + + // Calculate the sum of triangle counts for nodes in the frontier + Tmp = SELECT s + FROM Tmp:s-(e_type_set)-v_type_set:t + WHERE t.@or_in_frontier == 
TRUE + AND s.@map_node_tri_count.containsKey(t) + ACCUM t.@sum_tri_count += s.@map_node_tri_count.get(t); + + // Calculate LCC for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier = FALSE, + s.@or_lcc_calculated = TRUE, + IF s.@sum_outdegree > 1 THEN + s.@sum_lcc = s.@sum_tri_count * 2.0 / (s.@sum_outdegree * (s.@sum_outdegree - 1)) + END; + + // Reset variables for the next iteration + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@or_is_neighbor = FALSE, + s.@map_node_tri_count.clear(), + s.@set_nodes_in_frontier.clear(); + + // Remove visited vertices from the SuperNodes set + SuperNodes = SuperNodes MINUS Nodes; + PivotCandidates = PivotCandidates MINUS Nodes; + END; + + // -------------------- 3. Handle nodes in large WCCs -------------------- + // Calculate LCC for nodes in large WCCs + WHILE PivotCandidates.size() > 0 DO + // Select the initial pivot vertex with the largest product of indegree and outdegree + Nodes = SELECT s + FROM PivotCandidates:s + ORDER BY s.@sum_degree_product DESC + LIMIT 1; + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier += TRUE; + + // Use BFS to find all elements in its connected component + WHILE Nodes.size() > 0 DO + // Find neighbors of nodes in the frontier + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE s != t + AND t.@or_lcc_calculated == FALSE // Don't visit nodes whose LCCs have been calculated + AND (t.@or_in_frontier == FALSE // Neighbor not in the frontier + OR getvid(s) > getvid(t)) // If neighbor is in the frontier, only consider one direction + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM (t) + t.@or_is_neighbor += TRUE; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) // Traverse only one direction of the undirected edge + AND t.@or_is_neighbor == TRUE + ACCUM FOREACH node_in_frontier IN 
s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier DO + s.@map_node_tri_count += (node_in_frontier -> 1), + // s and t also increase the triangle count by 1 + s.@sum_tri_count += 1, + t.@sum_tri_count += 1 + END; + + // Calculate the sum of triangle counts for nodes in the frontier + Tmp = SELECT s + FROM Tmp:s-(e_type_set)-v_type_set:t + WHERE t.@or_in_frontier == TRUE + AND s.@map_node_tri_count.containsKey(t) + ACCUM t.@sum_tri_count += s.@map_node_tri_count.get(t); + + // Calculate LCC for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier = FALSE, + s.@or_lcc_calculated = TRUE, + IF s.@sum_outdegree > 1 THEN + s.@sum_lcc = s.@sum_tri_count * 2.0 / (s.@sum_outdegree * (s.@sum_outdegree - 1)) + END; + + // Reset variables for the next iteration + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@or_is_neighbor = FALSE, + s.@map_node_tri_count.clear(), + s.@set_nodes_in_frontier.clear(); + + // Use BFS to visit the next frontier + Nodes = SELECT t + FROM Nodes:s-(e_type_set:e)-v_type_set:t + WHERE t.@or_lcc_calculated == FALSE + POST-ACCUM t.@or_in_frontier += TRUE; + END; + + // Remove visited vertices from the PivotCandidates set + PivotCandidates = SELECT s + FROM PivotCandidates:s + WHERE s.@or_lcc_calculated == FALSE; + END; + + // -------------------- 4. 
Handle nodes in small WCCs -------------------- + // Calculate LCC for remaining vertices in small WCCs + Nodes = SELECT s + FROM AllNodes:s + WHERE s.@or_lcc_calculated == FALSE; + + // Find neighbors of the nodes + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE t.@or_lcc_calculated == FALSE + AND getvid(s) > getvid(t) + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM + t.@or_is_neighbor = TRUE; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) + AND t.@or_is_neighbor == TRUE + ACCUM FOREACH node_in_frontier IN s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier DO + s.@map_node_tri_count += (node_in_frontier -> 1), + // s and t also increase the triangle count by 1 + s.@sum_tri_count += 1, + t.@sum_tri_count += 1 + END; + + // Calculate the sum of triangle counts for nodes in the frontier + Tmp = SELECT s + FROM Tmp:s-(e_type_set)-v_type_set:t + WHERE s.@map_node_tri_count.containsKey(t) + ACCUM t.@sum_tri_count += s.@map_node_tri_count.get(t); + + // Calculate LCC for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + IF s.@sum_outdegree > 1 THEN + s.@sum_lcc = s.@sum_tri_count * 2.0 / (s.@sum_outdegree * (s.@sum_outdegree - 1)) + END; + + // Reset variables for the next iteration + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@map_node_tri_count.clear(), + s.@set_nodes_in_frontier.clear(); + + // -------------------- 5. 
Output -------------------- + // Output the results + AllNodes = SELECT s + FROM AllNodes:s + POST-ACCUM + IF result_attribute != "" THEN + s.setAttr(result_attribute, s.@sum_lcc) + END, + IF print_results THEN + @@top_scores_heap += Vertex_Score(s, s.@sum_lcc) + END, + IF file_path != "" THEN + f.println(s, s.@sum_lcc) + END; + + // Print results if required + IF print_results THEN + PRINT @@top_scores_heap AS top_scores; + IF display_edges THEN + PRINT AllNodes[AllNodes.@sum_lcc]; + AllNodes = SELECT s + FROM AllNodes:s -(e_type_set:e)-:t + ACCUM @@edge_set += e; + PRINT @@edge_set; + END; + END; +} diff --git a/GDBMS_ALGO/community/tri_count.gsql b/GDBMS_ALGO/community/tri_count.gsql index a5e704d9..27f92d2e 100644 --- a/GDBMS_ALGO/community/tri_count.gsql +++ b/GDBMS_ALGO/community/tri_count.gsql @@ -1,12 +1,14 @@ -CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count(STRING v_type, STRING e_type) SYNTAX V1 { +CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count( + SET v_type_set, + SET e_type_set +) SYNTAX V1 { -/* - First Author: - First Commit Date: - - Recent Author: - Recent Commit Date: + /* + First Author: xuanlei.lin@tigergraph.com + First Commit Date: 2024-07-17 + Recent Author: xuanlei.lin@tigergraph.com + Recent Commit Date: 2024-07-17 Repository: https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community @@ -15,7 +17,7 @@ CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count(STRING v_type, STRING e_typ Production Description: - This algorithm uses the classic edge-iterator method to count triangles. It is slower than the fast version, but uses less memory. + This query computes the total number of triangles in the graph. 
Publications: NA @@ -24,31 +26,30 @@ CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count(STRING v_type, STRING e_typ https://docs.tigergraph.com/graph-ml/current/community-algorithms/triangle-counting Parameters: - v_type: - Vertex type to count - e_type: - Edge type to traverse + v_type_set: + The set of vertex types to traverse. + e_type_set: + The set of edge types to traverse. */ -# Compute the total number of triangles in the GRAPH. No input parameters are needed. -SumAccum @@sum_cnt; -SetAccum @self_set; - -all = {v_type}; -all = SELECT s - FROM all:s - ACCUM s.@self_set += s; - -# For each edge e, the number of triangles that contain e is equivalent -# to the number of common neighbors between vertices s and t - -tmp = SELECT t - FROM all:s -(e_type:e) -:t - WHERE getvid(s) > getvid(t) - ACCUM INT c1 = COUNT(s.neighbors(e_type) MINUS s.@self_set), - INT c2 = COUNT((s.neighbors(e_type) MINUS s.@self_set) MINUS (t.neighbors(e_type) MINUS t.@self_set)), - @@sum_cnt += c1-c2; - -# Each triangle is counted 3 times for each edge, so final result is divided by 3 -PRINT @@sum_cnt/3 AS num_triangles; + SumAccum @@sum_tri_count; + SetAccum @set_neighbors; + Nodes = {v_type_set}; + + // Build neighbor sets manually, only for vertices with smaller IDs in the triangle. + // This ensures that only two of the three vertices in a triangle will build neighbor sets. + Tmp = SELECT t + FROM Nodes:s-(e_type_set)- v_type_set:t + WHERE getvid(s) > getvid(t) + ACCUM t.@set_neighbors += s; + + // Compute the intersection of neighbor sets to count triangles. + // This step ensures that each triangle is counted only once. 
+ Tmp = SELECT t + FROM Nodes:s-(e_type_set)- :t + WHERE getvid(s) > getvid(t) + ACCUM @@sum_tri_count += COUNT(s.@set_neighbors INTERSECT t.@set_neighbors); + + // Output the results + PRINT @@sum_tri_count AS num_triangles; } diff --git a/GDBMS_ALGO/community/tri_count_fast.gsql b/GDBMS_ALGO/community/tri_count_fast.gsql deleted file mode 100644 index 5cae093d..00000000 --- a/GDBMS_ALGO/community/tri_count_fast.gsql +++ /dev/null @@ -1,56 +0,0 @@ -CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count_fast(STRING v_type, STRING e_type) SYNTAX V1 { - - /* - First Author: - First Commit Date: - - Recent Author: - Recent Commit Date: - - - Repository: - https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community - - Maturity: - Production - - Description: - The fast version of the Triangle Counting algorithm is faster than the standard version, but uses some additional memory. - - Publications: - NA - - TigerGraph Documentation: - https://docs.tigergraph.com/graph-ml/current/community-algorithms/triangle-counting - - Parameters: - v_type: - Vertex type to count - e_type: - Edge type to traverse - */ - -# Compute the total number of triangles in the graph -# This algorithm is faster than tri_count but uses additional memory for temporary storage -SumAccum @@sum_cnt; -SetAccum @neighbors_set; -//SumAccum @sum_outdegree; -all = {v_type}; - -# We build up our neighbor lists manually because we'll only build them up on the 2 smaller vertices on a triangle. - -tmp = SELECT t - FROM all:s-(e_type)- v_type:t - WHERE getvid(s) > getvid(t) - ACCUM t.@neighbors_set += s; - -# Here we compute the intersection for 2 points on the triangle. 
-tmp = SELECT t - FROM all:s-(e_type)- :t - WHERE getvid(s) > getvid(t) - ACCUM @@sum_cnt += COUNT(s.@neighbors_set INTERSECT t.@neighbors_set); - -# print result -PRINT @@sum_cnt AS num_triangles; - -} diff --git a/GDBMS_ALGO/community/tri_count_small_world.gsql b/GDBMS_ALGO/community/tri_count_small_world.gsql new file mode 100644 index 00000000..6b8611eb --- /dev/null +++ b/GDBMS_ALGO/community/tri_count_small_world.gsql @@ -0,0 +1,206 @@ +CREATE TEMPLATE QUERY GDBMS_ALGO.community.tri_count_small_world( + SET v_type_set, + SET e_type_set, + UINT supernode_min_degree = 100000, + UINT threshold = 100000 +) SYNTAX V1 { + + /* + First Author: xuanlei.lin@tigergraph.com + First Commit Date: 2024-07-18 + + Recent Author: xuanlei.lin@tigergraph.com + Recent Commit Date: 2024-07-18 + + Repository: + https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community + + Maturity: + Production + + Description: + This query computes the total number of triangles in the graph. + It is optimized for small-world graphs to save memory. + + Publications: + NA + + TigerGraph Documentation: + https://docs.tigergraph.com/graph-ml/current/community-algorithms/triangle-counting + + Parameters: + v_type_set: + The set of vertex types to traverse. + e_type_set: + The set of edge types to traverse. + supernode_min_degree: + The minimum degree for a vertex to be considered a supernode. + The default value is 100000. + threshold: + The threshold for choosing initial pivot vertices. Only vertices whose product of indegree + and outdegree is at least this threshold will be considered candidates for the pivot vertex. + The default value is 100000. 
+ */ + + SumAccum @sum_outdegree; // Accumulator for the outdegree + SumAccum @sum_indegree; // Accumulator for the indegree + SumAccum @sum_degree_product; // Accumulator for the product of outdegree and indegree + OrAccum @or_in_frontier; // Flag to check if the vertex is in the frontier + OrAccum @or_tri_counted; // Flag to check if all triangles that contain the vertex have been counted + OrAccum @or_is_neighbor; // Flag to check if the vertex is a neighbor of nodes in the frontier + SetAccum @set_nodes_in_frontier; // Set of nodes in the frontier + SumAccum @@sum_tri_count; // The count of triangles + + // -------------------- 1. Initialization -------------------- + // Calculate the product of indegree and outdegree, + // and filter vertices with a product no less than the threshold + AllNodes = {v_type_set}; + PivotCandidates = SELECT s + FROM AllNodes:s-(e_type_set)-v_type_set:t + WHERE s != t + ACCUM s.@sum_outdegree += 1, + s.@sum_indegree += 1 + POST-ACCUM (s) + s.@sum_degree_product = s.@sum_indegree * s.@sum_outdegree + HAVING s.@sum_degree_product >= threshold; + + // -------------------- 2. 
Handle the supernodes -------------------- + // Count the number of triangles for supernodes + SuperNodes = SELECT s + FROM PivotCandidates:s + WHERE s.@sum_outdegree >= supernode_min_degree; + WHILE SuperNodes.size() > 0 DO + // Select some supernodes and set them as frontier + Nodes = SELECT s + FROM SuperNodes:s + LIMIT 10; + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier += TRUE; + + // Find neighbors of nodes in the frontier + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE s != t + AND t.@or_tri_counted == FALSE // Don't visit nodes whose all triangles have been counted + AND (t.@or_in_frontier == FALSE // Neighbor not in the frontier + OR getvid(s) > getvid(t)) // If neighbor is in the frontier, only consider one direction + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM (t) + t.@or_is_neighbor += TRUE; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) // Traverse only one direction of the undirected edge + AND t.@or_is_neighbor == TRUE + ACCUM @@sum_tri_count += COUNT(s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier); + + // Reset variables for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier = FALSE, + s.@or_tri_counted = TRUE; + + // Reset variables for the next iteration + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@or_is_neighbor = FALSE, + s.@set_nodes_in_frontier.clear(); + + // Remove visited vertices from the SuperNodes set + SuperNodes = SuperNodes MINUS Nodes; + PivotCandidates = PivotCandidates MINUS Nodes; + END; + + // -------------------- 3. 
Handle nodes in large WCCs -------------------- + // Count the number of triangles for nodes in large WCCs + WHILE PivotCandidates.size() > 0 DO + // Select the initial pivot vertex with the largest product of indegree and outdegree + Nodes = SELECT s + FROM PivotCandidates:s + ORDER BY s.@sum_degree_product DESC + LIMIT 1; + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier += TRUE; + + // Use BFS to find all elements in its connected component + WHILE Nodes.size() > 0 DO + // Find neighbors of nodes in the frontier + Neighbors = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE s != t + AND t.@or_tri_counted == FALSE // Don't visit nodes whose all triangles have been counted + AND (t.@or_in_frontier == FALSE // Neighbor not in the frontier + OR getvid(s) > getvid(t)) // If neighbor is in the frontier, only consider one direction + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM (t) + t.@or_is_neighbor += TRUE; + + // Calculate the number of triangles involving nodes in the frontier + Tmp = SELECT s + FROM Neighbors:s-(e_type_set)-v_type_set:t + WHERE getvid(s) > getvid(t) // Traverse only one direction of the undirected edge + AND t.@or_is_neighbor == TRUE + ACCUM @@sum_tri_count += COUNT(s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier); + + // Reset variables for nodes in the frontier + Nodes = SELECT s + FROM Nodes:s + POST-ACCUM (s) + s.@or_in_frontier = FALSE, + s.@or_tri_counted = TRUE; + + // Reset variables for the next iteration + Neighbors = SELECT s + FROM Neighbors:s + POST-ACCUM (s) + s.@or_is_neighbor = FALSE, + s.@set_nodes_in_frontier.clear(); + + // Use BFS to visit the next frontier + Nodes = SELECT t + FROM Nodes:s-(e_type_set:e)-v_type_set:t + WHERE t.@or_tri_counted == FALSE + POST-ACCUM t.@or_in_frontier += TRUE; + END; + + // Remove visited vertices from the PivotCandidates set + PivotCandidates = SELECT s + FROM PivotCandidates:s + WHERE s.@or_tri_counted == FALSE; + END; + + // -------------------- 
4. Handle nodes in small WCCs -------------------- + // For remaining vertices in small WCCs + Nodes = SELECT s + FROM AllNodes:s + WHERE s.@or_tri_counted == FALSE; + + // Build neighbor sets manually, only for vertices with smaller IDs in the triangle. + // This ensures that only two of the three vertices in a triangle will build neighbor sets. + Tmp = SELECT t + FROM Nodes:s-(e_type_set)-v_type_set:t + WHERE t.@or_tri_counted == FALSE + AND getvid(s) > getvid(t) + ACCUM t.@set_nodes_in_frontier += s + POST-ACCUM + t.@or_is_neighbor = TRUE; + + // Compute the intersection of neighbor sets to count triangles. + // This step ensures that each triangle is counted only once. + Tmp = SELECT t + FROM Nodes:s-(e_type_set)-:t + WHERE getvid(s) > getvid(t) + AND t.@or_is_neighbor == TRUE + ACCUM @@sum_tri_count += COUNT(s.@set_nodes_in_frontier INTERSECT t.@set_nodes_in_frontier); + + // -------------------- 5. Output -------------------- + // Output the results + PRINT @@sum_tri_count AS num_triangles; +}