Skip to content

Commit 0c6d666

Browse files
authored
Merge pull request #157 from tigergraph/ALGOS-266
[ALGOS-266] feat(algos): Improve the algorithm Label Propagation
2 parents d95a213 + 76fcb43 commit 0c6d666

File tree

1 file changed

+156
-74
lines changed

1 file changed

+156
-74
lines changed

GDBMS_ALGO/community/label_prop.gsql

Lines changed: 156 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1-
CREATE TEMPLATE QUERY GDBMS_ALGO.community.label_prop (SET<STRING> v_type_set, SET<STRING> e_type_set, INT maximum_iteration, INT print_limit,
2-
BOOL print_results = TRUE, STRING file_path = "", STRING result_attribute = "") SYNTAX V1 {
3-
1+
CREATE TEMPLATE QUERY GDBMS_ALGO.community.label_prop(
2+
SET<STRING> v_type_set,
3+
SET<STRING> e_type_set,
4+
UINT maximum_iteration = 10,
5+
UINT sample_edge_num = 1000,
6+
UINT batch_num = 12,
7+
INT print_limit,
8+
BOOL print_results = TRUE,
9+
STRING result_attribute = "",
10+
STRING file_path=""
11+
) FOR GRAPH MyGraph SYNTAX V1 {
412

513
/*
6-
First Author: <First Author Name>
7-
First Commit Date: <First Commit Date>
8-
9-
Recent Author: <Recent Commit Author Name>
10-
Recent Commit Date: <Recent Commit Date>
14+
First Author: [email protected]
15+
First Commit Date: 2024-07-15
1116

17+
Recent Author: [email protected]
18+
Recent Commit Date: 2024-07-16
1219

1320
Repository:
1421
https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/Community
@@ -17,89 +24,164 @@ CREATE TEMPLATE QUERY GDBMS_ALGO.community.label_prop (SET<STRING> v_type_set, S
1724
Production
1825

1926
Description:
20-
Partition the vertices into communities, according to the Label Propagation method.
21-
Indicate community membership by assigning each vertex a community ID.
22-
23-
Publications:
24-
NA
27+
This query partitions vertices into communities using the Label Propagation method.
28+
It assigns a community ID to each vertex based on its neighbors' community IDs.
2529

2630
TigerGraph Documentation:
2731
https://docs.tigergraph.com/graph-ml/current/community-algorithms/label-propagation
2832

2933
Parameters:
3034
v_type_set:
31-
Names of vertex types to use
35+
The set of vertex types to traverse.
3236
e_type_set:
33-
Names of edge types to use
37+
The set of edge types to traverse.
3438
maximum_iteration:
35-
Number of maximum iteration of the algorithm
39+
The maximum number of iterations for the algorithm.
40+
sample_edge_num:
41+
The maximum number of edges to sample per vertex for super nodes, i.e. vertices whose outdegree over e_type_set exceeds this value.
42+
batch_num:
43+
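The number of batches; must be at least 1 because it is used as a divisor. Vertices are assigned to batches by internal vertex ID modulo batch_num. Using batches reduces memory consumption; batching is skipped when batch_num is 1 or when the candidate set is smaller than one batch.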
3644
print_limit:
37-
If >=0, max number of vertices to output to JSON.
45+
If >= 0, the maximum number of vertices to output to JSON.
3846
print_results:
39-
If True, output JSON to standard output
47+
If True, output JSON to standard output. WARNING: Avoid printing results for large datasets.
4048
result_attribute:
41-
If not empty, store community id values (INT) to this attribute
49+
If not empty, store community ID values (INT) in this attribute.
4250
file_path:
43-
If not empty, write output to this file.
51+
If not empty, write CSV output to this file.
4452
*/
4553

46-
OrAccum @@or_changed = true;
47-
MapAccum<INT, INT> @map; # <communityId, numNeighbors>
48-
MapAccum<INT, INT> @@comm_sizes_map; # <communityId, members>
49-
SumAccum<INT> @sum_label, @sum_num;
50-
FILE f (file_path);
51-
Start = {v_type_set};
54+
TYPEDEF TUPLE <DOUBLE score, VERTEX community> MoveScore;
55+
MinAccum<VERTEX> @community_id; // Community ID of the node
56+
SumAccum<INT> @vid; // Vertex's internal ID
57+
SumAccum<INT> @batch_id; // Batch ID for the node
58+
SumAccum<INT> @degree; // Outdegree of the node over e_type_set edges
59+
SumAccum<INT> @@vertex_num; // Vertices per batch: total vertex count divided by batch_num
60+
MapAccum<VERTEX, SumAccum<DOUBLE>> @community_k_in_map; // Number of neighbors belonging to each community
61+
MaxAccum<MoveScore> @best_move; // Best move for the node with the highest score
62+
MaxAccum<DOUBLE> @@min_double; // Never assigned: keeps the MaxAccum<DOUBLE> initial (lowest) value, used as the score when resetting @best_move
63+
OrAccum @to_change_community; // Flag to check if the node needs to change community
64+
MapAccum<VERTEX, INT> @@comm_sizes_map; // Map: community ID -> size of the community
65+
FILE f(file_path); // File to write results to
66+
67+
// Initialization
68+
All_Nodes = {v_type_set};
69+
Tmp = SELECT s
70+
FROM All_Nodes:s -(e_type_set:e)- :t
71+
POST-ACCUM
72+
s.@community_id = s,
73+
s.@vid = getvid(s),
74+
s.@batch_id = s.@vid % batch_num,
75+
s.@degree = s.outdegree(e_type_set);
76+
@@vertex_num = All_Nodes.size();
77+
@@vertex_num = @@vertex_num / batch_num;
78+
79+
// Label propagation
80+
INT hop = 0;
81+
Candidates = All_Nodes;
82+
WHILE Candidates.size() > 0 AND hop < maximum_iteration DO
83+
hop = hop + 1;
84+
// Find the best move
85+
IF hop == 1 THEN // First iteration: pick the community of the highest-degree neighbor whose degree exceeds the node's own
86+
ChangedNodes = SELECT s
87+
FROM Candidates:s -(e_type_set:e)- :t
88+
WHERE s.@degree < t.@degree
89+
ACCUM s.@best_move += MoveScore(t.@degree, t.@community_id)
90+
POST-ACCUM
91+
IF s.@best_move.community != s.@community_id THEN
92+
s.@to_change_community = TRUE
93+
END
94+
HAVING s.@to_change_community == TRUE;
95+
ELSE // Remaining iterations
96+
IF Candidates.size() < @@vertex_num OR batch_num == 1 THEN // No batch processing
97+
ChangedNodes = SELECT s
98+
FROM Candidates:s -(e_type_set:e)- :t
99+
SAMPLE sample_edge_num EDGE WHEN s.outdegree(e_type_set) > sample_edge_num
100+
ACCUM s.@community_k_in_map += (t.@community_id -> 1)
101+
POST-ACCUM
102+
s.@best_move = MoveScore(@@min_double, s), // Reset best move
103+
FOREACH (community_id, k_in) IN s.@community_k_in_map DO
104+
s.@best_move += MoveScore(k_in, community_id)
105+
END,
106+
IF s.@best_move.community != s.@community_id THEN
107+
s.@to_change_community = TRUE
108+
END,
109+
s.@community_k_in_map.clear()
110+
HAVING s.@to_change_community == TRUE;
111+
ELSE // Use batch processing
112+
ChangedNodes = {};
113+
FOREACH batch_id IN RANGE[0, batch_num-1] DO
114+
Nodes = SELECT s
115+
FROM Candidates:s
116+
WHERE s.@batch_id == batch_id;
117+
Nodes = SELECT s
118+
FROM Nodes:s -(e_type_set:e)- :t
119+
SAMPLE sample_edge_num EDGE WHEN s.outdegree(e_type_set) > sample_edge_num
120+
ACCUM s.@community_k_in_map += (t.@community_id -> 1)
121+
POST-ACCUM
122+
s.@best_move = MoveScore(@@min_double, s), // Reset best move
123+
FOREACH (community_id, k_in) IN s.@community_k_in_map DO
124+
s.@best_move += MoveScore(k_in, community_id)
125+
END,
126+
IF s.@best_move.community != s.@community_id THEN
127+
s.@to_change_community = TRUE
128+
END,
129+
s.@community_k_in_map.clear()
130+
HAVING s.@to_change_community == TRUE;
131+
ChangedNodes = ChangedNodes UNION Nodes;
132+
END;
133+
END;
134+
END;
135+
136+
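// Handle adjacent nodes that would swap into each other's communities: the node with the lower score (or, on a tie, the larger internal ID) stays put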
137+
SwapNodes = SELECT s
138+
FROM ChangedNodes:s -(e_type_set:e)- :t
139+
WHERE s.@best_move.community == t.@community_id
140+
AND t.@to_change_community == TRUE
141+
AND t.@best_move.community == s.@community_id
142+
AND (s.@best_move.score < t.@best_move.score
143+
OR (abs(s.@best_move.score - t.@best_move.score) < 0.00000000001
144+
AND s.@vid > t.@vid))
145+
POST-ACCUM
146+
s.@to_change_community = FALSE;
147+
ChangedNodes = ChangedNodes MINUS SwapNodes;
52148

53-
# Assign unique labels to each vertex
54-
Start = SELECT s
55-
FROM Start:s
56-
ACCUM s.@sum_label = getvid(s);
149+
// Update community IDs
150+
ChangedNodes = SELECT s
151+
FROM ChangedNodes:s
152+
POST-ACCUM
153+
s.@community_id = s.@best_move.community,
154+
s.@to_change_community = FALSE;
57155

58-
# Propagate labels to neighbors until labels converge or the max iterations is reached
59-
WHILE @@or_changed == true LIMIT maximum_iteration DO
60-
@@or_changed = false;
61-
Start = SELECT s
62-
FROM Start:s -(e_type_set:e)- :t
63-
ACCUM t.@map += (s.@sum_label -> 1) # count the occurrences of neighbor's labels
64-
POST-ACCUM
65-
INT max_v = 0,
66-
INT label = 0,
67-
# Iterate over the map to get the neighbor label that occurs most often
68-
FOREACH (k,v) IN t.@map DO
69-
CASE WHEN v > max_v THEN
70-
max_v = v,
71-
label = k
72-
END
73-
END,
74-
# When the neighbor search finds a label AND it is a new label
75-
# AND the label's count has increased, update the label.
76-
CASE WHEN label != 0 AND t.@sum_label != label AND max_v > t.@sum_num THEN
77-
@@or_changed += true,
78-
t.@sum_label = label,
79-
t.@sum_num = max_v
80-
END,
81-
82-
END;
156+
// Find candidates for the next iteration: neighbors of changed nodes that are still in a different community
157+
Candidates = SELECT t
158+
FROM ChangedNodes:s -(e_type_set:e)- :t
159+
WHERE t.@community_id != s.@community_id;
160+
END;
83161

84-
Start = {v_type_set};
85-
Start = SELECT s
86-
FROM Start:s
87-
POST-ACCUM
88-
IF result_attribute != "" THEN
89-
s.setAttr(result_attribute, s.@sum_label)
90-
END,
91-
92-
IF file_path != "" THEN
93-
f.println(s, s.@sum_label)
94-
END,
95-
96-
IF print_results THEN
97-
@@comm_sizes_map += (s.@sum_label -> 1)
98-
END
99-
LIMIT print_limit;
162+
// Output results
163+
Nodes = SELECT s
164+
FROM All_Nodes:s
165+
POST-ACCUM
166+
IF result_attribute != "" THEN
167+
s.setAttr(result_attribute, getvid(s.@community_id))
168+
END,
169+
IF print_results THEN
170+
@@comm_sizes_map += (s.@community_id -> 1)
171+
END,
172+
IF file_path != "" THEN
173+
IF v_type_set.size() == 1 THEN
174+
f.println(s.id, s.@community_id)
175+
ELSE
176+
VERTEX node = s.@community_id,
177+
f.println(s.type, s, node.type, node)
178+
END
179+
END
180+
LIMIT print_limit;
100181

101-
IF print_results THEN
182+
// Print results if print_results is True
183+
IF print_results THEN
102184
PRINT @@comm_sizes_map;
103-
PRINT Start[Start.@sum_label];
104-
END;
185+
PRINT Nodes[Nodes.@community_id];
186+
END;
105187
}

0 commit comments

Comments
 (0)