-
Notifications
You must be signed in to change notification settings - Fork 72
/
Copy pathtg_fastRP.gsql
279 lines (244 loc) · 10.5 KB
/
tg_fastRP.gsql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
CREATE QUERY tg_fastRP(
SET<STRING> v_type_set,
SET<STRING> e_type_set,
SET<STRING> output_v_type_set,
STRING iteration_weights,
FLOAT beta,
INT embedding_dimension,
INT default_index = 0,
INT default_length,
FLOAT default_weight = 1,
SET<STRING> embedding_dim_map,
INT sampling_constant = 3,
INT random_seed = 42,
STRING result_attribute="",
STRING component_attribute="",
INT batch_number=0,
STRING filepath="",
BOOL print_results=FALSE,
INT choose_k=5) SYNTAX V1 {
/*
First Author: Wyatt Joyner
First Commit Date:
Recent Author: Parker Erickson
Recent Commit Date:
Repository:
https://github.com/tigergraph/gsql-graph-algorithms/tree/master/algorithms/GraphML
Maturity:
Beta
Description:
FastRP is a vertex embedding algorithm. Embedding algorithms create vector representations of each
vertex in a reduced dimension ("embedding dimension"). These embeddings then can be used in downstream
tasks such as cosine similarity between two vertices, or machine learning algorithms. FastRP in particular
utilizes random projections to preserve pairwise distances between two vertices given their topological information
in the graph.
The original FastRP implementation does not account for heterogenous graphs (graphs with multiple edge types).
This implementation allows users to section off certain areas of the embedding vector to be modified when defined
edge types are traversed. If users do not specify any particular embedding regions, all edge types will be considered.
NOTE: This query needs to be modified based upon your schema to set the embedding
attribute accordingly if you wish to store the embeddings in your graph. Each vertex should have
an attribute with type LIST<DOUBLE>. If you do not wish to store the embeddings in graph, remove the applicable lines.
NOTE: FastRP was originally created to work on undirected graphs. It can work on directed graphs, although
we recommend to have reverse edges in your schema so the embeddings can be passed in both directions. If
using a directed graph, there is a modification to allow it to run with sink vertices, but this may
affect the embeddings.
Publications:
https://arxiv.org/pdf/1908.11512.pdf
TigerGraph Documentation:
Parameters:
v_type_set:
Set of all vertex types to traverse in the graph.
e_type_set:
Set of all edge types to traverse in the graph.
output_v_type_set:
Set of vertex types to produce output embeddings for.
iteration_weights:
A comma-separated string of numbers to weight each iteration of the embedding process.
beta:
A float value that normalizes high-degree vertices in the graph. Usually between -1 and 0, where
-1 penalizes high-degree vertices heavily. If desired, one can use a positive value to weight
high-degree vertices more than low-degree vertices.
embedding_dimension:
The total dimension of the desired embedding.
default_index:
Index in the embedding vector default edges (edge types that are not explicitly defined in `embedding_dim_map`) start to
be incorporated in the vector.
default_length:
Length of the embedding region that incorporates default edge type information.
default_weight:
Influence that the default edge types have in the region of the embedding they are incorporated. Defaults to 1.
embedding_dim_map:
Set of comma-separated tuples of '<edge type>,<length>,<weight>,<starting index>'. Use if incorporating specific edge types
into certain regions of the embedding is desired. If the empty set is used, all edge types will be considered "default" edge types.
sampling_constant:
Parameter defined from the FastRP paper. Controls the sparsity of the embedding. Usually an integer between 1 (fully dense embedding)
and 3 works well.
random_seed:
Seed for random number generator. Defaults to 42.
result_attribute:
Attribute to store the result embedding to. Must be of type LIST<FLOAT>. If empty, algorithm does not write the embedding to an attribute.
component_attribute:
If batching the embedding process by connected components, specify the vertex attribute that contains the
integer ID of the component.
batch_number:
If batching by connected component, specify which batch to compute the embeddings for.
filepath:
If specified, what file to write the computed embeddings to.
print_results:
If TRUE, all vertex embeddings will be printed.
choose_k:
Will print a sample of K vertices and their embeddings. Defaults to 5.
*/
TYPEDEF TUPLE<INT min_dim, INT max_dim, FLOAT weight> Dim_Tuple;
MapAccum<STRING, Dim_Tuple> @@feature_dim_map, @@embedding_dim_map;
MapAccum<INT, SumAccum<FLOAT>> @embedding_arr;
MapAccum<INT, SumAccum<FLOAT>> @final_embedding_arr;
ListAccum<DOUBLE> @final_embedding_list;
SumAccum<FLOAT> @L, @@m;
ListAccum<FLOAT> @@weights;
OrAccum @include, @stop;
INT weight_idx, defined_dims;
STRING temp_dim_key;
INT idx_1, idx_2, idx_3, temp_length, temp_weight, emb_idx;
INT _edge_sample_cutoff;
FILE f(filepath);
// PRE-INITIALIZATION
// initialization of internal variables for PRNG
INT _mod, _mult, _inc;
_mod = pow(2, 31)-1;
_mult = 1664525;
_inc = 1013904223;
FLOAT p1, p2, p3, v1, v2, v3;
v1 = sqrt(sampling_constant);
v2 = -v1;
v3 = 0.0;
p1 = 0.5 / sampling_constant;
p2 = p1;
p3 = 1 - 1.0 / sampling_constant;
// extract explicitly defined topological embedding regions
FOREACH string_tuple IN embedding_dim_map DO
idx_1 = instr(string_tuple, ",");
idx_2 = instr(string_tuple, ",", 0, 2);
idx_3 = instr(string_tuple, ",", 0, 3);
temp_dim_key = substr(string_tuple, 0, idx_1);
temp_length = str_to_int(substr(string_tuple, idx_1+1, idx_2-1-idx_1));
temp_weight = tg_str_to_float(substr(string_tuple, idx_2+1));
idx_3 = str_to_int(substr(string_tuple, idx_3+1));
@@embedding_dim_map += (temp_dim_key -> Dim_Tuple(idx_3, idx_3+temp_length, temp_weight));
defined_dims = defined_dims + temp_length;
END;
@@embedding_dim_map += ("default" -> Dim_Tuple(default_index, default_index+default_length, default_weight));
PRINT @@embedding_dim_map;
// extract weights into usable accumulator
idx_1 = 0;
weight_idx = 1;
WHILE idx_1 != -1 DO
idx_2 = instr(iteration_weights, ",", idx_1);
IF idx_2 == -1 THEN
@@weights += tg_str_to_float(substr(iteration_weights, idx_1));
BREAK;
END;
@@weights += tg_str_to_float(substr(iteration_weights, idx_1, idx_2-idx_1));
idx_1 = idx_2+1;
weight_idx = weight_idx + 1;
END;
verts = {v_type_set};
// component_attr should indicate which batch each vertex belongs to, if none defined, just use all vertex types provided
IF component_attribute != "" THEN
verts =
SELECT s FROM verts:s
WHERE s.getAttr(component_attribute, "INT") == batch_number
POST-ACCUM s.@include += TRUE;
ELSE
verts =
SELECT s FROM verts:s POST-ACCUM s.@include += TRUE;
END;
// INITIALIZATION
// L stores the normalized diagonal elements of an inverse degree matrix
// this is defined in the original algorithm and is useful for initialization
verts =
SELECT s FROM verts:s -(e_type_set:e)- v_type_set:t
WHERE t.@include == TRUE
ACCUM @@m += 1
POST-ACCUM s.@L = pow(s.outdegree(e_type_set) / @@m, beta);
// randomly initialize the topological embedding regions
verts =
SELECT s FROM verts:s -(e_type_set:e)- v_type_set:t
WHERE t.@include == TRUE
ACCUM
// PRNG code
INT inc = (getvid(s)+_inc),
INT r = ((inc+_mult*random_seed) % _mod),
FLOAT mr = 0,
STRING temp_e_type = e.type,
// if the edge type wasn't explicitly specified, but was provided in e_type_set,
// then its information will reside in the 'default' region of the embedding vector
IF @@embedding_dim_map.containsKey(e.type) == FALSE THEN
temp_e_type = "default"
END,
FLOAT weight = @@embedding_dim_map.get(temp_e_type).weight,
FOREACH i IN RANGE[@@embedding_dim_map.get(temp_e_type).min_dim, @@embedding_dim_map.get(temp_e_type).max_dim-1] DO
r = ((r * _mult + inc) % _mod),
mr = r / (_mod * 1.0),
if (mr <= p1) THEN
t.@embedding_arr += (i -> v1 * s.@L * weight)
ELSE IF (mr <= p1 + p2) THEN
t.@embedding_arr += (i -> v2 * s.@L * weight)
ELSE
t.@embedding_arr += (i -> v3 * s.@L * weight)
END
END;
// MAIN EXECUTION
FOREACH depth IN RANGE[0, @@weights.size()-1] DO
// propagate embeddings to neighbors and normalize
verts =
SELECT s FROM verts:s -(e_type_set)- v_type_set:t
WHERE t.@include == TRUE
ACCUM
t.@embedding_arr += s.@embedding_arr
POST-ACCUM
// first calculate square sum to help normalize
FLOAT square_sum = 0,
FLOAT out = max([1.0, t.outdegree(e_type_set)]),
FOREACH (i,total) IN t.@embedding_arr DO
square_sum = square_sum + pow(total / out, 2)
END,
square_sum = sqrt(square_sum),
FLOAT value = 0,
FOREACH i IN RANGE[0, embedding_dimension-1] DO
IF square_sum == 0.0 THEN
BREAK
END,
t.@final_embedding_arr += (i -> t.@embedding_arr.get(i) / out / square_sum * @@weights.get(depth)),
value = t.@embedding_arr.get(i) / out / square_sum,
t.@embedding_arr += (i -> -t.@embedding_arr.get(i)),
t.@embedding_arr += (i -> value)
END;
depth = depth + 1;
END;
// OUTPUT
// clearing the unneeded arrays will save on memory
verts =
SELECT s FROM verts:s
POST-ACCUM s.@embedding_arr.clear();
// the arrays are converted to lists for compatibility with GSQL's LIST type
verts =
SELECT s FROM verts:s
WHERE output_v_type_set.size() == 0 OR output_v_type_set.contains(s.type)
POST-ACCUM
FOREACH i IN RANGE[0, embedding_dimension-1] DO
s.@final_embedding_list += s.@final_embedding_arr.get(i) / @@weights.size() // Average by # of iterations
END;
IF print_results THEN
res = SELECT a FROM verts:a;
PRINT res[res.@final_embedding_arr];
END;
// (un)comment depending on whether you want to write to graph
// GSQL does not yet support dynamic setAttr calls for LIST types
IF result_attribute != "" THEN
storeEmbeddings = SELECT s FROM verts:s POST-ACCUM s.setAttr(result_attribute, s.@final_embedding_list);
END;
sample_verts =
SELECT s FROM verts:s LIMIT choose_k;
PRINT sample_verts;
}