-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtrsm.ecl
324 lines (316 loc) · 17.7 KB
/
trsm.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
/*##############################################################################
## HPCC SYSTEMS software Copyright (C) 2016 HPCC Systems. All rights reserved.
############################################################################## */
IMPORT $ as PBblas;
IMPORT PBblas.Types;
IMPORT Std.BLAS;
IMPORT PBblas.internal as int;
IMPORT int.Types as iTypes;
IMPORT int.MatDims;
IMPORT int.Converted;
IMPORT std.system.Thorlib;
value_t := Types.value_t;
matrix_t := iTypes.matrix_t;
Triangle := Types.Triangle;
Lower := Types.Triangle.Lower;
Upper := Types.Triangle.Upper;
Diagonal := Types.Diagonal;
Side := Types.Side;
Layout_Part := iTypes.Layout_Part;
Layout_Cell := Types.Layout_Cell;
Layout_Dims := iTypes.Layout_Dims;
Layout_Target := iTypes.Layout_Target;
dimension_t := Types.dimension_t;
OpType := iTypes.OpType;
BaseTerm := 1;
RightTerm := 3;
LeftTerm := 2;
/**
* Partitioned block parallel triangular matrix solver.
*
* <p>Solves for X using: AX = B or XA = B.
* <p>A is is a triangular matrix, X and B have the same dimensions.
* <p>A may be an upper triangular matrix (UX = B or XU = B), or a lower
* triangular matrix (LX = B or XL = B).
* <p>Allows optional transposing and scaling of A.
* <p>Partially based upon an approach discussed by MJ DAYDE, IS DUFF, AP CERFACS.
* A Parallel Block implementation of Level-3 BLAS for MIMD Vector Processors
* ACM Tran. Mathematical Software, Vol 20, No 2, June 1994 pp 178-193
* and other papers about PB-BLAS by Choi and Dongarra.
*
* <p>This module supports the "Myriad" style interface, allowing many independent problems
* to be worked on at once. Corresponding A and B matrixes are related by a common
* work-item identifier (wi_id) within each cell of the matrix. The returned X matrix
* will contain cells for the same set of work-items as specified for the A and B matrices.
*
* @param s Types.Side enumeration indicating whether we are solving AX = B or XA = B
* @param tri Types.Triangle enumeration indicating whether we are solving an Upper or
* Lower triangle.
* @param transposeA Boolean indicating whether or not to transpose the A matrix before
* solving.
* @param diag Types.Diagonal enumeration indicating whether A is a unit matrix or not.
* This is primarily used after factoring matrixes using getrf (LU factorization).
* That module produces a factored matrix stored within the same space as the
* original matrix. Since the diagonal is used by both factors, by convention,
* the Lower triangle has a unit matrix (diagonal all 1's) while the Upper
* triangle uses the diagonal cells. Setting this to UnitTri, causes the
* contents of the diagonal to be ignored, and assumed to be 1. NotUnitTri
* should be used for most other cases.
* @param alpha Multiplier to scale A.
* @param A_in The A matrix in Layout_Cell format.
* @param B_in The B matrix in Layout_Cell format.
* @return X solution matrix in Layout_Cell format.
* @see Types.Layout_Cell
* @see Types.Triangle
* @see Types.Side
*/
EXPORT DATASET(Layout_Cell) trsm(Side s, Triangle tri, BOOLEAN transposeA, Diagonal diag,
value_t alpha,
DATASET(Layout_Cell) A_in,
DATASET(Layout_Cell) B_in) := FUNCTION
nodes := Thorlib.nodes();
// Get the basic A matrix dimensions
a_dims_n := MatDims.FromCells(A_in, 'A');
a_dims_t := MatDims.TransposeDims(a_dims_n);
// a_dims := ASSERT(IF(transposeA, a_dims_t, a_dims_n), FALSE, m_label + '(' + wi_id + '}: ' +
// 'r,c=' + m_rows + ', ' + m_cols + ' br,bc=' + block_rows + ', ' + block_cols + ' rb, cb = ' + row_blocks + ', ' + col_blocks);
a_dims := IF(transposeA, a_dims_t, a_dims_n);
// Get the basic B matrix dims
// b_dims := ASSERT(MatDims.FromCells(B_in, 'B'), FALSE, m_label + '(' + wi_id + '}: ' +
// 'r,c=' + m_rows + ', ' + m_cols + ' br,bc=' + block_rows + ', ' + block_cols + ' rb, cb = ' + row_blocks + ', ' + col_blocks);
b_dims := MatDims.FromCells(B_in, 'B');
op_type := IF(s = Side.Ax, OpType.solve_Ax, OpType.solve_xA);
// Get partitioning schemes for A and B
// part_dims := ASSERT(MatDims.PartitionedFromDimsForOp(op_type, a_dims + b_dims), FALSE, m_label + '(' + wi_id + '}: ' +
// 'r,c=' + m_rows + ', ' + m_cols + ' br,bc=' + block_rows + ', ' + block_cols + ' rb, cb = ' + row_blocks + ', ' + col_blocks);
part_dims := MatDims.PartitionedFromDimsForOp(op_type, a_dims + b_dims);
a_part_dims := part_dims(m_label='A');
b_part_dims := part_dims(m_label='B');
// Convert A and B to partitions. Transpose A in the process if requested
a_parts := SORTED(Converted.FromCells(a_part_dims, A_in, transposeA), partition_id, wi_id);
b_parts := SORTED(Converted.FromCells(b_part_dims, B_in), partition_id, wi_id);
// Definitions for four cases to be handled
Upper_xA := s=Side.xA AND tri=Upper;
Upper_Ax := s=Side.Ax AND tri=Upper;
Lower_xA := s=Side.xA AND tri=Lower;
Lower_Ax := s=Side.Ax AND tri=Lower;
// Extend A partitions to include B's row_blocks. We will need this later and
// is probably best to do here in one pass
Layout_Part_ext := RECORD(Layout_Part)
dimension_t b_row_blocks := 0;
dimension_t b_col_blocks := 0;
END;
Layout_Part_ext extend_a(Layout_Part l, Layout_Dims r) := TRANSFORM
SELF.b_row_blocks := r.row_blocks;
SELF.b_col_blocks := r.col_blocks;
SELF := l;
END;
a_parts_ext1 := JOIN(a_parts, b_part_dims, LEFT.wi_id = RIGHT.wi_id, extend_a(LEFT, RIGHT),
LOOKUP) : ONWARNING(4531, IGNORE);
a_parts_ext := SORTED(a_parts_ext1, partition_id, wi_id);
// Loop body, by diagonal, right to left for upper and left to right for lower
loopBody(DATASET(Layout_Part) parts, UNSIGNED4 loop_c, dimension_t max_a_dim,
dimension_t max_b_dim) := FUNCTION
// Process is as follows:
// - Solve each block at the current row or column position with its diagonal in A
// This forms the base (starting point) for the solution
// - Subtract from the base every earlier solution multiplied by its corresponding
// A coefficient.
// reverse implies left-to-right for column solutions or bottom-to-top for row solutions
reverse := IF(tri=Upper, s=Side.Ax, s=Side.xA);
// rc_pos is the current row or column index. It runs forward or backward depending
// on setting of "reverse" above.
rc_pos := IF(reverse, 1 + max_a_dim - loop_c, loop_c);
// remaining indicates how many more rows or columns there are to process.
remaining := max_a_dim - loop_c; // Rows or Columns same
// Transform to solve each block
Layout_Part solveBlock(Layout_Part b_rec, Layout_Part a_rec) := TRANSFORM
b_part_rows := b_rec.part_rows;
b_part_cols := b_rec.part_cols;
a_part_rows := a_rec.part_rows;
SELF.mat_part := BLAS.dtrsm(s, tri, FALSE, diag,
b_part_rows,
b_part_cols,
a_part_rows,
alpha, a_rec.mat_part, b_rec.mat_part);
SELF := b_rec;
END;
// Solve each of the B blocks at the current index with the diagonal
// of A for that row or column.
solved := JOIN(parts, a_parts(block_row=rc_pos AND block_col=rc_pos),
LEFT.wi_id = RIGHT.wi_id
AND ((s=Side.Ax AND LEFT.block_row=RIGHT.block_row)
OR (s=Side.xA AND LEFT.block_col=RIGHT.block_col)),
solveBlock(LEFT, RIGHT), SMART) : ONWARNING(4531, IGNORE);
// Base parts stay in place, just need routing for transform
Layout_Target prepBase(Layout_Part base) := TRANSFORM
SELF.t_part_id := base.partition_id;
SELF.t_node_id := base.node_id;
SELF.t_block_row:= base.block_row;
SELF.t_block_col:= base.block_col;
SELF.t_term := BaseTerm;
SELF := base;
END;
// The blocks at the current rc_pos have been solved. The rest need to be
// updated based on this solution.
// Solved blocks not in update, loop filter has removed prior solves
// Also, when we are going in reverse, we are going to skip any
// blocks that are lower than the rc_pos. We don't want to update until
// after we have processed the first diagonal block.
parts4update := parts((s=Side.Ax AND block_row <> rc_pos AND (rc_pos <= row_blocks))
OR (s=Side.xA AND block_col <> rc_pos AND (rc_pos <= col_blocks)));
// Save any skipped records for next time through.
skipped := parts((s=Side.Ax AND rc_pos > row_blocks) OR (s=Side.xA AND rc_pos > col_blocks));
// All later blocks get their base established by starting with their corresponding
// B value.
need_upd := SORTED(PROJECT(parts4update, prepBase(LEFT)), t_part_id, wi_id);
// Functions to replicate B and A blocks for propagation to their targets
// Replicate B blocks Function
Layout_Target repPartB(Layout_Part inPart, dimension_t repl) := TRANSFORM
row_base := IF(s=Side.Ax, rc_pos, 0);
col_base := IF(s=Side.xA, rc_pos, 0);
offset := IF(reverse, -repl, repl);
t_row := IF(s=Side.Ax,row_base + offset, inPart.block_row);
t_col := IF(s=Side.xA, col_base + offset, inPart.block_col);
// Skip if we're outside our bounds due to use of max_a_dim in the normalize
valid := t_row <= inPart.row_blocks AND t_col <= inPart.col_blocks;
b_row_blocks := IF(valid, inPart.row_blocks, SKIP);
t_part := ((t_col-1) * b_row_blocks) + t_row;
SELF.t_part_id := t_part;
SELF.t_node_id := (HASH32(inPart.wi_id) + t_part-1) % nodes;
SELF.t_block_row := t_row;
SELF.t_block_col := t_col;
SELF.t_term := IF(s=Side.Ax, RightTerm, LeftTerm);
SELF := inPart;
END;
// Replicate A blocks Function
Layout_Target repPartA(Layout_Part_ext inPart, dimension_t repl) := TRANSFORM
t_row := IF(s=Side.xA, repl, inPart.block_row);
t_col := IF(s=Side.Ax, repl, inPart.block_col);
// We need B's row blocks amd col_blocks here, which we couldn't normally
// get from A, which is why we extended A to include B's block counts.
b_row_blocks := inPart.b_row_blocks;
b_col_blocks := inPart.b_col_blocks;
// Skip if we're outside our bounds due to use of max_a_dim in the normalize
valid := t_row <= b_row_blocks AND t_col <= b_col_blocks;
t_part := IF(valid, ((t_col-1) * b_row_blocks) + t_row, SKIP);
SELF.t_part_id := t_part;
SELF.t_node_id := (HASH32(inPart.wi_id) + t_part-1) % nodes;
SELF.t_block_row := t_row;
SELF.t_block_col := t_col;
SELF.t_term := IF(s=Side.Ax, LeftTerm, RightTerm);
SELF := inPart;
END;
// Replicate each solved block to all remaining rows or columns. These will
// be combined with (i.e. multiplied by) the corresponding A coefficients, and
// subtracted from the base for that row or column.
s0_repl := NORMALIZE(solved, remaining, repPartB(LEFT, COUNTER));
replSolv := SORT(DISTRIBUTE(s0_repl, t_node_id), t_part_id, wi_id, LOCAL);
// Replicate the A blocks and route to the same target partition id as the
// corresponding solved blocks. In the reverse cases, we eliminate any work-items
// where we haven't reached the first solved block.
neededCf := a_parts_ext((Upper_Ax AND block_col=rc_pos AND block_row<rc_pos AND rc_pos <= row_blocks )
OR (Lower_Ax AND block_col=rc_pos AND block_row>rc_pos)
OR (Upper_xA AND block_row=rc_pos AND block_col>rc_pos)
OR (Lower_xA AND block_row=rc_pos AND block_col<rc_pos AND rc_pos <= row_blocks));
c0_repl := NORMALIZE(neededCf, max_b_dim, repPartA(LEFT, COUNTER));
replCoef := SORT(DISTRIBUTE(c0_repl, t_node_id), t_part_id, wi_id, LOCAL);
// Transform to multiply each solved block by its A coefficient and subract from
// the base.
Layout_Target updatePart(Layout_Target l, DATASET(Layout_Target) blocks) := TRANSFORM
matrix_t EmptyMat := [];
have_lft:= EXISTS(blocks(t_term = LeftTerm));
have_rgt:= EXISTS(blocks(t_term = RightTerm));
do_mult := have_lft AND have_rgt;
have_base := EXISTS(blocks(t_term=BaseTerm));
// We need dimension information about B matrix in order to calc the following.
// We know that the base term and either the right or left term will be derived
// from B, so we can use it's dimension info. The partition id will be the same
// for all blocks, but the B will give us more dimension info.
// In the event that there is no B part, we'll skip, since the partition will
// always be zeros in that case.
have_B_term := IF(s=Side.Ax, have_lft, have_rgt);
B_term := IF(s=Side.Ax, blocks(t_term=RightTerm)[1], blocks(t_term=LeftTerm)[1]);
valid := IF(have_base OR have_B_term, TRUE, SKIP);
proto := IF(valid AND have_B_term, B_term, blocks(t_term=BaseTerm)[1]);
base_set := IF(have_base, blocks(t_term=BaseTerm)[1].mat_part, EmptyMat);
lft_set := blocks(t_term=LeftTerm)[1].mat_part;
lft_cols:= blocks(t_term=LeftTerm)[1].part_cols;
rgt_set := blocks(t_term=RightTerm)[1].mat_part;
part_id := proto.t_part_id; //all records have same value
SELF.node_id := proto.t_node_id; // all records have the same
SELF.partition_id := part_id;
block_rows := ROUNDUP(proto.m_rows / proto.row_blocks);
block_cols := ROUNDUP(proto.m_cols / proto.col_blocks);
first_row := ((part_id-1) % proto.row_blocks) * block_rows + 1;
first_col := ((part_id-1) DIV proto.row_blocks) * block_cols + 1;
SELF.block_row := proto.t_block_row;
SELF.block_col := proto.t_block_col;
SELF.first_row := first_row;
SELF.first_col := first_col;
part_rows := MIN([block_rows, proto.m_rows - first_row + 1]);
part_cols := MIN([block_cols, proto.m_cols - first_col + 1]);
SELF.part_rows := part_rows;
SELF.part_cols := part_cols;
// Note that dgemm here multiplies the current solution by the corresponding A
// coefficient, negates it (i.e. -1/alpha) and adds to the current B value.
SELF.mat_part := IF(do_mult,
BLAS.dgemm(FALSE, FALSE,
part_rows, // M
part_cols, // N
lft_cols, // K
-1/alpha, lft_set, rgt_set, 1.0, base_set),
base_set);
SELF.t_node_id := proto.t_node_id;
SELF.t_part_id := proto.t_part_id;
SELF.t_block_row:= proto.t_block_row;
SELF.t_block_col:= proto.t_block_col;
SELF.t_term := proto.t_term;
SELF := proto;
END;
// Merge all of the targets for each partition
// Need_upd has the base for each partition (original B value)
// ReplSolv has the earlier solve to be multiplied by the corresponding A coefficients.
// and subtracted from the base
// ReplCoef has the A coefficients to be multiplied with the previous solved blocks.
inpSet := MERGE(need_upd, replCoef, replSolv, SORTED(t_part_id, wi_id), LOCAL);
// For each non-solved partition-id, multiply each A coefficient and its
// corresponding solved
// block and negate to subtract the term from the base.
inpSetGrouped := GROUP(inpSet, t_part_id, wi_id, LOCAL);
updated := ROLLUP(inpSetGrouped, GROUP, updatePart(LEFT, ROWS(LEFT)));
// assemble the revised parts
// Convert from target to partition. Targets should already be distributed
// correctly based on node_id, so shouldn't need to re-distribute.
parts_updated := PROJECT(updated, Layout_Part);
// Each block was either updated or solved or skipped. Return them all.
rslt := SORT(parts_updated + solved + skipped, partition_id, wi_id, LOCAL);
RETURN rslt;
END; // loopBody
// Maximum dimension of A for all work-items
max_a_dim := MAX(a_part_dims, a_part_dims.row_blocks);
max_b_rows := MAX(b_part_dims, b_part_dims.row_blocks);
max_b_cols := MAX(b_part_dims, b_part_dims.col_blocks);
// The max dimension of B's unique dimension (i.e. the one not shared with A)
max_b_dim := IF(s=Side.Ax, max_b_cols, max_b_rows);
// Run the solver
// Each loop runs one row or column of the solution. The four variants are:
// Lower_Ax -- LX = B : Down -- Solve rows of A from top to bottom
// Lower_xA -- XL = B : Left -- Solve columns of A from right to left
// Upper_Ax -- UX = B : Up -- Solve rows of A from bottom to top
// Upper_xA -- XU = B : Right -- Solve columns of A from left to right
// where L := A is a Lower triangular matrix and U := A is a Upper triangular
// matrix. X is the result being solved for.
// Since A and B may contain many separate matrixes (via myriad interface), use
// the loop-count of the largest A matrix. For any smaller matrixes we won't
// find any work until we are in range of its rows or columns.
x_parts := LOOP(b_parts, max_a_dim, // A is square, so xA and Ax same count
(Upper_Ax AND LEFT.block_row <= 1 + max_a_dim - COUNTER) OR
(Lower_Ax AND LEFT.block_row >= COUNTER) OR
(Upper_xA AND LEFT.block_col >= COUNTER) OR
(Lower_xA AND LEFT.block_col <= 1 + max_a_dim - COUNTER),
loopBody(ROWS(LEFT), COUNTER, max_a_dim, max_b_dim));
// Convert from partitions back to cells
x_cells := Converted.FromParts(x_parts);
return x_cells;
END;