Skip to content

Commit 368f613

Browse files
committed
wip
1 parent 16898a3 commit 368f613

File tree

3 files changed

+97
-10
lines changed

3 files changed

+97
-10
lines changed

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
156156
code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
157157
code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
158158
}
159-
CeedCallBackend(CeedBasisDestroy(&basis));
160159

161160
// Load basis data
162161
code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
@@ -240,6 +239,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
240239
break; // TODO: Not implemented
241240
// LCOV_EXCL_STOP
242241
}
242+
CeedCallBackend(CeedBasisDestroy(&basis));
243243
return CEED_ERROR_SUCCESS;
244244
}
245245

@@ -319,10 +319,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
319319
<< strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
320320
break;
321321
}
322+
case CEED_RESTRICTION_POINTS: {
323+
CeedInt comp_stride;
324+
325+
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
326+
code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
327+
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
328+
code << " // CompStride: " << comp_stride << "\n";
329+
data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
330+
code << " ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size"
331+
<< var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
332+
break;
333+
}
322334
// LCOV_EXCL_START
323335
case CEED_RESTRICTION_ORIENTED:
324336
case CEED_RESTRICTION_CURL_ORIENTED:
325-
case CEED_RESTRICTION_POINTS:
326337
break; // TODO: Not implemented
327338
// LCOV_EXCL_STOP
328339
}
@@ -358,10 +369,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
358369
<< strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
359370
break;
360371
}
372+
case CEED_RESTRICTION_POINTS: {
373+
CeedInt comp_stride;
374+
375+
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
376+
code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
377+
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
378+
code << " // CompStride: " << comp_stride << "\n";
379+
data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
380+
code << " WriteLVecAtPoints" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
381+
<< ", elem, indices.outputs[" << i << "], points.num_per_elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
382+
break;
383+
}
361384
// LCOV_EXCL_START
362385
case CEED_RESTRICTION_ORIENTED:
363386
case CEED_RESTRICTION_CURL_ORIENTED:
364-
case CEED_RESTRICTION_POINTS:
365387
break; // TODO: Not implemented
366388
// LCOV_EXCL_STOP
367389
}
@@ -406,20 +428,35 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
406428
}
407429
break;
408430
case CEED_EVAL_INTERP:
409-
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
410-
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
411-
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
431+
if (is_at_points) {
432+
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
433+
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
434+
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
435+
} else {
436+
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n";
437+
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
438+
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
439+
440+
code << " InterpAtPoints<" << dim << ", num_comp" << var_suffix << ", max_num_points, " << Q_name
441+
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
442+
}
412443
break;
413444
case CEED_EVAL_GRAD:
414445
if (use_3d_slices) {
415446
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
416447
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
417448
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
418449
} else {
419-
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
420-
code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
421-
<< ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
422-
<< var_suffix << ");\n";
450+
if (is_at_points) {
451+
} else {
452+
code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*max_num_points];\n";
453+
code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
454+
<< ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
455+
456+
code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
457+
<< ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
458+
<< var_suffix << ");\n";
459+
}
423460
}
424461
break;
425462
case CEED_EVAL_WEIGHT: {

backends/cuda-gen/ceed-cuda-gen.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) {
3939

4040
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen));
4141
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen));
42+
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen));
4243
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda));
4344
return CEED_ERROR_SUCCESS;
4445
}

include/ceed/jit-source/cuda/cuda-gen-templates.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,22 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt
6363
}
6464
}
6565

66+
//------------------------------------------------------------------------------
67+
// E-vector -> L-vector, AtPoints
68+
//------------------------------------------------------------------------------
69+
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
70+
inline __device__ void writeDofsAtPoints1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
71+
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
72+
if (data.t_id_x < P_1d) {
73+
const CeedInt node = data.t_id_x;
74+
const CeedInt ind = indices[node + elem * P_1d];
75+
76+
if (node < points_per_elem[elem]) {
77+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
78+
}
79+
}
80+
}
81+
6682
//------------------------------------------------------------------------------
6783
// E-vector -> L-vector, strided
6884
//------------------------------------------------------------------------------
@@ -123,6 +139,22 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt
123139
}
124140
}
125141

142+
//------------------------------------------------------------------------------
143+
// E-vector -> L-vector, AtPoints
144+
//------------------------------------------------------------------------------
145+
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
146+
inline __device__ void writeDofsAtPoints2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
147+
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
148+
if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
149+
const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
150+
const CeedInt ind = indices[node + elem * P_1d * P_1d];
151+
152+
if (node < points_per_elem[elem]) {
153+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
154+
}
155+
}
156+
}
157+
126158
//------------------------------------------------------------------------------
127159
// E-vector -> L-vector, strided
128160
//------------------------------------------------------------------------------
@@ -215,6 +247,23 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
215247
}
216248
}
217249

250+
//------------------------------------------------------------------------------
251+
// E-vector -> L-vector, AtPoints
252+
//------------------------------------------------------------------------------
253+
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
254+
inline __device__ void writeDofsAtPoints3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
255+
const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
256+
if (data.t_id_x < P_1d && data.t_id_y < P_1d)
257+
for (CeedInt z = 0; z < P_1d; z++) {
258+
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
259+
const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d];
260+
261+
if (node < points_per_elem[elem]) {
262+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
263+
}
264+
}
265+
}
266+
218267
//------------------------------------------------------------------------------
219268
// E-vector -> L-vector, strided
220269
//------------------------------------------------------------------------------

0 commit comments

Comments
 (0)