#include "ffCudaNn.h"

#include <cuda_runtime.h>
#include <random>
#include <assert.h>
#include <math.h>   // sqrt, exp
#include <string.h> // memset

namespace ff
{
    ///////////////////////////////////////////////////////////////////////
    static std::default_random_engine g_generator;
    static std::normal_distribution<double> g_distribution;

    CudaTensor::CudaTensor() : _d0(0), _d1(0), _d2(0), _d3(0), _dataSize(0), _dataGpu(nullptr), _dataGpuSize(0)
    {
    }

    CudaTensor::CudaTensor(int d0, int d1, int d2, int d3) : _dataGpu(nullptr), _dataGpuSize(0)
    {
        ResetTensor(d0, d1, d2, d3);
    }

    CudaTensor::~CudaTensor()
    {
        if (nullptr != _dataGpu) cudaFree(_dataGpu);
    }

    void CudaTensor::ResetTensor(int d0, int d1, int d2, int d3)
    {
        _d0 = d0; _d1 = d1; _d2 = d2; _d3 = d3;
        _dataSize = _d0 * _d1 * _d2 * _d3;
        _data.clear();
        _data.resize(_dataSize);

        if (_dataGpuSize < _dataSize)
        {
            _dataGpuSize = _dataSize;
            if (_dataGpu) cudaFree(_dataGpu);
            cudaError_t err = cudaMalloc(&_dataGpu, _dataGpuSize * sizeof(double));
            assert(err == cudaSuccess);
        }
    }

    void CudaTensor::Random(const double multiplier)
    {
        for (int i = 0; i < _dataSize; ++i)
        {
            _data[i] = g_distribution(g_generator) * multiplier;
        }
        cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
    }

    void CudaTensor::Zero()
    {
        memset(&_data[0], 0, _data.size() * sizeof(double));
        cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
    }

    void CudaTensor::Push()
    {
        cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
    }

    void CudaTensor::Pull()
    {
        cudaMemcpy(&_data[0], _dataGpu, _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyDeviceToHost);
    }
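
    // Note: each CudaTensor keeps a host-side buffer (_data) mirrored by a grow-only
    // device buffer (_dataGpu); Push()/Pull() are blocking host<->device copies.
    // A minimal usage sketch (hypothetical sizes, using only the members seen above):
    //   CudaTensor t(8, 4, 1, 1); // 8 columns x 4 rows (x 1 x 1), row-major
    //   t.Random(1.0);            // fill on the host, then copy to the device
    //   t.Pull();                 // copy device -> host so t._data can be inspected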

    ///////////////////////////////////////////////////////////////////////
    __global__ void LinearTransform_Cuda(double* y, const double* x, const double* w, const double* b, int xw, int ww)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        double v = 0.0;
        for (int i = 0; i < xw; ++i)
        {
            v += x[i + r * xw] * w[c + i * ww];
        }
        y[c + r * ww] = v;
    }

    void LinearTransform(CudaTensor* y, const CudaTensor* x, const CudaTensor* w, const CudaTensor* b)
    {
        // y = xw + b (note: the kernel does not yet add b; both layers keep _b zeroed)
        dim3 block(x->_d1), threads(w->_d0);
        LinearTransform_Cuda<<<block, threads>>>(y->_dataGpu, x->_dataGpu, w->_dataGpu, b->_dataGpu, x->_d0, w->_d0);
        assert(cudaGetLastError() == cudaSuccess);
    }
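
    // Layout/launch convention used throughout this file: a tensor with (_d0, _d1) is a
    // row-major matrix of _d1 rows and _d0 columns, element (r, c) stored at c + r * _d0.
    // Kernels map blockIdx.x to a row and threadIdx.x to a column, so a row may not
    // exceed the per-block thread limit (typically 1024).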

    __global__ void ComputeWg_Cuda(double* wG, const double* x, const double* yG, int xTh, int xTw, int yGw)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        double v = 0.0;
        for (int i = 0; i < xTw; ++i)
        {
            v += x[r + i * xTh] * yG[c + i * yGw];
        }
        wG[c + r * yGw] = v;
    }

    void ComputeWg(CudaTensor* wG, const CudaTensor* x, const CudaTensor* yG)
    {
        // wG = x.T * yG
        dim3 block(x->_d0), threads(yG->_d0);
        ComputeWg_Cuda<<<block, threads>>>(wG->_dataGpu, x->_dataGpu, yG->_dataGpu, x->_d0, x->_d1, yG->_d0);
        assert(cudaGetLastError() == cudaSuccess);
    }

    __global__ void ComputeXg_Cuda(double* xG, const double* yG, const double* w, int yGw, int yGh, int wTh)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        double v = 0.0;
        for (int i = 0; i < yGw; ++i)
        {
            v += yG[i + r * yGw] * w[i + c * wTh];
        }
        // xG has one row per sample and one column per input dimension; the launch in
        // ComputeXg() uses one thread per input dimension, so blockDim.x is the row stride.
        xG[c + r * blockDim.x] = v;
    }

    void ComputeXg(CudaTensor* xG, const CudaTensor* yG, const CudaTensor* w)
    {
        // xG = yG * w.T
        dim3 block(yG->_d1), threads(w->_d1);
        ComputeXg_Cuda<<<block, threads>>>(xG->_dataGpu, yG->_dataGpu, w->_dataGpu, yG->_d0, yG->_d1, w->_d0);
        assert(cudaGetLastError() == cudaSuccess);
    }
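
    // For y = x w (x: batch x inDim, w: inDim x outDim, y/yG: batch x outDim), the two
    // gradients computed above are wG = x^T yG (inDim x outDim) and xG = yG w^T
    // (batch x inDim), i.e. the standard backprop rules for a linear layer.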

    __global__ void ComputeSumOfSquresGradient(double* yG, const double* y, const double* yLabel, int nCol)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        int index = c + r * nCol;
        double diff = y[index] - yLabel[index];
        yG[index] = 2.0 * diff; // d/dy (y - label)^2 = 2 * (y - label)
    }

    __global__ void UpdateWs_Cuda(int nCol, double learningRate, double* w, const double* wG)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        int index = c + r * nCol;
        w[index] -= wG[index] * learningRate;
    }

    __global__ void Relu_Cuda(double* relu_x, const double* x, int nCol)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        int index = c + r * nCol;
        relu_x[index] = fmax(x[index], 0.0);
    }

    __global__ void ReluG_Cuda(double* xG, const double* x, int nCol)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        int index = c + r * nCol;
        // Scale the incoming gradient by ~1 where x > 0 and by ~0 where x < 0.
        // Note this divides by zero when x == 0; the commented-out branch below is the
        // straightforward equivalent mask.
        xG[index] = xG[index] * (fmax(x[index], 1e-32) / x[index]);
        // if (x[index] < 0.0) xG[index] = 0.0;
    }

    __global__ void SoftmaxBackward_Cuda(double* lossG, const double* softmax, const double* yLabel, int nCol)
    {
        int r = blockIdx.x;
        int c = threadIdx.x;
        int index = c + r * nCol;
        // Cross-entropy-on-softmax gradient: softmax - onehot(label)
        lossG[index] = softmax[index];
        if ((int)yLabel[r] == c) lossG[index] -= 1.0;
    }
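
    // The assert(cudaGetLastError() == cudaSuccess) checks after each <<<...>>> launch only
    // catch launch-configuration errors (e.g. too many threads per block); errors raised
    // while a kernel executes surface on the next synchronizing call such as cudaMemcpy
    // or an explicit cudaDeviceSynchronize().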

    ///////////////////////////////////////////////////////////////////////
    FcLayer::FcLayer(int inDim, int outDim) : _pX(nullptr)
    {
        _w.ResetTensor(outDim, inDim);
        _w.Random(1.0 / sqrt(inDim)); // N(0,1) samples scaled by 1/sqrt(inDim)
        _wG.ResetTensor(outDim, inDim);
        _b.ResetTensor(outDim);
        _b.Zero();
        _bG.ResetTensor(outDim); // note: bias gradients are allocated but never computed in this file
    }

    const CudaTensor* FcLayer::Forward(const CudaTensor* x)
    {
        if (x->_d0 != _w._d1)
            return nullptr;

        _pX = x;
        _y.ResetTensor(_w._d0, _pX->_d1);
        LinearTransform(&_y, _pX, &_w, &_b);
        return &_y;
    }

    const CudaTensor* FcLayer::Backward(const CudaTensor* yG, const int layerIndex)
    {
        ComputeWg(&_wG, _pX, yG);
        if (layerIndex > 0)
        {
            _xG.ResetTensor(_pX->_d0, _pX->_d1);
            ComputeXg(&_xG, yG, &_w);
        }
        return &_xG;
    }

    void FcLayer::UpdateWs(double learningRate)
    {
        dim3 block(_w._d1), threads(_w._d0);
        UpdateWs_Cuda<<<block, threads>>>(_w._d0, learningRate, _w._dataGpu, _wG._dataGpu);
        assert(cudaGetLastError() == cudaSuccess);
    }

    ReluFcLayer::ReluFcLayer(int inDim, int outDim) : _pX(nullptr)
    {
        _w.ResetTensor(outDim, inDim);
        _w.Random(1.0 / sqrt(inDim));
        _wG.ResetTensor(outDim, inDim);
        _b.ResetTensor(outDim);
        _b.Zero();
        _bG.ResetTensor(outDim);
    }

    const CudaTensor* ReluFcLayer::Forward(const CudaTensor* x)
    {
        if (x->_d0 != _w._d1)
            return nullptr;

        _pX = x;
        _xRelu.ResetTensor(_pX->_d0, _pX->_d1);
        dim3 block(_xRelu._d1), threads(_xRelu._d0);
        Relu_Cuda<<<block, threads>>>(_xRelu._dataGpu, _pX->_dataGpu, _xRelu._d0);
        assert(cudaGetLastError() == cudaSuccess);

        _y.ResetTensor(_w._d0, _pX->_d1);
        LinearTransform(&_y, &_xRelu, &_w, &_b);
        return &_y;
    }

    const CudaTensor* ReluFcLayer::Backward(const CudaTensor* yG, const int layerIndex)
    {
        ComputeWg(&_wG, &_xRelu, yG);
        if (layerIndex > 0)
        {
            _xG.ResetTensor(_pX->_d0, _pX->_d1);
            ComputeXg(&_xG, yG, &_w);
            dim3 block(_xG._d1), threads(_xG._d0);
            ReluG_Cuda<<<block, threads>>>(_xG._dataGpu, _pX->_dataGpu, _xG._d0);
            assert(cudaGetLastError() == cudaSuccess);
        }
        return &_xG;
    }

    void ReluFcLayer::UpdateWs(double learningRate)
    {
        dim3 block(_w._d1), threads(_w._d0);
        UpdateWs_Cuda<<<block, threads>>>(_w._d0, learningRate, _w._dataGpu, _wG._dataGpu);
        assert(cudaGetLastError() == cudaSuccess);
    }
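
    // ReluFcLayer applies ReLU to its own input (_xRelu = max(x, 0)) before the linear
    // transform, so Backward masks the propagated gradient with the same ReLU pattern
    // (ReluG_Cuda over _pX) after computing xG.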

    const CudaTensor* SoftmaxLayer::Forward(const CudaTensor* x)
    {
        _softmax.ResetTensor(x->_d0, x->_d1);
        _lossG.ResetTensor(x->_d0, x->_d1);

        const_cast<CudaTensor*>(x)->Pull();
        for (int r = 0; r < x->_d1; ++r)
        {
            double maxValue = x->_data[0 + x->_d0 * r];
            for (int i = 1; i < x->_d0; ++i)
            {
                double currValue = x->_data[i + x->_d0 * r];
                if (maxValue < currValue)
                {
                    maxValue = currValue;
                }
            }

            double sum = 0.0;
            for (int i = 0; i < x->_d0; ++i)
            {
                sum += exp(x->_data[i + x->_d0 * r] - maxValue); // stable softmax
            }
            for (int i = 0; i < x->_d0; ++i)
            {
                _softmax._data[i + _softmax._d0 * r] = exp(x->_data[i + x->_d0 * r] - maxValue) / sum;
            }
        }
        _softmax.Push();
        return &_softmax;
    }
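
    // The softmax is evaluated on the host after Pull(): for each row,
    // softmax_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j).
    // Subtracting the row maximum does not change the result but prevents exp() overflow.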

    const CudaTensor* SoftmaxLayer::Backward(const CudaTensor* yG, const int layerIndex)
    {
        dim3 block(yG->_d1), threads(_softmax._d0);
        SoftmaxBackward_Cuda<<<block, threads>>>(_lossG._dataGpu, _softmax._dataGpu, yG->_dataGpu, _lossG._d0);
        assert(cudaGetLastError() == cudaSuccess);
        return &_lossG;
    }

    SumOfSquaresLayer::SumOfSquaresLayer() : _pY(nullptr)
    {
    }

    const CudaTensor* SumOfSquaresLayer::Forward(const CudaTensor* x)
    {
        _pY = x;
        return _pY;
    }

    const CudaTensor* SumOfSquaresLayer::Backward(const CudaTensor* yLabel, const int layerIndex)
    {
        _yG.ResetTensor(yLabel->_d0, yLabel->_d1);

        dim3 block(_yG._d1), threads(_yG._d0);
        ComputeSumOfSquresGradient<<<block, threads>>>(_yG._dataGpu, _pY->_dataGpu, yLabel->_dataGpu, _yG._d0);
        assert(cudaGetLastError() == cudaSuccess);
        return &_yG;
    }
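
    // For both loss layers, Forward only produces (or passes through) the prediction; the
    // scalar loss value is never materialized. Backward receives the label tensor directly:
    // 2 * (y - label) per element here, and softmax - onehot(label) in SoftmaxLayer.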

    ///////////////////////////////////////////////////////////////////////
    CudaNn::~CudaNn()
    {
        InitializeCudaNn("");
    }

    bool CudaNn::InitializeCudaNn(const char* desc)
    {
        size_t numLayers = _layers.size();
        for (size_t i = 0; i < numLayers; ++i)
        {
            delete _layers[i];
        }
        _layers.clear();

        return true;
    }

    bool CudaNn::AddFc(int inDim, int outDim)
    {
        _layers.push_back(new FcLayer(inDim, outDim));
        return true;
    }

    bool CudaNn::AddReluFc(int inDim, int outDim)
    {
        _layers.push_back(new ReluFcLayer(inDim, outDim));
        return true;
    }

    bool CudaNn::AddSoftmax()
    {
        _layers.push_back(new SoftmaxLayer);
        return true;
    }

    bool CudaNn::AddSumOfSquares()
    {
        _layers.push_back(new SumOfSquaresLayer);
        return true;
    }

    const CudaTensor* CudaNn::Forward(const CudaTensor* x)
    {
        const CudaTensor* y = nullptr;
        size_t numLayer = _layers.size();
        for (size_t i = 0; i < numLayer; ++i)
        {
            if (nullptr == x)
                return nullptr;

            y = _layers[i]->Forward(x);
            x = y;
        }

        return y;
    }

    void CudaNn::Backward(const CudaTensor* yLabel)
    {
        const CudaTensor* y = yLabel;
        const CudaTensor* yGradient = nullptr;
        int numLayer = (int)_layers.size();
        for (int i = 0; i < numLayer; ++i)
        {
            int layerIndex = numLayer - i - 1;
            yGradient = _layers[layerIndex]->Backward(y, layerIndex);
            y = yGradient;
        }
    }

    void CudaNn::UpdateWs(double learningRate)
    {
        int numLayer = (int)_layers.size();
        for (int i = 0; i < numLayer; ++i)
        {
            _layers[i]->UpdateWs(learningRate);
        }
    }
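
    // A minimal training-loop sketch (hypothetical dimensions; assumes CudaNn and CudaTensor
    // are declared in ffCudaNn.h as used above and that CudaNn is default-constructible):
    //   ff::CudaNn nn;
    //   nn.AddFc(784, 128);                 // linear layer: 784 -> 128
    //   nn.AddReluFc(128, 10);              // ReLU on the hidden activations, then 128 -> 10
    //   nn.AddSoftmax();
    //   const ff::CudaTensor* prob = nn.Forward(&x); // x: _d0 = 784 features, _d1 = batch rows
    //   nn.Backward(&label);                          // label: _d0 = 1, _d1 = batch (class index per row)
    //   nn.UpdateWs(0.001);                           // SGD step on every layer's weights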
} // namespace ff