
Commit 094b5ca

ffCudaNn
1 parent 863f6d9 commit 094b5ca

File tree

2 files changed: +538 -0 lines changed


ffCudaNn.cpp

Lines changed: 398 additions & 0 deletions
@@ -0,0 +1,398 @@
#include "ffCudaNn.h"

#include <cuda_runtime.h>
#include <random>
#include <assert.h>
#include <math.h>
#include <string.h>

namespace ff
{
	///////////////////////////////////////////////////////////////////////
	static std::default_random_engine g_generator;
	static std::normal_distribution<double> g_distribution;

	CudaTensor::CudaTensor() : _d0(0), _d1(0), _d2(0), _d3(0), _dataSize(0), _dataGpu(nullptr), _dataGpuSize(0)
	{
	}

	CudaTensor::CudaTensor(int d0, int d1, int d2, int d3) : _dataGpu(nullptr), _dataGpuSize(0)
	{
		ResetTensor(d0, d1, d2, d3);
	}

	CudaTensor::~CudaTensor()
	{
		if (nullptr != _dataGpu) cudaFree(_dataGpu);
	}

	void CudaTensor::ResetTensor(int d0, int d1, int d2, int d3)
	{
		_d0 = d0; _d1 = d1; _d2 = d2; _d3 = d3;
		_dataSize = _d0 * _d1 * _d2 * _d3;
		_data.clear();
		_data.resize(_dataSize);

		// Grow the GPU buffer only when the requested size exceeds the current allocation.
		if (_dataGpuSize < _dataSize)
		{
			_dataGpuSize = _dataSize;
			if (_dataGpu) cudaFree(_dataGpu);
			cudaError_t err = cudaMalloc(&_dataGpu, _dataGpuSize * sizeof(double));
			assert(err == cudaSuccess);
		}
	}

	void CudaTensor::Random(const double multiplier)
	{
		// Fill the host copy with scaled samples from N(0, 1), then copy to the device.
		for (int i = 0; i < _dataSize; ++i)
		{
			_data[i] = g_distribution(g_generator) * multiplier;
		}
		cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
	}

	void CudaTensor::Zero()
	{
		memset(&_data[0], 0, _data.size() * sizeof(double));
		cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
	}

	void CudaTensor::Push()
	{
		// Host -> device
		cudaMemcpy(_dataGpu, &_data[0], _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyHostToDevice);
	}

	void CudaTensor::Pull()
	{
		// Device -> host
		cudaMemcpy(&_data[0], _dataGpu, _dataSize * sizeof(double), cudaMemcpyKind::cudaMemcpyDeviceToHost);
	}
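
	// Usage sketch (illustrative, not part of this commit): a CudaTensor keeps a host
	// copy in _data and a device copy in _dataGpu, and the caller moves data explicitly.
	//
	//   ff::CudaTensor t(8, 4);   // width (_d0) = 8, height (_d1) = 4, stored row-major
	//   t.Random(0.1);            // host fill from N(0, 1) scaled by 0.1, then copied to the GPU
	//   t.Pull();                 // copy GPU -> host before reading t._data on the CPU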

	///////////////////////////////////////////////////////////////////////
	__global__ void LinearTransform_Cuda(double* y, const double* x, const double* w, const double* b, int xw, int ww)
	{
		// One block per batch row, one thread per output column.
		int r = blockIdx.x;
		int c = threadIdx.x;
		double v = 0.0;
		for (int i = 0; i < xw; ++i)
		{
			v += x[i + r * xw] * w[c + i * ww];
		}
		y[c + r * ww] = v + b[c]; // add the bias so that y = xw + b, as the wrapper documents
	}

	void LinearTransform(CudaTensor* y, const CudaTensor* x, const CudaTensor* w, const CudaTensor* b)
	{
		// y = xw + b
		dim3 block(x->_d1), threads(w->_d0);
		LinearTransform_Cuda <<< block, threads >>> (y->_dataGpu, x->_dataGpu, w->_dataGpu, b->_dataGpu, x->_d0, w->_d0);
		assert(cudaGetLastError() == cudaSuccess);
	}
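
	// Shape convention assumed by these kernels (row-major, width = _d0, height = _d1):
	//   x : [batch x inDim], w : [inDim x outDim], b : [outDim], y : [batch x outDim]
	// so y[r][c] = sum_i x[r][i] * w[i][c] + b[c], with one thread block per row r
	// and one thread per output column c.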

	__global__ void ComputeWg_Cuda(double* wG, const double* x, const double* yG, int xTh, int xTw, int yGw)
	{
		// wG[r][c] = sum_i x.T[r][i] * yG[i][c] = sum_i x[i][r] * yG[i][c]
		int r = blockIdx.x;
		int c = threadIdx.x;
		double v = 0.0;
		for (int i = 0; i < xTw; ++i)
		{
			v += x[r + i * xTh] * yG[c + i * yGw];
		}
		wG[c + r * yGw] = v;
	}

	void ComputeWg(CudaTensor* wG, const CudaTensor* x, const CudaTensor* yG)
	{
		// wG = x.T * yG
		dim3 block(x->_d0), threads(yG->_d0);
		ComputeWg_Cuda <<< block, threads >>> (wG->_dataGpu, x->_dataGpu, yG->_dataGpu, x->_d0, x->_d1, yG->_d0);
		assert(cudaGetLastError() == cudaSuccess);
	}

	__global__ void ComputeXg_Cuda(double* xG, const double* yG, const double* w, int yGw, int yGh, int wTh)
	{
		// xG[r][c] = sum_i yG[r][i] * w[c][i], i.e. one element of yG * w.T
		int r = blockIdx.x;
		int c = threadIdx.x;
		double v = 0.0;
		for (int i = 0; i < yGw; ++i)
		{
			v += yG[i + r * yGw] * w[i + c * wTh];
		}
		xG[c + r * blockDim.x] = v; // xG row stride is its own width, which equals blockDim.x (one thread per column)
	}

	void ComputeXg(CudaTensor* xG, const CudaTensor* yG, const CudaTensor* w)
	{
		// xG = yG * w.T
		dim3 block(yG->_d1), threads(w->_d1);
		ComputeXg_Cuda <<< block, threads >>> (xG->_dataGpu, yG->_dataGpu, w->_dataGpu, yG->_d0, yG->_d1, w->_d0);
		assert(cudaGetLastError() == cudaSuccess);
	}

	__global__ void ComputeSumOfSquresGradient(double* yG, const double* y, const double* yLabel, int nCol)
	{
		int r = blockIdx.x;
		int c = threadIdx.x;
		int index = c + r * nCol;
		double diff = y[index] - yLabel[index];
		yG[index] = 2.0 * diff; // d/dy (y - label)^2 = 2 * (y - label)
	}

	__global__ void UpdateWs_Cuda(int nCol, double learningRate, double* w, const double* wG)
	{
		// Plain SGD step: w <- w - learningRate * dL/dw
		int r = blockIdx.x;
		int c = threadIdx.x;
		int index = c + r * nCol;
		w[index] -= wG[index] * learningRate;
	}

	__global__ void Relu_Cuda(double* relu_x, const double* x, int nCol)
	{
		int r = blockIdx.x;
		int c = threadIdx.x;
		int index = c + r * nCol;
		relu_x[index] = fmax(x[index], 0.0);
	}

	__global__ void ReluG_Cuda(double* xG, const double* x, int nCol)
	{
		int r = blockIdx.x;
		int c = threadIdx.x;
		int index = c + r * nCol;
		// Pass the gradient through only where the forward input was positive (ReLU mask).
		if (x[index] < 0.0) xG[index] = 0.0;
	}

	__global__ void SoftmaxBackward_Cuda(double* lossG, const double* softmax, const double* yLabel, int nCol)
	{
		int r = blockIdx.x;
		int c = threadIdx.x;
		int index = c + r * nCol;
		lossG[index] = softmax[index];
		if ((int)yLabel[r] == c) lossG[index] -= 1.0;
	}
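
	// For a cross-entropy loss over softmax outputs, the gradient with respect to the
	// pre-softmax scores is softmax(x) - onehot(label), which is what the kernel above
	// computes: yLabel holds the target class index for each row.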

	///////////////////////////////////////////////////////////////////////
	FcLayer::FcLayer(int inDim, int outDim) : _pX(nullptr)
	{
		_w.ResetTensor(outDim, inDim);
		_w.Random(1.0 / sqrt(inDim)); // scale the initial weights by 1 / sqrt(fan-in)
		_wG.ResetTensor(outDim, inDim);
		_b.ResetTensor(outDim);
		_b.Zero();
		_bG.ResetTensor(outDim);
	}

	const CudaTensor* FcLayer::Forward(const CudaTensor* x)
	{
		if (x->_d0 != _w._d1)
			return nullptr;

		_pX = x;
		_y.ResetTensor(_w._d0, _pX->_d1);
		LinearTransform(&_y, _pX, &_w, &_b);
		return &_y;
	}

	const CudaTensor* FcLayer::Backward(const CudaTensor* yG, const int layerIndex)
	{
		ComputeWg(&_wG, _pX, yG);
		if (layerIndex > 0)
		{
			// The first layer does not need a gradient with respect to its input.
			_xG.ResetTensor(_pX->_d0, _pX->_d1);
			ComputeXg(&_xG, yG, &_w);
		}
		return &_xG;
	}

	void FcLayer::UpdateWs(double learningRate)
	{
		dim3 block(_w._d1), threads(_w._d0);
		UpdateWs_Cuda <<< block, threads >>> (_w._d0, learningRate, _w._dataGpu, _wG._dataGpu);
		assert(cudaGetLastError() == cudaSuccess);
	}

	ReluFcLayer::ReluFcLayer(int inDim, int outDim) : _pX(nullptr)
	{
		_w.ResetTensor(outDim, inDim);
		_w.Random(1.0 / sqrt(inDim));
		_wG.ResetTensor(outDim, inDim);
		_b.ResetTensor(outDim);
		_b.Zero();
		_bG.ResetTensor(outDim);
	}

	const CudaTensor* ReluFcLayer::Forward(const CudaTensor* x)
	{
		if (x->_d0 != _w._d1)
			return nullptr;

		// Apply ReLU to the input, then the linear transform: y = relu(x) * w + b
		_pX = x;
		_xRelu.ResetTensor(_pX->_d0, _pX->_d1);
		dim3 block(_xRelu._d1), threads(_xRelu._d0);
		Relu_Cuda <<< block, threads >>> (_xRelu._dataGpu, _pX->_dataGpu, _xRelu._d0);
		assert(cudaGetLastError() == cudaSuccess);

		_y.ResetTensor(_w._d0, _pX->_d1);
		LinearTransform(&_y, &_xRelu, &_w, &_b);
		return &_y;
	}

	const CudaTensor* ReluFcLayer::Backward(const CudaTensor* yG, const int layerIndex)
	{
		ComputeWg(&_wG, &_xRelu, yG);
		if (layerIndex > 0)
		{
			_xG.ResetTensor(_pX->_d0, _pX->_d1);
			ComputeXg(&_xG, yG, &_w);
			// Mask the input gradient by the sign of the original input.
			dim3 block(_xG._d1), threads(_xG._d0);
			ReluG_Cuda <<< block, threads >>> (_xG._dataGpu, _pX->_dataGpu, _xG._d0);
			assert(cudaGetLastError() == cudaSuccess);
		}
		return &_xG;
	}

	void ReluFcLayer::UpdateWs(double learningRate)
	{
		dim3 block(_w._d1), threads(_w._d0);
		UpdateWs_Cuda <<< block, threads >>> (_w._d0, learningRate, _w._dataGpu, _wG._dataGpu);
		assert(cudaGetLastError() == cudaSuccess);
	}

	const CudaTensor* SoftmaxLayer::Forward(const CudaTensor* x)
	{
		_softmax.ResetTensor(x->_d0, x->_d1);
		_lossG.ResetTensor(x->_d0, x->_d1);

		// The softmax is computed on the host; copy the scores back from the device first.
		const_cast<CudaTensor*>(x)->Pull();
		for (int r = 0; r < x->_d1; ++r)
		{
			double maxValue = x->_data[0 + x->_d0 * r];
			for (int i = 1; i < x->_d0; ++i)
			{
				double currValue = x->_data[i + x->_d0 * r];
				if (maxValue < currValue)
				{
					maxValue = currValue;
				}
			}

			double sum = 0.0;
			for (int i = 0; i < x->_d0; ++i)
			{
				sum += exp(x->_data[i + x->_d0 * r] - maxValue); // stable softmax
			}
			for (int i = 0; i < x->_d0; ++i)
			{
				_softmax._data[i + _softmax._d0 * r] = exp(x->_data[i + x->_d0 * r] - maxValue) / sum;
			}
		}
		_softmax.Push();
		return &_softmax;
	}

	const CudaTensor* SoftmaxLayer::Backward(const CudaTensor* yG, const int layerIndex)
	{
		dim3 block(yG->_d1), threads(_softmax._d0);
		SoftmaxBackward_Cuda <<< block, threads >>> (_lossG._dataGpu, _softmax._dataGpu, yG->_dataGpu, _lossG._d0);
		assert(cudaGetLastError() == cudaSuccess);
		return &_lossG;
	}

	SumOfSquaresLayer::SumOfSquaresLayer() : _pY(nullptr)
	{
	}

	const CudaTensor* SumOfSquaresLayer::Forward(const CudaTensor* x)
	{
		_pY = x;
		return _pY;
	}

	const CudaTensor* SumOfSquaresLayer::Backward(const CudaTensor* yLabel, const int layerIndex)
	{
		_yG.ResetTensor(yLabel->_d0, yLabel->_d1);

		dim3 block(_yG._d1), threads(_yG._d0);
		ComputeSumOfSquresGradient <<< block, threads >>> (_yG._dataGpu, _pY->_dataGpu, yLabel->_dataGpu, _yG._d0);
		assert(cudaGetLastError() == cudaSuccess);
		return &_yG;
	}
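
	// The sum-of-squares loss used here is L = sum_i (y_i - label_i)^2, so dL/dy_i = 2 * (y_i - label_i),
	// which is exactly what ComputeSumOfSquresGradient evaluates per element.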

	///////////////////////////////////////////////////////////////////////
	CudaNn::~CudaNn()
	{
		// Re-initializing with an empty description simply deletes all layers.
		InitializeCudaNn("");
	}

	bool CudaNn::InitializeCudaNn(const char* desc)
	{
		size_t numLayers = _layers.size();
		for (size_t i = 0; i < numLayers; ++i)
		{
			delete _layers[i];
		}
		_layers.clear();

		return true;
	}

	bool CudaNn::AddFc(int inDim, int outDim)
	{
		_layers.push_back(new FcLayer(inDim, outDim));
		return true;
	}

	bool CudaNn::AddReluFc(int inDim, int outDim)
	{
		_layers.push_back(new ReluFcLayer(inDim, outDim));
		return true;
	}

	bool CudaNn::AddSoftmax()
	{
		_layers.push_back(new SoftmaxLayer);
		return true;
	}

	bool CudaNn::AddSumOfSquares()
	{
		_layers.push_back(new SumOfSquaresLayer);
		return true;
	}

	const CudaTensor* CudaNn::Forward(const CudaTensor* x)
	{
		const CudaTensor* y = nullptr;
		size_t numLayer = _layers.size();
		for (size_t i = 0; i < numLayer; ++i)
		{
			if (nullptr == x)
				return nullptr;

			y = _layers[i]->Forward(x);
			x = y;
		}

		return y;
	}

	void CudaNn::Backward(const CudaTensor* yLabel)
	{
		// Walk the layers in reverse, feeding each one the gradient produced by the layer after it.
		const CudaTensor* y = yLabel;
		const CudaTensor* yGradient = nullptr;
		int numLayer = (int)_layers.size();
		for (int i = 0; i < numLayer; ++i)
		{
			int layerIndex = numLayer - i - 1;
			yGradient = _layers[layerIndex]->Backward(y, layerIndex);
			y = yGradient;
		}
	}

	void CudaNn::UpdateWs(double learningRate)
	{
		int numLayer = (int)_layers.size();
		for (int i = 0; i < numLayer; ++i)
		{
			_layers[i]->UpdateWs(learningRate);
		}
	}
} // namespace ff
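
A minimal usage sketch of the classes added in this commit, assuming ffCudaNn.h declares CudaNn, CudaTensor, and the layer classes exactly as they are used above (including a default no-op UpdateWs for the loss layers); the main function, network shape, batch size, step count, and learning rate below are illustrative only, not part of the commit:

#include "ffCudaNn.h"

int main()
{
	ff::CudaNn nn;
	nn.AddFc(16, 32);            // input width 16 -> hidden width 32
	nn.AddReluFc(32, 4);         // ReLU on the hidden activations, then a 4-wide output
	nn.AddSumOfSquares();        // regression-style loss at the end

	ff::CudaTensor x(16, 8);     // one mini-batch: 8 rows of 16 features
	ff::CudaTensor label(4, 8);  // matching targets: 8 rows of 4 values
	x.Random(1.0);
	label.Zero();

	for (int step = 0; step < 1000; ++step)
	{
		nn.Forward(&x);          // run every layer in order
		nn.Backward(&label);     // propagate gradients from the loss layer back
		nn.UpdateWs(0.001);      // SGD step on each fully connected layer
	}
	return 0;
}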
