Skip to content

Commit 8901e83

Browse files
committed
Added QuatNormLayer
1 parent 7186416 commit 8901e83

File tree

3 files changed

+244
-6
lines changed

3 files changed

+244
-6
lines changed

ffCudaNn.cpp

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ namespace ff
1313
///////////////////////////////////////////////////////////////////////
1414
//std::default_random_engine g_generator;
1515
std::default_random_engine g_generator(static_cast<int>(std::chrono::steady_clock::now().time_since_epoch().count()));
16-
static std::uniform_real_distribution<float> g_uniformDistribution;
16+
std::uniform_real_distribution<float> g_uniformDistribution;
1717
static std::normal_distribution<float> g_normalDistribution(0.0f, 1.0f);
1818

1919
CudaTensor::CudaTensor() : _d0(0), _d1(0), _d2(0), _d3(0), _dataSize(0), _dataGpu(nullptr), _dataGpuSize(0)
@@ -46,7 +46,6 @@ namespace ff
4646
_d0 = d0; _d1 = d1; _d2 = d2; _d3 = d3;
4747
_dataSize = _d0 * _d1 * _d2 * _d3;
4848
_data.resize(_dataSize);
49-
5049
if (_dataGpuSize < _dataSize)
5150
{
5251
_dataGpuSize = _dataSize;
@@ -1058,6 +1057,84 @@ namespace ff
10581057
_yGdropped.PullFromGpu();
10591058
}
10601059

1060+
// Normalizes each group of 4 consecutive floats in x to a unit quaternion and
// writes the result to y. One thread handles one quaternion; nJobs is the
// total quaternion count to process. nQuats is the per-row quaternion count,
// though the batch/elem split below algebraically reduces to base = 4 * job,
// so any positive nQuats yields the same addressing.
__global__ void ForwardQuatNorm_Cuda(float* y, const float* x, int nQuats, int nJobs)
{
	int job = blockIdx.x * blockDim.x + threadIdx.x;
	if (job >= nJobs) return;

	// First component of this thread's quaternion.
	int base = (job / nQuats) * nQuats * 4 + (job % nQuats) * 4;

	// Squared magnitude, accumulated in the same left-to-right order as before.
	float sumSq = 0.0f;
	for (int i = 0; i < 4; ++i)
	{
		sumSq += x[base + i] * x[base + i];
	}
	// Small epsilon keeps the divisions finite for an all-zero quaternion.
	float len = sqrtf(sumSq) + 1e-8f;
	for (int i = 0; i < 4; ++i)
	{
		y[base + i] = x[base + i] / len;
	}
}
1075+
1076+
// Forward pass: renormalizes every consecutive group of 4 channels of x into
// a unit quaternion and writes the result to _y (same shape as x).
// Quaternions lie along d0, as the assert below requires.
const CudaTensor* QuatNormLayer::Forward(const CudaTensor* x)
{
	assert(x->_d0 % 4 == 0);
	_pX = x;
	_y.ResetTensor(x->_d0, x->_d1, x->_d2, x->_d3);

	// One thread per quaternion across the whole tensor. Using _dataSize
	// (= d0*d1*d2*d3, a multiple of 4 by the assert above) instead of
	// d0*d1 keeps this correct when d2*d3 > 1; it is identical to the old
	// value when d2 == d3 == 1.
	int nJobs = _pX->_dataSize / 4;
	int nBlocks = (nJobs + K_SMALL_THREAD_PER_BLOCK - 1) / K_SMALL_THREAD_PER_BLOCK;
	dim3 block(nBlocks), threads(K_SMALL_THREAD_PER_BLOCK);
	// Pass _d0 / 4 as the per-row quaternion count, matching the data layout
	// and Backward(); the previous _d1 / 4 only produced correct results
	// because the kernel's index arithmetic collapses to baseIndex = 4*index
	// for any positive nQuats.
	ForwardQuatNorm_Cuda <<< block, threads >>> (_y._dataGpu, _pX->_dataGpu, _pX->_d0 / 4, nJobs);
	assert(cudaGetLastError() == cudaSuccess);

	return &_y;
}
1090+
1091+
// Backward pass of quaternion normalization.
// For y = q / |q| the Jacobian is dy_i/dq_j = (|q|^2 * delta_ij - q_i * q_j) / |q|^3,
// so the input gradient is xG_i = (|q|^2 * yG_i - q_i * (q . yG)) / |q|^3.
// One thread handles one quaternion; nJobs is the total quaternion count and
// nQuats the per-row count (the batch/elem split reduces to baseIndex = 4*index).
__global__ void BackwardQuatNorm_Cuda(float* xG, const float* x, const float* yG, int nQuats, int nJobs)
{
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	if (index >= nJobs) return;

	int batch = index / nQuats;
	int elem = index % nQuats;
	// First component of this thread's quaternion.
	int baseIndex = batch * nQuats * 4 + elem * 4;
	float q[4] = { x[baseIndex], x[baseIndex + 1], x[baseIndex + 2], x[baseIndex + 3] };
	float tYg[4] = { yG[baseIndex], yG[baseIndex + 1], yG[baseIndex + 2], yG[baseIndex + 3] };
	float squaredSum = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
	// a = 1 / |q|^3; epsilon avoids division by zero. NOTE(review): the
	// forward kernel adds its epsilon after the sqrt instead, so the two
	// regularizations differ slightly near |q| = 0 — confirm this is intended.
	float a = powf(squaredSum + 1e-8f, -1.5f);
	for (int i = 0; i < 4; ++i)
	{
		float b = squaredSum * tYg[i]; // |q|^2 * yG_i
		float c = 0.0f;                // accumulates q_i * (q . yG)
		for (int j = 0; j < 4; ++j)
		{
			c += (q[i] * q[j] * tYg[j]);
		}
		xG[baseIndex + i] = a * (b - c);
	}
}
1114+
1115+
// Backward pass: applies the Jacobian of quaternion normalization to the
// incoming gradient yG and stores the result in _xG (same shape as the input).
// When layerIndex == 0 the kernel is skipped and _xG is only resized —
// presumably because the first layer's input gradient has no consumer
// (consistent with the guard's use here); verify against the other layers.
const CudaTensor* QuatNormLayer::Backward(const CudaTensor* yG, const int layerIndex)
{
	assert(yG->_dataSize == _pX->_dataSize);
	_xG.ResetTensor(_pX->_d0, _pX->_d1, _pX->_d2, _pX->_d3);

	if (layerIndex > 0)
	{
		// One thread per quaternion across the whole tensor. Using
		// _dataSize / 4 (rather than d0*d1/4) also covers tensors with
		// d2*d3 > 1 and is identical when d2 == d3 == 1; _d0 % 4 == 0 is
		// asserted in Forward().
		int nJobs = _xG._dataSize / 4;
		int nBlocks = (nJobs + K_SMALL_THREAD_PER_BLOCK - 1) / K_SMALL_THREAD_PER_BLOCK;
		dim3 block(nBlocks), threads(K_SMALL_THREAD_PER_BLOCK);
		BackwardQuatNorm_Cuda <<< block, threads >>> (_xG._dataGpu, _pX->_dataGpu, yG->_dataGpu, _xG._d0 / 4, nJobs);
		assert(cudaGetLastError() == cudaSuccess);
	}

	return &_xG;
}
1131+
1132+
// Copies this layer's GPU-resident buffers — the forward output _y and the
// input gradient _xG — back to host memory for CPU-side inspection.
void QuatNormLayer::Pull()
{
	_y.PullFromGpu();
	_xG.PullFromGpu();
}
1137+
10611138
__global__ void ForwardSoftmax_Cuda(float* softmax , const float* x, int nRow, int nCol)
10621139
{
10631140
int r = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1230,6 +1307,12 @@ namespace ff
12301307
return true;
12311308
}
12321309

1310+
// Appends a quaternion-normalization layer to the network.
// Always succeeds and returns true, like the other Add* helpers.
bool CudaNn::AddQuatNorm()
{
	QuatNormLayer* layer = new QuatNormLayer(this);
	_layers.push_back(layer);
	return true;
}
1315+
12331316
const CudaTensor* CudaNn::Forward(const CudaTensor* x, bool train)
12341317
{
12351318
_train = train;

ffCudaNn.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,23 @@ namespace ff
229229
CudaTensor _yG;
230230
};
231231

232+
// Layer that renormalizes each consecutive group of 4 input channels (along
// d0) to a unit quaternion. Has no trainable parameters.
class QuatNormLayer : public CudaLayer
{
public:
	QuatNormLayer(CudaNn* nn) : CudaLayer(nn) {}

	// Normalizes every quaternion in the input; returns &_y.
	const CudaTensor* Forward(const CudaTensor*) override;

	// Applies the normalization Jacobian to the output gradient; returns &_xG.
	// Skips the kernel when layerIndex == 0.
	const CudaTensor* Backward(const CudaTensor*, const int layerIndex) override;

	// Copies _y and _xG from GPU to host memory.
	void Pull() override;

public:
	const CudaTensor* _pX; // last forward input (not owned)
	CudaTensor _y;         // forward output
	CudaTensor _xG;        // gradient w.r.t. the input
};
248+
232249
class CudaNn
233250
{
234251
public:
@@ -248,6 +265,8 @@ namespace ff
248265

249266
bool AddBatchNorm2d(int inDim);
250267

268+
bool AddQuatNorm();
269+
251270
bool AddDropout(float dropoutRatio);
252271

253272
bool AddSoftmax();

main.cpp

Lines changed: 140 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,153 @@
11
#include <stdio.h>
2+
#include <math.h>
3+
#include <random>
24
#include "ffCudaNn.h"
35

46
int mnist();
57
int cifar10();
68

9+
namespace ff
10+
{
11+
extern std::default_random_engine g_generator;
12+
extern std::uniform_real_distribution<float> g_uniformDistribution;
13+
}
14+
15+
// Converts Euler angles (radians) to a quaternion stored as q = {w, x, y, z},
// using the standard yaw-pitch-roll (ZYX) composition.
void EulerToQuat(float* q, float yaw, float pitch, float roll)
{
	const float halfYaw = yaw * 0.5f;
	const float halfPitch = pitch * 0.5f;
	const float halfRoll = roll * 0.5f;
	const float cosYaw = cosf(halfYaw), sinYaw = sinf(halfYaw);
	const float cosPitch = cosf(halfPitch), sinPitch = sinf(halfPitch);
	const float cosRoll = cosf(halfRoll), sinRoll = sinf(halfRoll);

	q[0] = cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw; // w
	q[1] = sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw; // x
	q[2] = cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw; // y
	q[3] = cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw; // z
}
29+
30+
// Scales the 4-float quaternion q to unit length in place.
// A zero-length quaternion is left untouched to avoid dividing by zero.
void NormalizeQuat(float* q)
{
	float lenSq = 0.0f;
	for (int i = 0; i < 4; ++i)
	{
		lenSq += q[i] * q[i];
	}
	const float len = sqrtf(lenSq);
	// Guard written as !(len > 0) so non-finite lengths also skip the divide,
	// exactly like the original (a > 0.0f) check.
	if (!(len > 0.0f))
		return;
	for (int i = 0; i < 4; ++i)
	{
		q[i] /= len;
	}
}
41+
42+
// End-to-end training test for QuatNormLayer: learns the mapping from Euler
// angles (yaw, pitch, roll) to unit quaternions with a small FC network whose
// output is renormalized by the quaternion layer. Prints the loss every 1000
// iterations and the mean loss over the final 1000 iterations; returns 0.
int TestQuatNorm()
{
	// Tensors are (d0, d1) = (channels, batch): 64 Euler triplets per sample
	// as input, 64 quaternions per sample as target, batch size 32.
	ff::CudaTensor x(3 * 64, 32);
	ff::CudaTensor y(4 * 64, 32);
	ff::CudaTensor xTest(3 * 64, 32);
	ff::CudaTensor yTest(4 * 64, 32);
	// Build a fixed held-out set of random angles and their exact quaternions.
	for (int batch = 0; batch < xTest._d1; ++batch)
	{
		for (int elem = 0; elem < 64; ++elem)
		{
			int baseIndexX = batch * 64 * 3 + elem * 3;
			int baseIndexY = batch * 64 * 4 + elem * 4;
			// Uniform angles in [-pi, pi).
			float yaw = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			float pitch = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			float roll = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			xTest._data[baseIndexX] = yaw;
			xTest._data[baseIndexX + 1] = pitch;
			xTest._data[baseIndexX + 2] = roll;
			EulerToQuat(&yTest._data[baseIndexY], yaw, pitch, roll);
		}
	}
	xTest.PushToGpu();
	yTest.PushToGpu();

	float learningRate = 0.001f;
	// Network: FC(192->1000) -> ReLU -> FC(1000->256) -> QuatNorm -> L2 loss.
	ff::CudaNn nn;
	nn.AddFc(3*64, 1000);
	nn.AddRelu();
	nn.AddFc(1000, 4 * 64);
	nn.AddQuatNorm();
	nn.AddSumOfSquares();

	// Ring buffer of the most recent per-iteration test losses.
	float lastLoss[1000];
	for (int i = 0; i < 120000; ++i)
	{
		// Step the learning rate down 10x at ~50k and ~100k iterations.
		if (i == 49999)
		{
			learningRate *= 0.1f;
		}
		if (i == 99999)
		{
			learningRate *= 0.1f;
		}
		// Fresh random training batch each iteration.
		for (int batch = 0; batch < x._d1; ++batch)
		{
			for (int elem = 0; elem < 64; ++elem)
			{
				int baseIndexX = batch * 64 * 3 + elem * 3;
				int baseIndexY = batch * 64 * 4 + elem * 4;
				float yaw = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				float pitch = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				float roll = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				x._data[baseIndexX] = yaw;
				x._data[baseIndexX + 1] = pitch;
				x._data[baseIndexX + 2] = roll;
				EulerToQuat(&y._data[baseIndexY], yaw, pitch, roll);
			}
		}
		x.PushToGpu();
		y.PushToGpu();

		// One optimization step per batch (loop kept for easy experimenting).
		for (int j = 0; j < 1; ++j)
		{
			nn.Forward(&x, true);
			nn.Backward(&y);
			nn.UpdateWs(learningRate);
		}

		// Evaluate on the held-out set. NOTE(review): Forward is called
		// without the train flag here — presumably it defaults to inference
		// mode; confirm against the CudaNn::Forward declaration.
		ff::CudaTensor* yPred = const_cast<ff::CudaTensor*>(nn.Forward(&xTest));
		yPred->PullFromGpu();

		// Mean Euclidean distance between predicted (re-normalized on the
		// CPU) and exact quaternions.
		float loss = 0.0;
		for (int r = 0; r < yPred->_d1; ++r)
		{
			for (int c = 0; c < yPred->_d0; c+=4)
			{
				int index = c + r * yPred->_d0;
				NormalizeQuat(&yPred->_data[index]);
				float aa = yPred->_data[index + 0] - yTest._data[index + 0];
				float bb = yPred->_data[index + 1] - yTest._data[index + 1];
				float cc = yPred->_data[index + 2] - yTest._data[index + 2];
				float dd = yPred->_data[index + 3] - yTest._data[index + 3];
				loss += sqrtf(aa * aa + bb * bb + cc * cc + dd * dd);
			}
		}
		loss /= (yPred->_d1 * yPred->_d0 / 4); // average over all quaternions
		lastLoss[i % 1000] = loss;
		if (0 == i % 1000)
			printf("[%05d]loss: %f\n", i, loss);
	}

	// Report the average loss over the final 1000 iterations (the ring buffer
	// is fully populated since 120000 >= 1000).
	float loss = 0.0f;
	for (int i = 0; i < 1000; ++i)
	{
		loss += lastLoss[i];
	}
	printf("Last 1000's loss: %f\n", loss / 1000.0f);
	return 0;

}
142+
7143
int simple()
8144
{
9145
#if 1
10146
float learningRate = 0.0001f;
11147
ff::CudaNn nn;
12-
nn.AddFc(1000, 1000);
148+
nn.AddFc(1000, 2000);
13149
nn.AddRelu();
14-
nn.AddFc(1000, 500);
15-
nn.AddDropout(0.5f);
150+
nn.AddFc(2000, 500);
16151
nn.AddRelu();
17152
nn.AddFc(500, 500);
18153
nn.AddRelu();
@@ -73,7 +208,8 @@ int simple()
73208

74209
// Entry point. Currently runs the QuatNormLayer training test; the other
// demos are kept commented out for quick switching.
int main()
{
	return TestQuatNorm();
	//return cifar10();
	//return mnist();
	//return simple();
}

0 commit comments

Comments (0)