Skip to content

Commit 8901e83

Browse files
committed
Added QuatNormLayer
1 parent 7186416 commit 8901e83

File tree

3 files changed

+244
-6
lines changed

3 files changed

+244
-6
lines changed

ffCudaNn.cpp

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ namespace ff
1313
///////////////////////////////////////////////////////////////////////
1414
//std::default_random_engine g_generator;
1515
std::default_random_engine g_generator(static_cast<int>(std::chrono::steady_clock::now().time_since_epoch().count()));
16-
static std::uniform_real_distribution<float> g_uniformDistribution;
16+
std::uniform_real_distribution<float> g_uniformDistribution;
1717
static std::normal_distribution<float> g_normalDistribution(0.0f, 1.0f);
1818

1919
CudaTensor::CudaTensor() : _d0(0), _d1(0), _d2(0), _d3(0), _dataSize(0), _dataGpu(nullptr), _dataGpuSize(0)
@@ -46,7 +46,6 @@ namespace ff
4646
_d0 = d0; _d1 = d1; _d2 = d2; _d3 = d3;
4747
_dataSize = _d0 * _d1 * _d2 * _d3;
4848
_data.resize(_dataSize);
49-
5049
if (_dataGpuSize < _dataSize)
5150
{
5251
_dataGpuSize = _dataSize;
@@ -1058,6 +1057,84 @@ namespace ff
10581057
_yGdropped.PullFromGpu();
10591058
}
10601059

1060+
// Normalizes each group of 4 consecutive floats in x to a unit quaternion and
// writes the result to y. One thread handles one quaternion; nJobs is the
// total quaternion count to process. nQuats is the per-row quaternion count,
// though the batch/elem split below algebraically reduces to base = 4 * job,
// so any positive nQuats yields the same addressing.
__global__ void ForwardQuatNorm_Cuda(float* y, const float* x, int nQuats, int nJobs)
{
	int job = blockIdx.x * blockDim.x + threadIdx.x;
	if (job >= nJobs) return;

	// First component of this thread's quaternion.
	int base = (job / nQuats) * nQuats * 4 + (job % nQuats) * 4;

	// Squared magnitude, accumulated in the same left-to-right order as before.
	float sumSq = 0.0f;
	for (int i = 0; i < 4; ++i)
	{
		sumSq += x[base + i] * x[base + i];
	}
	// Small epsilon keeps the divisions finite for an all-zero quaternion.
	float len = sqrtf(sumSq) + 1e-8f;
	for (int i = 0; i < 4; ++i)
	{
		y[base + i] = x[base + i] / len;
	}
}
1075+
1076+
// Forward pass: renormalizes every consecutive group of 4 channels of x into
// a unit quaternion and writes the result to _y (same shape as x).
// Quaternions lie along d0, as the assert below requires.
const CudaTensor* QuatNormLayer::Forward(const CudaTensor* x)
{
	assert(x->_d0 % 4 == 0);
	_pX = x;
	_y.ResetTensor(x->_d0, x->_d1, x->_d2, x->_d3);

	// One thread per quaternion across the whole tensor. Using _dataSize
	// (= d0*d1*d2*d3, a multiple of 4 by the assert above) instead of
	// d0*d1 keeps this correct when d2*d3 > 1; it is identical to the old
	// value when d2 == d3 == 1.
	int nJobs = _pX->_dataSize / 4;
	int nBlocks = (nJobs + K_SMALL_THREAD_PER_BLOCK - 1) / K_SMALL_THREAD_PER_BLOCK;
	dim3 block(nBlocks), threads(K_SMALL_THREAD_PER_BLOCK);
	// Pass _d0 / 4 as the per-row quaternion count, matching the data layout
	// and Backward(); the previous _d1 / 4 only produced correct results
	// because the kernel's index arithmetic collapses to baseIndex = 4*index
	// for any positive nQuats.
	ForwardQuatNorm_Cuda <<< block, threads >>> (_y._dataGpu, _pX->_dataGpu, _pX->_d0 / 4, nJobs);
	assert(cudaGetLastError() == cudaSuccess);

	return &_y;
}
1090+
1091+
// Backward pass of quaternion normalization.
// For y = q / |q| the Jacobian is dy_i/dq_j = (|q|^2 * delta_ij - q_i * q_j) / |q|^3,
// so the input gradient is xG_i = (|q|^2 * yG_i - q_i * (q . yG)) / |q|^3.
// One thread handles one quaternion; nJobs is the total quaternion count and
// nQuats the per-row count (the batch/elem split reduces to baseIndex = 4*index).
__global__ void BackwardQuatNorm_Cuda(float* xG, const float* x, const float* yG, int nQuats, int nJobs)
{
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	if (index >= nJobs) return;

	int batch = index / nQuats;
	int elem = index % nQuats;
	// First component of this thread's quaternion.
	int baseIndex = batch * nQuats * 4 + elem * 4;
	float q[4] = { x[baseIndex], x[baseIndex + 1], x[baseIndex + 2], x[baseIndex + 3] };
	float tYg[4] = { yG[baseIndex], yG[baseIndex + 1], yG[baseIndex + 2], yG[baseIndex + 3] };
	float squaredSum = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
	// a = 1 / |q|^3; epsilon avoids division by zero. NOTE(review): the
	// forward kernel adds its epsilon after the sqrt instead, so the two
	// regularizations differ slightly near |q| = 0 — confirm this is intended.
	float a = powf(squaredSum + 1e-8f, -1.5f);
	for (int i = 0; i < 4; ++i)
	{
		float b = squaredSum * tYg[i]; // |q|^2 * yG_i
		float c = 0.0f;                // accumulates q_i * (q . yG)
		for (int j = 0; j < 4; ++j)
		{
			c += (q[i] * q[j] * tYg[j]);
		}
		xG[baseIndex + i] = a * (b - c);
	}
}
1114+
1115+
// Backward pass: applies the Jacobian of quaternion normalization to the
// incoming gradient yG and stores the result in _xG (same shape as the input).
// When layerIndex == 0 the kernel is skipped and _xG is only resized —
// presumably because the first layer's input gradient has no consumer
// (consistent with the guard's use here); verify against the other layers.
const CudaTensor* QuatNormLayer::Backward(const CudaTensor* yG, const int layerIndex)
{
	assert(yG->_dataSize == _pX->_dataSize);
	_xG.ResetTensor(_pX->_d0, _pX->_d1, _pX->_d2, _pX->_d3);

	if (layerIndex > 0)
	{
		// One thread per quaternion across the whole tensor. Using
		// _dataSize / 4 (rather than d0*d1/4) also covers tensors with
		// d2*d3 > 1 and is identical when d2 == d3 == 1; _d0 % 4 == 0 is
		// asserted in Forward().
		int nJobs = _xG._dataSize / 4;
		int nBlocks = (nJobs + K_SMALL_THREAD_PER_BLOCK - 1) / K_SMALL_THREAD_PER_BLOCK;
		dim3 block(nBlocks), threads(K_SMALL_THREAD_PER_BLOCK);
		BackwardQuatNorm_Cuda <<< block, threads >>> (_xG._dataGpu, _pX->_dataGpu, yG->_dataGpu, _xG._d0 / 4, nJobs);
		assert(cudaGetLastError() == cudaSuccess);
	}

	return &_xG;
}
1131+
1132+
// Copies this layer's GPU-resident buffers — the forward output _y and the
// input gradient _xG — back to host memory for CPU-side inspection.
void QuatNormLayer::Pull()
{
	_y.PullFromGpu();
	_xG.PullFromGpu();
}
1137+
10611138
__global__ void ForwardSoftmax_Cuda(float* softmax , const float* x, int nRow, int nCol)
10621139
{
10631140
int r = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1230,6 +1307,12 @@ namespace ff
12301307
return true;
12311308
}
12321309

1310+
// Appends a quaternion-normalization layer to the network.
// Always succeeds and returns true, like the other Add* helpers.
bool CudaNn::AddQuatNorm()
{
	QuatNormLayer* layer = new QuatNormLayer(this);
	_layers.push_back(layer);
	return true;
}
1315+
12331316
const CudaTensor* CudaNn::Forward(const CudaTensor* x, bool train)
12341317
{
12351318
_train = train;

ffCudaNn.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,23 @@ namespace ff
229229
CudaTensor _yG;
230230
};
231231

232+
// Layer that renormalizes each consecutive group of 4 input channels (along
// d0) to a unit quaternion. Has no trainable parameters.
class QuatNormLayer : public CudaLayer
{
public:
	QuatNormLayer(CudaNn* nn) : CudaLayer(nn) {}

	// Normalizes every quaternion in the input; returns &_y.
	const CudaTensor* Forward(const CudaTensor*) override;

	// Applies the normalization Jacobian to the output gradient; returns &_xG.
	// Skips the kernel when layerIndex == 0.
	const CudaTensor* Backward(const CudaTensor*, const int layerIndex) override;

	// Copies _y and _xG from GPU to host memory.
	void Pull() override;

public:
	const CudaTensor* _pX; // last forward input (not owned)
	CudaTensor _y;         // forward output
	CudaTensor _xG;        // gradient w.r.t. the input
};
248+
232249
class CudaNn
233250
{
234251
public:
@@ -248,6 +265,8 @@ namespace ff
248265

249266
bool AddBatchNorm2d(int inDim);
250267

268+
bool AddQuatNorm();
269+
251270
bool AddDropout(float dropoutRatio);
252271

253272
bool AddSoftmax();

main.cpp

Lines changed: 140 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,153 @@
11
#include <stdio.h>
2+
#include <math.h>
3+
#include <random>
24
#include "ffCudaNn.h"
35

46
int mnist();
57
int cifar10();
68

9+
namespace ff
10+
{
11+
extern std::default_random_engine g_generator;
12+
extern std::uniform_real_distribution<float> g_uniformDistribution;
13+
}
14+
15+
// Converts Euler angles (radians) to a quaternion stored as q = {w, x, y, z},
// using the standard yaw-pitch-roll (ZYX) composition.
void EulerToQuat(float* q, float yaw, float pitch, float roll)
{
	const float halfYaw = yaw * 0.5f;
	const float halfPitch = pitch * 0.5f;
	const float halfRoll = roll * 0.5f;
	const float cosYaw = cosf(halfYaw), sinYaw = sinf(halfYaw);
	const float cosPitch = cosf(halfPitch), sinPitch = sinf(halfPitch);
	const float cosRoll = cosf(halfRoll), sinRoll = sinf(halfRoll);

	q[0] = cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw; // w
	q[1] = sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw; // x
	q[2] = cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw; // y
	q[3] = cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw; // z
}
29+
30+
// Scales the 4-float quaternion q to unit length in place.
// A zero-length quaternion is left untouched to avoid dividing by zero.
void NormalizeQuat(float* q)
{
	float lenSq = 0.0f;
	for (int i = 0; i < 4; ++i)
	{
		lenSq += q[i] * q[i];
	}
	const float len = sqrtf(lenSq);
	// Guard written as !(len > 0) so non-finite lengths also skip the divide,
	// exactly like the original (a > 0.0f) check.
	if (!(len > 0.0f))
		return;
	for (int i = 0; i < 4; ++i)
	{
		q[i] /= len;
	}
}
41+
42+
// End-to-end training test for QuatNormLayer: learns the mapping from Euler
// angles (yaw, pitch, roll) to unit quaternions with a small FC network whose
// output is renormalized by the quaternion layer. Prints the loss every 1000
// iterations and the mean loss over the final 1000 iterations; returns 0.
int TestQuatNorm()
{
	// Tensors are (d0, d1) = (channels, batch): 64 Euler triplets per sample
	// as input, 64 quaternions per sample as target, batch size 32.
	ff::CudaTensor x(3 * 64, 32);
	ff::CudaTensor y(4 * 64, 32);
	ff::CudaTensor xTest(3 * 64, 32);
	ff::CudaTensor yTest(4 * 64, 32);
	// Build a fixed held-out set of random angles and their exact quaternions.
	for (int batch = 0; batch < xTest._d1; ++batch)
	{
		for (int elem = 0; elem < 64; ++elem)
		{
			int baseIndexX = batch * 64 * 3 + elem * 3;
			int baseIndexY = batch * 64 * 4 + elem * 4;
			// Uniform angles in [-pi, pi).
			float yaw = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			float pitch = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			float roll = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
			xTest._data[baseIndexX] = yaw;
			xTest._data[baseIndexX + 1] = pitch;
			xTest._data[baseIndexX + 2] = roll;
			EulerToQuat(&yTest._data[baseIndexY], yaw, pitch, roll);
		}
	}
	xTest.PushToGpu();
	yTest.PushToGpu();

	float learningRate = 0.001f;
	// Network: FC(192->1000) -> ReLU -> FC(1000->256) -> QuatNorm -> L2 loss.
	ff::CudaNn nn;
	nn.AddFc(3*64, 1000);
	nn.AddRelu();
	nn.AddFc(1000, 4 * 64);
	nn.AddQuatNorm();
	nn.AddSumOfSquares();

	// Ring buffer of the most recent per-iteration test losses.
	float lastLoss[1000];
	for (int i = 0; i < 120000; ++i)
	{
		// Step the learning rate down 10x at ~50k and ~100k iterations.
		if (i == 49999)
		{
			learningRate *= 0.1f;
		}
		if (i == 99999)
		{
			learningRate *= 0.1f;
		}
		// Fresh random training batch each iteration.
		for (int batch = 0; batch < x._d1; ++batch)
		{
			for (int elem = 0; elem < 64; ++elem)
			{
				int baseIndexX = batch * 64 * 3 + elem * 3;
				int baseIndexY = batch * 64 * 4 + elem * 4;
				float yaw = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				float pitch = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				float roll = (ff::g_uniformDistribution(ff::g_generator) * 2.0f - 1.0f) * 3.141592f;
				x._data[baseIndexX] = yaw;
				x._data[baseIndexX + 1] = pitch;
				x._data[baseIndexX + 2] = roll;
				EulerToQuat(&y._data[baseIndexY], yaw, pitch, roll);
			}
		}
		x.PushToGpu();
		y.PushToGpu();

		// One optimization step per batch (loop kept for easy experimenting).
		for (int j = 0; j < 1; ++j)
		{
			nn.Forward(&x, true);
			nn.Backward(&y);
			nn.UpdateWs(learningRate);
		}

		// Evaluate on the held-out set. NOTE(review): Forward is called
		// without the train flag here — presumably it defaults to inference
		// mode; confirm against the CudaNn::Forward declaration.
		ff::CudaTensor* yPred = const_cast<ff::CudaTensor*>(nn.Forward(&xTest));
		yPred->PullFromGpu();

		// Mean Euclidean distance between predicted (re-normalized on the
		// CPU) and exact quaternions.
		float loss = 0.0;
		for (int r = 0; r < yPred->_d1; ++r)
		{
			for (int c = 0; c < yPred->_d0; c+=4)
			{
				int index = c + r * yPred->_d0;
				NormalizeQuat(&yPred->_data[index]);
				float aa = yPred->_data[index + 0] - yTest._data[index + 0];
				float bb = yPred->_data[index + 1] - yTest._data[index + 1];
				float cc = yPred->_data[index + 2] - yTest._data[index + 2];
				float dd = yPred->_data[index + 3] - yTest._data[index + 3];
				loss += sqrtf(aa * aa + bb * bb + cc * cc + dd * dd);
			}
		}
		loss /= (yPred->_d1 * yPred->_d0 / 4); // average over all quaternions
		lastLoss[i % 1000] = loss;
		if (0 == i % 1000)
			printf("[%05d]loss: %f\n", i, loss);
	}

	// Report the average loss over the final 1000 iterations (the ring buffer
	// is fully populated since 120000 >= 1000).
	float loss = 0.0f;
	for (int i = 0; i < 1000; ++i)
	{
		loss += lastLoss[i];
	}
	printf("Last 1000's loss: %f\n", loss / 1000.0f);
	return 0;

}
142+
7143
int simple()
8144
{
9145
#if 1
10146
float learningRate = 0.0001f;
11147
ff::CudaNn nn;
12-
nn.AddFc(1000, 1000);
148+
nn.AddFc(1000, 2000);
13149
nn.AddRelu();
14-
nn.AddFc(1000, 500);
15-
nn.AddDropout(0.5f);
150+
nn.AddFc(2000, 500);
16151
nn.AddRelu();
17152
nn.AddFc(500, 500);
18153
nn.AddRelu();
@@ -73,7 +208,8 @@ int simple()
73208

74209
// Entry point. Currently runs the QuatNormLayer training test; the other
// demos are kept commented out for quick switching.
int main()
{
	return TestQuatNorm();
	//return cifar10();
	//return mnist();
	//return simple();
}

0 commit comments

Comments (0)