
Commit dd1d03b

+ Batch norm layer, + Data augmentation
1 parent 53e37ef commit dd1d03b

File tree: 7 files changed (+223, -368 lines)

CMakeLists.txt (+3)

@@ -25,6 +25,9 @@ message (STATUS "C_FLAGS: " ${CMAKE_C_FLAGS})
 message (STATUS "CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
 message (STATUS "CXX_FLAGS_DEBUG: " ${CMAKE_CXX_FLAGS_DEBUG})
 message (STATUS "CXX_FLAGS_RELEASE: " ${CMAKE_CXX_FLAGS_RELEASE})
+message (STATUS "CUDA_FLAGS: " ${CMAKE_CUDA_FLAGS})
+message (STATUS "CUDA_FLAGS_DEBUG: " ${CMAKE_CUDA_FLAGS_DEBUG})
+message (STATUS "CUDA_FLAGS_RELEASE: " ${CMAKE_CUDA_FLAGS_RELEASE})
 message (STATUS "#############################################")

 include_directories (
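The new status lines simply echo whatever CUDA compile flags the build was configured with. For reference, a typical way to pass them at configure time (the flag values are illustrative, not taken from the repository; sm_75 matches the RTX 2070 mentioned in the README):

```
cmake -DCMAKE_CUDA_FLAGS="-arch=sm_75" -DCMAKE_BUILD_TYPE=Release ..
```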

README.md (+3, -2)

@@ -7,6 +7,7 @@ I focused on simplicity and conciseness while coding. That means there is no err
 #### Weight layers
 * 2D Convolutional
 * Fully connected
+* Batch normalization
 
 #### Non-linearity
 * Relu
@@ -29,9 +30,9 @@ I focused on simplicity and conciseness while coding. That means there is no err
 After basic components for deep learning implemented, I built a handwritten digit recognizer using [MNIST database](http://yann.lecun.com/exdb/mnist/). A simple 2-layer FCNN(1000 hidden unit) could achieve 1.56% Top-1 error rate after 14 epochs which take less than 20 seconds of training time on RTX 2070 graphics card. (See [mnist.cpp](mnist.cpp))
 
 ### CIFAR-10 photo classification
-![cifar10_cmd](https://user-images.githubusercontent.com/670560/92716821-4ec0e900-f39a-11ea-80f9-88bcad3d9c8b.png)
+![top1_error_rate](https://user-images.githubusercontent.com/670560/93190845-cddd6500-f77e-11ea-8ef0-6c6fe57c9d53.png)
 
-In [cifar10.cpp](cifar10.cpp), you can find a VGG-like convolutional network which has 8 weight layers. [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset without augmentation is used to train the model. It achieves 25.9% top-1 error rate after 16 epoches. It took 33 seconds per epoch on my RTX 2070. If you try a larger model and have enough time to train you can improve it. Data augmentation and batch normalization will be helpful.
+In [cifar10.cpp](cifar10.cpp), you can find a VGG-like convolutional network which has 8 weight layers. The [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset is used to train the model. It achieves a 16.8% top-1 error rate after 33 epochs, at 26 seconds of training time per epoch on my RTX 2070. If you try a larger model and have enough time to train, you can improve it.
 
 ### Notes
 - Even naive CUDA implementation easily speeds up by 700x more than single-core/no-SIMD CPU version.

cifar10.cpp (+94, -42)

@@ -5,21 +5,37 @@
 #include <algorithm>
 #include <math.h>
 #include <chrono>
+#include <random>
 #include "ffCudaNn.h"

+namespace ff
+{
+    extern std::default_random_engine g_generator;
+}
+
 class ProfileScope
 {
 public:
-    ProfileScope(const char* msg) : _msg(msg)
+    ProfileScope(const char* msg) : _msg(msg), _delta(-1.0f)
     {
         _s = std::chrono::high_resolution_clock::now();
     }
     ~ProfileScope()
     {
-        std::chrono::duration<float> delta = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
-        printf("%s [%fs]\n", _msg, delta.count());
+        if (_delta < 0.0f)
+        {
+            EndScope();
+        }
+        printf("%s [%fs]\n", _msg, _delta);
+    }
+    void EndScope()
+    {
+        std::chrono::duration<float> delta =
+            std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
+        _delta = delta.count();
     }
     const char* _msg;
+    float _delta;
     std::chrono::high_resolution_clock::time_point _s;
 };

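EndScope() lets the caller stop the timer explicitly; the destructor now falls back to EndScope() only if it was never called, then prints the stored duration. A minimal usage sketch (not part of the commit), mirroring how cifar10() uses it further down in this diff:

```cpp
// Usage sketch only: the scope measures just the work done before EndScope();
// anything that runs afterwards (such as the validation/test passes in
// cifar10()) is excluded from the printed time.
void TrainOneEpoch()
{
    ProfileScope scope("-- Epoch 001(lr: 0.001000)");
    // ... forward/backward/update over all training batches ...
    scope.EndScope();   // timing stops here
    // ... validation and test loss computation, not timed ...
}   // destructor prints: "-- Epoch 001(lr: 0.001000) [<seconds>s]"
```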
@@ -64,6 +80,16 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
         nLeft -= batchSize;
     }

+    std::vector<int> order(numTotalImages);
+    for (int i = 0; i < numTotalImages; ++i)
+    {
+        order[i] = i;
+    }
+    if (true == augment)
+    {
+        std::shuffle(order.begin(), order.end(), ff::g_generator);
+    }
+
     // Data normalization
     float mean[3] = { 0.4914f, 0.4822f, 0.4465f };
     float std[3] = { 0.2023f, 0.1994f, 0.2010f };
@@ -81,18 +107,19 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
     for (int j = 0; j < kNumImagePerFile; ++j)
     {
         bool bFlip = false;
-        if (true == augment && 1 == rand() % 2) bFlip = true;
-        int batchIndex = imageCounter / batchSize;
-        int elementIndex = imageCounter % batchSize;
+        if (true == augment && 1 == ff::g_generator() % 2) bFlip = true;
+        int batchIndex = order[imageCounter] / batchSize;
+        int elementIndex = order[imageCounter] % batchSize;
         labels[batchIndex]._data[elementIndex] = static_cast<float>(*pCurr++);
+        int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
         for (int ch = 0; ch < kNumChannel; ++ch)
         {
             for (int row = 0; row < 32; ++row)
             {
                 for (int col = 0; col < 32; ++col)
                 {
-                    float val = *pCurr++;
-                    int index = elementIndex * kNumBytesPerChannel * kNumChannel + ch * kNumBytesPerChannel;
+                    float val = static_cast<float>(*pCurr++);
+                    int index = baseIndex + ch * kNumBytesPerChannel;
                     if (true == bFlip)
                     {
                         index += (row * 32 + (31 - col));
@@ -101,15 +128,14 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                     {
                         index += (row * 32 + col);
                     }
-                    images[batchIndex]._data[index] = ((val / 255.0f) - mean[ch]) / std[ch];
+                    images[batchIndex]._data[index] = val / 255.0f;
                 }
             }
         }
         if (true == augment)
         {
             int shift = 8;
             int newSize = 32 + shift;
-            int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
             buffer.resize(newSize * newSize * kNumChannel);
             for (int ch = 0; ch < kNumChannel; ++ch)
             {
@@ -123,8 +149,8 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                     }
                 }
             }
-            int rowShift = static_cast<int>(rand() % (shift+1));
-            int colShift = static_cast<int>(rand() % (shift+1));
+            int rowShift = static_cast<int>(ff::g_generator() % (shift+1));
+            int colShift = static_cast<int>(ff::g_generator() % (shift+1));
             for (int ch = 0; ch < kNumChannel; ++ch)
             {
                 for (int row = 0; row < 32; ++row)
@@ -137,6 +163,18 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                 }
             }
         }
+        for (int ch = 0; ch < kNumChannel; ++ch)
+        {
+            for (int row = 0; row < 32; ++row)
+            {
+                for (int col = 0; col < 32; ++col)
+                {
+                    int index = baseIndex + ch * kNumBytesPerChannel + row * 32 + col;
+                    images[batchIndex]._data[index] = (images[batchIndex]._data[index] - mean[ch])/std[ch];
+                }
+            }
+        }
+
         ++imageCounter;
         if (imageCounter >= numTotalImages)
             break;
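Taken together, the LoadCifar10 hunks above implement the commit's data augmentation: images are shuffled into random batch slots, optionally flipped horizontally, copied into a (32+8)x(32+8) scratch buffer and read back through a random row/column shift in [0, 8], and only then standardized with the per-channel mean/std (the raw read loop now just scales pixels to [0, 1]). Below is a standalone sketch of the flip-and-shift step; the helper name and the padding offset are illustrative assumptions, since the buffer-filling hunk is not shown in this diff:

```cpp
#include <vector>

// Illustrative sketch, not code from the commit: flip and shift one 32x32
// channel via an enlarged scratch canvas, in the spirit of LoadCifar10.
void FlipShiftChannel(const float* src, float* dst, bool flip, int rowShift, int colShift)
{
    const int kSize = 32, kShift = 8, kPadded = kSize + kShift;
    std::vector<float> canvas(kPadded * kPadded, 0.0f);
    for (int row = 0; row < kSize; ++row)
    {
        for (int col = 0; col < kSize; ++col)
        {
            int srcCol = flip ? (kSize - 1 - col) : col;
            // The centering offset is an assumption for illustration only.
            canvas[(row + kShift / 2) * kPadded + (col + kShift / 2)] = src[row * kSize + srcCol];
        }
    }
    // rowShift and colShift are drawn uniformly from [0, kShift] by the caller,
    // e.g. ff::g_generator() % (kShift + 1) as in the hunk above.
    for (int row = 0; row < kSize; ++row)
    {
        for (int col = 0; col < kSize; ++col)
        {
            dst[row * kSize + col] = canvas[(row + rowShift) * kPadded + (col + colShift)];
        }
    }
}
```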
@@ -183,7 +221,11 @@ int ComputeLoss(ff::CudaNn& nn, std::vector<ff::CudaTensor>& images, std::vector

 int cifar10()
 {
+    // Note(dongwook): Hyper-parameters
+    const bool augmentDataSet = false;
     const int kBatchSize = 100;
+    const int kDataSetScalerInv = 1;
+    float learningRate = 0.001f;

     std::vector<std::string> trainingDataFilenames;
     trainingDataFilenames.push_back("cifar-10/data_batch_1.bin");
@@ -193,12 +235,12 @@ int cifar10()
     trainingDataFilenames.push_back("cifar-10/data_batch_5.bin");
     std::vector<ff::CudaTensor> trainingImages;
     std::vector<ff::CudaTensor> trainingLabels;
-    LoadCifar10(kBatchSize, 5000, false, trainingDataFilenames, trainingImages, trainingLabels);
+    LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, false, trainingDataFilenames, trainingImages, trainingLabels);
     std::vector<std::string> testDataFilenames;
     testDataFilenames.push_back("cifar-10/test_batch.bin");
     std::vector<ff::CudaTensor> testImages;
     std::vector<ff::CudaTensor> testLabels;
-    LoadCifar10(kBatchSize, 1000, false, testDataFilenames, testImages, testLabels);
+    LoadCifar10(kBatchSize, 10000 / kDataSetScalerInv , false, testDataFilenames, testImages, testLabels);

 #if 1
     ff::CudaNn nn;
@@ -224,9 +266,9 @@ int cifar10()
     nn.AddBatchNorm2d(256);
     nn.AddRelu();
     nn.AddMaxPool();
-    nn.AddFc(4 * 256, 1024);
+    nn.AddFc(4 * 256, 1000);
     nn.AddRelu();
-    nn.AddFc(1024, 10);
+    nn.AddFc(1000, 10);
     nn.AddSoftmax();
 #else
     ff::CudaNn nn;
@@ -271,71 +313,81 @@ int cifar10()
     nn.AddSoftmax();
 #endif

-    float learningRate = 0.001f;
-
-    float last_train_loss = 0.0f;
-    float lowest_train_loss = 1e8f;
+    float last_validation_loss = 0.0f;
+    float lowest_validation_loss = 1e8f;
     float last_test_loss = 0.0f;
     float lowest_test_loss = 1e8f;
-    const int numBatch = (int)trainingImages.size();
-    const int kNumEpoch = 200;
+    const int kNumEpoch = 100;
     for (int i = 0; i < kNumEpoch; ++i)
     {
         float currLearningRate = learningRate;

         // gradual decay
         //const float kDecay = 0.2f;
-        //const int kCooldown = 3;
+        //const int kCooldown = 24;
         //if (i >= kCooldown)
         //{
         //    currLearningRate *= expf(-1.0f * kDecay * (i - kCooldown));
         //}

+        if (true == augmentDataSet && 0 != i)
+        {
+            LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, true, trainingDataFilenames, trainingImages, trainingLabels);
+        }
+
         char buffer[2048];
         sprintf(buffer, "-- Epoch %03d(lr: %f)", i + 1, currLearningRate);
         ProfileScope __m(buffer);

         // Training
-        int trainingImageCounter = 0;
-        float train_loss = 0.0f;
-        int top1 = 0, top3 = 0, top5 = 0;
+        const int numBatch = (int)trainingImages.size();
         for (int j = 0; j < numBatch; ++j)
         {
-            const ff::CudaTensor* pSoftmax = nullptr;
-            pSoftmax = nn.Forward(&trainingImages[j], true);
+            nn.Forward(&trainingImages[j], true);
             nn.Backward(&trainingLabels[j]);
             nn.UpdateWs(currLearningRate);
+        }
+        __m.EndScope();

-            // train loss
+        // Validation loss
+        int validationImageCounter = 0;
+        float validation_loss = 0.0f;
+        int top1 = 0, top3 = 0, top5 = 0;
+        for (int j = 0; j < numBatch / 10; ++j)
+        {
+            // Note(dongwook): You should call Forward() several times after training if BatchNorm layers exist.
+            // In the subsequent calls, mean and variance parameters are set to make the network deterministic.
+            const ff::CudaTensor* pSoftmax = nullptr;
+            pSoftmax = nn.Forward(&trainingImages[j], true);
             const_cast<ff::CudaTensor*>(pSoftmax)->PullFromGpu();
             for (int k = 0; k < pSoftmax->_d1; ++k)
             {
                 float val = pSoftmax->_data[static_cast<int>(trainingLabels[j]._data[k]) + pSoftmax->_d0 * k];
                 assert(val > 0.0f);
                 if (val > 0.0f)
                 {
-                    ++trainingImageCounter;
-                    train_loss += -logf(val);
+                    ++validationImageCounter;
+                    validation_loss += -logf(val);
                 }
             }
             int t1, t3, t5;
             CheckAccuracy(pSoftmax, trainingLabels[j], t1, t3, t5);
             top1 += t1; top3 += t3; top5 += t5;
         }
-        if (trainingImageCounter <= 0) trainingImageCounter = 1;
-        train_loss /= trainingImageCounter;
-        if (0 == i) last_train_loss = train_loss;
-        if (train_loss < lowest_train_loss)
+        if (validationImageCounter <= 0) validationImageCounter = 1;
+        validation_loss /= validationImageCounter;
+        if (0 == i) last_validation_loss = validation_loss;
+        if (validation_loss < lowest_validation_loss)
         {
-            lowest_train_loss = train_loss;
+            lowest_validation_loss = validation_loss;
         }
-        printf("Train[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
-            trainingImageCounter,
-            train_loss, train_loss - last_train_loss, lowest_train_loss,
-            top1, top1 * 100.0f / trainingImageCounter,
+        printf("Val_[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+            validationImageCounter,
+            validation_loss, validation_loss - last_validation_loss, lowest_validation_loss,
+            top1, top1 * 100.0f / validationImageCounter,
             top3,
             top5);
-        last_train_loss = train_loss;
+        last_validation_loss = validation_loss;

         // Test loss
         {
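The Note(dongwook) comment in this hunk relies on how batch normalization layers typically track statistics: training-mode forward passes normalize with per-batch mean/variance and fold them into running estimates, and those running estimates are what a deterministic evaluation pass uses, so a few extra Forward(..., true) calls after the weight updates let them settle. A hedged sketch of that bookkeeping (the momentum value and names are assumptions, not taken from ffCudaNn):

```cpp
// Assumed sketch of the usual BatchNorm running-statistics update; the actual
// ffCudaNn implementation may differ. Extra training-mode forward passes after
// the weight updates let these estimates settle, which is what makes the
// subsequent evaluation deterministic.
struct RunningStats
{
    float mean = 0.0f;
    float var = 1.0f;
    void Update(float batchMean, float batchVar, float momentum = 0.1f)
    {
        mean = (1.0f - momentum) * mean + momentum * batchMean;
        var = (1.0f - momentum) * var + momentum * batchVar;
    }
};
```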
@@ -348,7 +400,7 @@ int cifar10()
             {
                 lowest_test_loss = test_loss;
             }
-            printf("Test_[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+            printf("Test[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
                 testCounter,
                 test_loss, test_loss - last_test_loss, lowest_test_loss,
                 top1, top1 * 100.0f / testCounter,
