 #include <algorithm>
 #include <math.h>
 #include <chrono>
+#include <random>
 #include "ffCudaNn.h"
 
+namespace ff
+{
+    extern std::default_random_engine g_generator;
+}
+
 class ProfileScope
 {
 public:
-    ProfileScope(const char* msg) : _msg(msg)
+    ProfileScope(const char* msg) : _msg(msg), _delta(-1.0f)
     {
         _s = std::chrono::high_resolution_clock::now();
     }
     ~ProfileScope()
     {
-        std::chrono::duration<float> delta = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
-        printf("%s[%fs]\n", _msg, delta.count());
+        if (_delta < 0.0f)
+        {
+            EndScope();
+        }
+        printf("%s[%fs]\n", _msg, _delta);
+    }
+    void EndScope()
+    {
+        std::chrono::duration<float> delta =
+            std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
+        _delta = delta.count();
     }
     const char* _msg;
+    float _delta;
     std::chrono::high_resolution_clock::time_point _s;
 };
 
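A minimal usage sketch of the new EndScope() path (standalone; Train and Validate are hypothetical stand-ins, not part of this commit): calling EndScope() freezes the measured interval, while the destructor still times the whole scope if EndScope() is never called.

    void Train() {}    // hypothetical: the work being timed
    void Validate() {} // hypothetical: runs inside the scope, but untimed

    void Epoch()
    {
        ProfileScope scope("-- Epoch 001");
        Train();
        scope.EndScope(); // _delta is frozen here
        Validate();       // excluded from the measurement
    }                     // destructor prints "-- Epoch 001[...s]" using the frozen _delta
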
@@ -64,6 +80,16 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
         nLeft -= batchSize;
     }
 
+    std::vector<int> order(numTotalImages);
+    for (int i = 0; i < numTotalImages; ++i)
+    {
+        order[i] = i;
+    }
+    if (true == augment)
+    {
+        std::shuffle(order.begin(), order.end(), ff::g_generator);
+    }
+
     // Data normalization
     float mean[3] = { 0.4914f, 0.4822f, 0.4465f };
     float std[3] = { 0.2023f, 0.1994f, 0.2010f };
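The new order[] block above builds an identity permutation and shuffles it only when augmenting, so each reload sends every image to a fresh batch and slot without touching the file data. The same idea in isolation (a sketch; MakeOrder is a hypothetical helper, with std::iota standing in for the explicit fill loop):

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    std::vector<int> MakeOrder(int numImages, bool shuffle, std::default_random_engine& rng)
    {
        std::vector<int> order(numImages);
        std::iota(order.begin(), order.end(), 0); // order[i] = i
        if (shuffle)
        {
            std::shuffle(order.begin(), order.end(), rng);
        }
        return order; // order[i] = flat destination slot of image i
    }
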
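The mean/std constants are the per-channel statistics commonly quoted for the CIFAR-10 training set on the [0,1] scale, so each pixel ends up standardized as (x / 255 - mean[ch]) / std[ch]. That mapping in isolation (Standardize is a hypothetical helper):

    float Standardize(unsigned char raw, float chMean, float chStd)
    {
        float x = static_cast<float>(raw) / 255.0f; // [0,255] -> [0,1]
        return (x - chMean) / chStd;
    }
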
@@ -81,18 +107,19 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
         for (int j = 0; j < kNumImagePerFile; ++j)
         {
             bool bFlip = false;
-            if (true == augment && 1 == rand() % 2) bFlip = true;
-            int batchIndex = imageCounter / batchSize;
-            int elementIndex = imageCounter % batchSize;
+            if (true == augment && 1 == ff::g_generator() % 2) bFlip = true;
+            int batchIndex = order[imageCounter] / batchSize;
+            int elementIndex = order[imageCounter] % batchSize;
             labels[batchIndex]._data[elementIndex] = static_cast<float>(*pCurr++);
+            // baseIndex: start of this element's C*H*W block, shared by the
+            // pixel loop below and the final normalization pass
+            int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
             for (int ch = 0; ch < kNumChannel; ++ch)
             {
                 for (int row = 0; row < 32; ++row)
                 {
                     for (int col = 0; col < 32; ++col)
                     {
-                        float val = *pCurr++;
-                        int index = elementIndex * kNumBytesPerChannel * kNumChannel + ch * kNumBytesPerChannel;
+                        float val = static_cast<float>(*pCurr++);
+                        int index = baseIndex + ch * kNumBytesPerChannel;
                         if (true == bFlip)
                         {
                             index += (row * 32 + (31 - col));
@@ -101,15 +128,14 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                         {
                             index += (row * 32 + col);
                         }
-                        images[batchIndex]._data[index] = ((val / 255.0f) - mean[ch]) / std[ch];
+                        images[batchIndex]._data[index] = val / 255.0f;
                     }
                 }
             }
             if (true == augment)
             {
                 int shift = 8;
                 int newSize = 32 + shift;
-                int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
                 buffer.resize(newSize * newSize * kNumChannel);
                 for (int ch = 0; ch < kNumChannel; ++ch)
                 {
@@ -123,8 +149,8 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                         }
                     }
                 }
-                int rowShift = static_cast<int>(rand() % (shift+1));
-                int colShift = static_cast<int>(rand() % (shift+1));
+                int rowShift = static_cast<int>(ff::g_generator() % (shift+1));
+                int colShift = static_cast<int>(ff::g_generator() % (shift+1));
                 for (int ch = 0; ch < kNumChannel; ++ch)
                 {
                     for (int row = 0; row < 32; ++row)
@@ -137,6 +163,18 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
                     }
                 }
             }
+            for (int ch = 0; ch < kNumChannel; ++ch)
+            {
+                for (int row = 0; row < 32; ++row)
+                {
+                    for (int col = 0; col < 32; ++col)
+                    {
+                        int index = baseIndex + ch * kNumBytesPerChannel + row * 32 + col;
+                        images[batchIndex]._data[index] = (images[batchIndex]._data[index] - mean[ch]) / std[ch];
+                    }
+                }
+            }
+
             ++imageCounter;
             if (imageCounter >= numTotalImages)
                 break;
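The hunks above make loading two-pass: pixels are first stored as plain [0,1] values, the flip and shift augmentation copies them around in that space, and only the final loop applies mean/std, so augmentation never mixes raw and already-normalized values. The final pass in isolation (StandardizeImage is a hypothetical helper):

    void StandardizeImage(float* img, const float mean[3], const float std[3])
    {
        // img: 3 * 32 * 32 floats, already scaled to [0,1] and augmented
        for (int ch = 0; ch < 3; ++ch)
        {
            for (int px = 0; px < 32 * 32; ++px)
            {
                img[ch * 32 * 32 + px] = (img[ch * 32 * 32 + px] - mean[ch]) / std[ch];
            }
        }
    }
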
@@ -183,7 +221,11 @@ int ComputeLoss(ff::CudaNn& nn, std::vector<ff::CudaTensor>& images, std::vector
 
 int cifar10()
 {
+    // Note(dongwook): Hyper-parameters
+    const bool augmentDataSet = false;
     const int kBatchSize = 100;
+    const int kDataSetScalerInv = 1;
+    float learningRate = 0.001f;
 
     std::vector<std::string> trainingDataFilenames;
     trainingDataFilenames.push_back("cifar-10/data_batch_1.bin");
@@ -193,12 +235,12 @@ int cifar10()
     trainingDataFilenames.push_back("cifar-10/data_batch_5.bin");
     std::vector<ff::CudaTensor> trainingImages;
     std::vector<ff::CudaTensor> trainingLabels;
-    LoadCifar10(kBatchSize, 5000, false, trainingDataFilenames, trainingImages, trainingLabels);
+    LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, false, trainingDataFilenames, trainingImages, trainingLabels);
     std::vector<std::string> testDataFilenames;
     testDataFilenames.push_back("cifar-10/test_batch.bin");
     std::vector<ff::CudaTensor> testImages;
     std::vector<ff::CudaTensor> testLabels;
-    LoadCifar10(kBatchSize, 1000, false, testDataFilenames, testImages, testLabels);
+    LoadCifar10(kBatchSize, 10000 / kDataSetScalerInv, false, testDataFilenames, testImages, testLabels);
 
 #if 1
     ff::CudaNn nn;
@@ -224,9 +266,9 @@ int cifar10()
     nn.AddBatchNorm2d(256);
     nn.AddRelu();
     nn.AddMaxPool();
-    nn.AddFc(4 * 256, 1024);
+    nn.AddFc(4 * 256, 1000);
     nn.AddRelu();
-    nn.AddFc(1024, 10);
+    nn.AddFc(1000, 10);
     nn.AddSoftmax();
 #else
     ff::CudaNn nn;
@@ -271,71 +313,81 @@ int cifar10()
     nn.AddSoftmax();
 #endif
 
-    float learningRate = 0.001f;
-
-    float last_train_loss = 0.0f;
-    float lowest_train_loss = 1e8f;
+    float last_validation_loss = 0.0f;
+    float lowest_validation_loss = 1e8f;
     float last_test_loss = 0.0f;
     float lowest_test_loss = 1e8f;
-    const int numBatch = (int)trainingImages.size();
-    const int kNumEpoch = 200;
+    const int kNumEpoch = 100;
     for (int i = 0; i < kNumEpoch; ++i)
     {
         float currLearningRate = learningRate;
 
         // gradual decay
         // const float kDecay = 0.2f;
-        // const int kCooldown = 3;
+        // const int kCooldown = 24;
         // if (i >= kCooldown)
         // {
         //     currLearningRate *= expf(-1.0f * kDecay * (i - kCooldown));
         // }
 
+        if (true == augmentDataSet && 0 != i)
+        {
+            LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, true, trainingDataFilenames, trainingImages, trainingLabels);
+        }
+
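+        // Note: each LoadCifar10 call above re-rolls the flips, shifts, and
+        // the batch permutation, so every epoch after the first trains on a
+        // freshly augmented, freshly shuffled copy of the data set.
+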
         char buffer[2048];
         sprintf(buffer, "-- Epoch %03d(lr: %f)", i + 1, currLearningRate);
         ProfileScope __m(buffer);
 
         // Training
-        int trainingImageCounter = 0;
-        float train_loss = 0.0f;
-        int top1 = 0, top3 = 0, top5 = 0;
+        const int numBatch = (int)trainingImages.size();
         for (int j = 0; j < numBatch; ++j)
         {
-            const ff::CudaTensor* pSoftmax = nullptr;
-            pSoftmax = nn.Forward(&trainingImages[j], true);
+            nn.Forward(&trainingImages[j], true);
             nn.Backward(&trainingLabels[j]);
             nn.UpdateWs(currLearningRate);
+        }
+        __m.EndScope();
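+        // EndScope() freezes the epoch timer here, so the validation and test
+        // passes below are excluded from the printed epoch time.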
 
-            // train loss
+        // Validation loss
+        int validationImageCounter = 0;
+        float validation_loss = 0.0f;
+        int top1 = 0, top3 = 0, top5 = 0;
+        for (int j = 0; j < numBatch / 10; ++j)
+        {
+            // Note(dongwook): You should call Forward() several times after training if BatchNorm layers exist.
+            // In the subsequent calls, the mean and variance parameters are set to make the network deterministic.
+            const ff::CudaTensor* pSoftmax = nullptr;
+            pSoftmax = nn.Forward(&trainingImages[j], true);
             const_cast<ff::CudaTensor*>(pSoftmax)->PullFromGpu();
             for (int k = 0; k < pSoftmax->_d1; ++k)
             {
                 float val = pSoftmax->_data[static_cast<int>(trainingLabels[j]._data[k]) + pSoftmax->_d0 * k];
                 assert(val > 0.0f);
                 if (val > 0.0f)
                 {
-                    ++trainingImageCounter;
-                    train_loss += -logf(val);
+                    ++validationImageCounter;
+                    validation_loss += -logf(val);
                 }
             }
             int t1, t3, t5;
             CheckAccuracy(pSoftmax, trainingLabels[j], t1, t3, t5);
             top1 += t1; top3 += t3; top5 += t5;
         }
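+        // Note: in its deterministic mode a BatchNorm layer typically computes
+        //     y = gamma * (x - runningMean) / sqrtf(runningVar + eps) + beta
+        // from running statistics accumulated by the Forward() calls above
+        // (generic textbook form; ffCudaNn's internals are not shown in this diff).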
-        if (trainingImageCounter <= 0) trainingImageCounter = 1;
-        train_loss /= trainingImageCounter;
-        if (0 == i) last_train_loss = train_loss;
-        if (train_loss < lowest_train_loss)
+        if (validationImageCounter <= 0) validationImageCounter = 1;
+        validation_loss /= validationImageCounter;
+        if (0 == i) last_validation_loss = validation_loss;
+        if (validation_loss < lowest_validation_loss)
         {
-            lowest_train_loss = train_loss;
+            lowest_validation_loss = validation_loss;
         }
-        printf("Train[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
-            trainingImageCounter,
-            train_loss, train_loss - last_train_loss, lowest_train_loss,
-            top1, top1 * 100.0f / trainingImageCounter,
+        printf("Val_[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+            validationImageCounter,
+            validation_loss, validation_loss - last_validation_loss, lowest_validation_loss,
+            top1, top1 * 100.0f / validationImageCounter,
             top3,
             top5);
-        last_train_loss = train_loss;
+        last_validation_loss = validation_loss;
 
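+        // The value printed above is the mean negative log-likelihood of the
+        // true class: validation_loss = -(1/N) * sum_k logf(softmax_k[label_k]).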
         // Test loss
         {
@@ -348,7 +400,7 @@ int cifar10()
             {
                 lowest_test_loss = test_loss;
             }
-            printf("Test_[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+            printf("Test[%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
                 testCounter,
                 test_loss, test_loss - last_test_loss, lowest_test_loss,
                 top1, top1 * 100.0f / testCounter,