@@ -5,21 +5,37 @@
 #include <algorithm>
 #include <math.h>
 #include <chrono>
+#include <random>
 #include "ffCudaNn.h"
 
+namespace ff
+{
+	extern std::default_random_engine g_generator;
+}
+
 class ProfileScope
 {
 public:
-	ProfileScope(const char* msg) : _msg(msg)
+	ProfileScope(const char* msg) : _msg(msg), _delta(-1.0f)
 	{
 		_s = std::chrono::high_resolution_clock::now();
 	}
 	~ProfileScope()
 	{
-		std::chrono::duration<float> delta = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
-		printf("%s [%fs]\n", _msg, delta.count());
+		if (_delta < 0.0f)
+		{
+			EndScope();
+		}
+		printf("%s [%fs]\n", _msg, _delta);
+	}
+	void EndScope()
+	{
+		std::chrono::duration<float> delta =
+			std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - _s);
+		_delta = delta.count();
 	}
 	const char* _msg;
+	float _delta;
 	std::chrono::high_resolution_clock::time_point _s;
 };
 
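The reworked ProfileScope separates measurement from reporting: EndScope() freezes the elapsed time into _delta, and the destructor only measures as a fallback when EndScope() was never called, so a scope can time just part of its lifetime. A minimal usage sketch of that pattern, assuming the ProfileScope class above is in scope (TrainOneEpoch/RunValidation are hypothetical stand-ins):

	void TimedEpoch()
	{
		ProfileScope scope("epoch");	// clock starts here
		TrainOneEpoch();				// timed work
		scope.EndScope();				// elapsed time frozen into _delta
		RunValidation();				// excluded from the reported time
	}									// destructor prints the frozen _delta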
@@ -64,6 +80,16 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
 		nLeft -= batchSize;
 	}
 
+	std::vector<int> order(numTotalImages);
+	for (int i = 0; i < numTotalImages; ++i)
+	{
+		order[i] = i;
+	}
+	if (true == augment)
+	{
+		std::shuffle(order.begin(), order.end(), ff::g_generator);
+	}
+
 	// Data normalization
 	float mean[3] = { 0.4914f, 0.4822f, 0.4465f };
 	float std[3] = { 0.2023f, 0.1994f, 0.2010f };
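LoadCifar10 now builds an identity permutation over all images and shuffles it only when augmenting, so every reload scatters the images into different batches. A self-contained sketch of the same idea (function name and signature are illustrative, not from the repo):

	#include <algorithm>
	#include <numeric>
	#include <random>
	#include <vector>

	std::vector<int> MakeEpochOrder(int numImages, bool shuffle, std::default_random_engine& rng)
	{
		std::vector<int> order(numImages);
		std::iota(order.begin(), order.end(), 0);			// order[i] = i
		if (shuffle)
			std::shuffle(order.begin(), order.end(), rng);	// unbiased permutation
		return order;
	}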
@@ -81,18 +107,19 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
 		for (int j = 0; j < kNumImagePerFile; ++j)
 		{
 			bool bFlip = false;
-			if (true == augment && 1 == rand() % 2) bFlip = true;
-			int batchIndex = imageCounter / batchSize;
-			int elementIndex = imageCounter % batchSize;
+			if (true == augment && 1 == ff::g_generator() % 2) bFlip = true;
+			int batchIndex = order[imageCounter] / batchSize;
+			int elementIndex = order[imageCounter] % batchSize;
 			labels[batchIndex]._data[elementIndex] = static_cast<float>(*pCurr++);
+			int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
 			for (int ch = 0; ch < kNumChannel; ++ch)
 			{
 				for (int row = 0; row < 32; ++row)
 				{
 					for (int col = 0; col < 32; ++col)
 					{
-						float val = *pCurr++;
-						int index = elementIndex * kNumBytesPerChannel * kNumChannel + ch * kNumBytesPerChannel;
+						float val = static_cast<float>(*pCurr++);
+						int index = baseIndex + ch * kNumBytesPerChannel;
 						if (true == bFlip)
 						{
 							index += (row * 32 + (31 - col));
@@ -101,15 +128,14 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
 						{
 							index += (row * 32 + col);
 						}
-						images[batchIndex]._data[index] = ((val / 255.0f) - mean[ch]) / std[ch];
+						images[batchIndex]._data[index] = val / 255.0f;
 					}
 				}
 			}
 			if (true == augment)
 			{
 				int shift = 8;
 				int newSize = 32 + shift;
-				int baseIndex = elementIndex * kNumBytesPerChannel * kNumChannel;
 				buffer.resize(newSize * newSize * kNumChannel);
 				for (int ch = 0; ch < kNumChannel; ++ch)
 				{
@@ -123,8 +149,8 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
 						}
 					}
 				}
-				int rowShift = static_cast<int>(rand() % (shift+1));
-				int colShift = static_cast<int>(rand() % (shift+1));
+				int rowShift = static_cast<int>(ff::g_generator() % (shift+1));
+				int colShift = static_cast<int>(ff::g_generator() % (shift+1));
 				for (int ch = 0; ch < kNumChannel; ++ch)
 				{
 					for (int row = 0; row < 32; ++row)
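Replacing rand() with ff::g_generator keeps every augmentation decision on the one seeded engine, which makes runs reproducible from a single seed. One caveat worth noting: engine() % (shift+1) carries a slight modulo bias whenever shift+1 does not evenly divide the engine's output range; a bias-free alternative sketch using the standard distribution (illustrative, not what the diff does):

	#include <random>

	int SampleShift(std::default_random_engine& rng, int maxShift)
	{
		std::uniform_int_distribution<int> dist(0, maxShift);	// inclusive [0, maxShift]
		return dist(rng);
	}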
@@ -137,6 +163,18 @@ void LoadCifar10(int batchSize, int maxImages, bool augment, const std::vector<s
 					}
 				}
 			}
+			for (int ch = 0; ch < kNumChannel; ++ch)
+			{
+				for (int row = 0; row < 32; ++row)
+				{
+					for (int col = 0; col < 32; ++col)
+					{
+						int index = baseIndex + ch * kNumBytesPerChannel + row * 32 + col;
+						images[batchIndex]._data[index] = (images[batchIndex]._data[index] - mean[ch]) / std[ch];
+					}
+				}
+			}
+
 			++imageCounter;
 			if (imageCounter >= numTotalImages)
 				break;
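With this change normalization becomes a separate final pass: pixels are first stored as raw [0, 1] values, optionally flipped and shifted, and only then standardized, so the per-channel mean/std is applied exactly once to the image as it will actually be trained on. In isolation the pass looks like this (a sketch; NormalizeCHW is a hypothetical helper, mean/stddev are the statistics declared earlier):

	// Standardize one 3x32x32 image stored channel-major, values already in [0, 1].
	void NormalizeCHW(float* data, const float mean[3], const float stddev[3])
	{
		const int kPlane = 32 * 32;
		for (int ch = 0; ch < 3; ++ch)
			for (int px = 0; px < kPlane; ++px)
				data[ch * kPlane + px] = (data[ch * kPlane + px] - mean[ch]) / stddev[ch];
	}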
@@ -183,7 +221,11 @@ int ComputeLoss(ff::CudaNn& nn, std::vector<ff::CudaTensor>& images, std::vector
 
 int cifar10()
 {
+	// Note(dongwook): Hyper-parameters
+	const bool augmentDataSet = false;
 	const int kBatchSize = 100;
+	const int kDataSetScalerInv = 1;
+	float learningRate = 0.001f;
 
 	std::vector<std::string> trainingDataFilenames;
 	trainingDataFilenames.push_back("cifar-10/data_batch_1.bin");
@@ -193,12 +235,12 @@ int cifar10()
 	trainingDataFilenames.push_back("cifar-10/data_batch_5.bin");
 	std::vector<ff::CudaTensor> trainingImages;
 	std::vector<ff::CudaTensor> trainingLabels;
-	LoadCifar10(kBatchSize, 5000, false, trainingDataFilenames, trainingImages, trainingLabels);
+	LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, false, trainingDataFilenames, trainingImages, trainingLabels);
 	std::vector<std::string> testDataFilenames;
 	testDataFilenames.push_back("cifar-10/test_batch.bin");
 	std::vector<ff::CudaTensor> testImages;
 	std::vector<ff::CudaTensor> testLabels;
-	LoadCifar10(kBatchSize, 1000, false, testDataFilenames, testImages, testLabels);
+	LoadCifar10(kBatchSize, 10000 / kDataSetScalerInv, false, testDataFilenames, testImages, testLabels);
 
 #if 1
 	ff::CudaNn nn;
@@ -224,9 +266,9 @@ int cifar10()
 	nn.AddBatchNorm2d(256);
 	nn.AddRelu();
 	nn.AddMaxPool();
-	nn.AddFc(4 * 256, 1024);
+	nn.AddFc(4 * 256, 1000);
 	nn.AddRelu();
-	nn.AddFc(1024, 10);
+	nn.AddFc(1000, 10);
 	nn.AddSoftmax();
 #else
 	ff::CudaNn nn;
@@ -271,71 +313,81 @@ int cifar10()
 	nn.AddSoftmax();
 #endif
 
-	float learningRate = 0.001f;
-
-	float last_train_loss = 0.0f;
-	float lowest_train_loss = 1e8f;
+	float last_validation_loss = 0.0f;
+	float lowest_validation_loss = 1e8f;
 	float last_test_loss = 0.0f;
 	float lowest_test_loss = 1e8f;
-	const int numBatch = (int)trainingImages.size();
-	const int kNumEpoch = 200;
+	const int kNumEpoch = 100;
 	for (int i = 0; i < kNumEpoch; ++i)
 	{
 		float currLearningRate = learningRate;
 
 		// gradual decay
 		// const float kDecay = 0.2f;
-		// const int kCooldown = 3;
+		// const int kCooldown = 24;
 		// if (i >= kCooldown)
 		// {
 		//		currLearningRate *= expf(-1.0f * kDecay * (i - kCooldown));
 		// }
 
+		if (true == augmentDataSet && 0 != i)
+		{
+			LoadCifar10(kBatchSize, 50000 / kDataSetScalerInv, true, trainingDataFilenames, trainingImages, trainingLabels);
+		}
+
 		char buffer[2048];
 		sprintf(buffer, "-- Epoch %03d(lr: %f)", i + 1, currLearningRate);
 		ProfileScope __m(buffer);
 
 		// Training
-		int trainingImageCounter = 0;
-		float train_loss = 0.0f;
-		int top1 = 0, top3 = 0, top5 = 0;
+		const int numBatch = (int)trainingImages.size();
 		for (int j = 0; j < numBatch; ++j)
 		{
-			const ff::CudaTensor* pSoftmax = nullptr;
-			pSoftmax = nn.Forward(&trainingImages[j], true);
+			nn.Forward(&trainingImages[j], true);
 			nn.Backward(&trainingLabels[j]);
 			nn.UpdateWs(currLearningRate);
+		}
+		__m.EndScope();
 
-			// train loss
+		// Validation loss
+		int validationImageCounter = 0;
+		float validation_loss = 0.0f;
+		int top1 = 0, top3 = 0, top5 = 0;
+		for (int j = 0; j < numBatch / 10; ++j)
+		{
+			// Note(dongwook): You should call Forward() several times after training if BatchNorm layers exist.
+			// In the subsequent calls, mean and variance parameters are set to make the network deterministic.
+			const ff::CudaTensor* pSoftmax = nullptr;
+			pSoftmax = nn.Forward(&trainingImages[j], true);
 			const_cast<ff::CudaTensor*>(pSoftmax)->PullFromGpu();
 			for (int k = 0; k < pSoftmax->_d1; ++k)
 			{
 				float val = pSoftmax->_data[static_cast<int>(trainingLabels[j]._data[k]) + pSoftmax->_d0 * k];
 				assert(val > 0.0f);
 				if (val > 0.0f)
 				{
-					++trainingImageCounter;
-					train_loss += -logf(val);
+					++validationImageCounter;
+					validation_loss += -logf(val);
 				}
 			}
 			int t1, t3, t5;
 			CheckAccuracy(pSoftmax, trainingLabels[j], t1, t3, t5);
 			top1 += t1; top3 += t3; top5 += t5;
 		}
-		if (trainingImageCounter <= 0) trainingImageCounter = 1;
-		train_loss /= trainingImageCounter;
-		if (0 == i) last_train_loss = train_loss;
-		if (train_loss < lowest_train_loss)
+		if (validationImageCounter <= 0) validationImageCounter = 1;
+		validation_loss /= validationImageCounter;
+		if (0 == i) last_validation_loss = validation_loss;
+		if (validation_loss < lowest_validation_loss)
 		{
-			lowest_train_loss = train_loss;
+			lowest_validation_loss = validation_loss;
 		}
-		printf("Train [%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
-			trainingImageCounter,
-			train_loss, train_loss - last_train_loss, lowest_train_loss,
-			top1, top1 * 100.0f / trainingImageCounter,
+		printf("Val_ [%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+			validationImageCounter,
+			validation_loss, validation_loss - last_validation_loss, lowest_validation_loss,
+			top1, top1 * 100.0f / validationImageCounter,
 			top3,
 			top5);
-		last_train_loss = train_loss;
+		last_validation_loss = validation_loss;
 
 		// Test loss
 		{
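Per the note above, the validation pass keeps calling Forward() with the training flag so the BatchNorm layers' running mean and variance settle toward the deterministic values used at inference; it also evaluates on the first tenth of the training batches rather than a held-out split. A generic sketch of the running-statistics update that this settling relies on (ffCudaNn's internals may differ; the struct, names, and momentum value here are illustrative):

	// Exponential moving average of batch statistics, the usual BatchNorm scheme.
	struct RunningStat
	{
		float mean = 0.0f;
		float var = 1.0f;
		void Update(float batchMean, float batchVar, float momentum = 0.1f)
		{
			mean = (1.0f - momentum) * mean + momentum * batchMean;
			var  = (1.0f - momentum) * var  + momentum * batchVar;
		}
	};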
@@ -348,7 +400,7 @@ int cifar10()
 			{
 				lowest_test_loss = test_loss;
 			}
-			printf("Test_ [%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
+			printf("Test [%05d](Loss: %f(%+f)/%f, Top1: %05d(%5.2f%%), Top3: %05d, Top5: %05d)\n",
 				testCounter,
 				test_loss, test_loss - last_test_loss, lowest_test_loss,
 				top1, top1 * 100.0f / testCounter,