-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeeder.py
154 lines (126 loc) · 5.18 KB
/
feeder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
'''
Created on 13 Sep 2017
@author: pingshiyu
'''
import pandas as pd, numpy as np
from PIL import Image
'''
Feeder object which will provide interface to batch, get the testing set,
validation set etc - methods necessary to train a neural network.
The input data will be in a csv file, with each datapoint (a flattened 1d image)
represented in rows. The last entry in each row will be the data's class.
i.e. [flattened image vector, class]
The separator used in the .CSV should be a space (' ') character. Though
this can be easily changed in _reload_data() and _make_test_validation_set()
'''
class Feeder():
    '''
    Streams a space-separated CSV of flattened images (last column = class
    label), serving shuffled minibatches plus fixed test/validation splits
    taken from the first ``test_validation_size`` rows of the file.
    '''
    def __init__(self,
                 file_path,
                 shuffle = True,
                 classes = 24,
                 test_validation_size = 10000,
                 _chunksize = 10000,
                 one_hot = False):
        '''
        file_path: path to the space-separated CSV of [features..., label] rows
        shuffle: shuffle each chunk in-place before serving batches
        classes: number of distinct label values (width of one-hot vectors)
        test_validation_size: rows reserved off the top of the file, split
            half/half into self.test and self.validation
        _chunksize: rows read per pandas chunk while streaming the file
        one_hot: encode labels as one-hot vectors instead of integers
        '''
        self._filepath = file_path
        self._chunksize = _chunksize
        self._test_validation_size = test_validation_size
        # Configuration must be assigned *before* building the held-out
        # sets: _make_test_validation_set and _to_one_hot read these
        # attributes. (Previously they were assigned afterwards, so the
        # one_hot flag was silently ignored for test/validation data.)
        self.classes = classes
        self._shuffle = shuffle
        self.one_hot = one_hot
        # Open the streaming iterator and carve out the held-out sets
        self._reload_data()
        self._make_test_validation_set(test_validation_size)
        # Current in-memory chunk of up to ``_chunksize`` rows
        self._chunk = self._next_chunk()
        # Read position within the current chunk
        self._chunk_index = 0

    def _next_chunk(self):
        '''
        Return the next chunk of the CSV as a numpy array of raw rows,
        reloading the file when an epoch ends. Shuffles the chunk in-place
        when self._shuffle is set, and records its row count in
        self._curr_chunksize.
        '''
        next_chunk = next(self._data_iter, pd.DataFrame())
        if next_chunk.empty:
            # Iterator exhausted: start a new epoch from the top of the file
            self._reload_data()
            next_chunk = next(self._data_iter)
        # ``raw_chunk`` is the numpy representation of the csv rows
        raw_chunk = next_chunk.values
        if self._shuffle:
            # In-place row shuffle within the chunk (not across the file)
            np.random.shuffle(raw_chunk)
        self._curr_chunksize = raw_chunk.shape[0]
        return raw_chunk

    def _reload_data(self):
        '''
        (Re)open the CSV as a chunked iterator, skipping the rows reserved
        for the test/validation sets.
        '''
        print('Epoch complete, reloading data...')
        # NOTE(review): header='infer' is left at pandas' default, so the
        # first non-skipped line is consumed as a header row each epoch --
        # confirm the data files carry a header; otherwise header=None is
        # needed here and in _make_test_validation_set.
        self._data_iter = pd.read_csv(self._filepath,
                                      sep = ' ',
                                      chunksize = self._chunksize,
                                      memory_map = True,
                                      skiprows = self._test_validation_size
                                      )
        print('Data reloaded!')

    def _make_test_validation_set(self, size):
        '''
        Build self.test and self.validation from the first ``size`` rows of
        the file (half each). Each is a (features, labels) tuple; labels
        honour self.one_hot.
        '''
        test_validation_data = pd.read_csv(self._filepath,
                                           sep = ' ',
                                           memory_map = True,
                                           nrows = size).values
        # Forward the one_hot flag so held-out labels use the same encoding
        # as training batches (previously hard-wired to integer labels).
        self.test = self._format_data(test_validation_data[:(size//2)],
                                      self.one_hot)
        self.validation = self._format_data(test_validation_data[(size//2):],
                                            self.one_hot)

    def _format_data(self, raw_data, one_hot = False):
        '''
        Split a raw 2d array of [features..., label] rows into a
        (data, labels) tuple; labels are one-hot encoded when requested.
        '''
        if one_hot:
            labels = self._to_one_hot(raw_data[:, -1])
        else:
            labels = raw_data[:, -1]
        data = raw_data[:, :-1]
        return data, labels

    def _to_one_hot(self, labels):
        '''
        Convert a 1d array of integer class labels to a 2d one-hot array of
        shape (len(labels), self.classes).
        '''
        # Vectorised row-indexing of the identity matrix replaces the
        # per-label Python loop; the int cast guards against float labels
        # coming back from pandas.
        idx = np.asarray(labels, dtype = np.int64)
        return np.eye(self.classes)[idx].reshape(-1, self.classes)

    def next_batch(self, size = 128):
        '''
        Return an (xs, ys) minibatch of ``size`` datapoints, spilling into
        the next chunk (and, at file end, the next epoch) when the current
        chunk runs out. Assumes size <= chunk size, so at most one spill
        is needed per batch.
        '''
        i = self._chunk_index
        if (i + size) <= self._curr_chunksize:
            xs, ys = self._format_data(self._chunk[i:(i + size)], self.one_hot)
            self._chunk_index += size
        else:  # batch straddles a chunk boundary
            # take the tail of the current chunk...
            xs_1, ys_1 = self._format_data(self._chunk[i:], self.one_hot)
            remaining = size - (self._curr_chunksize - i)
            self._chunk = self._next_chunk()
            # ...and the head of the next one
            xs_2, ys_2 = self._format_data(self._chunk[0:remaining], self.one_hot)
            self._chunk_index = remaining
            xs = np.concatenate((xs_1, xs_2), axis = 0)
            ys = np.concatenate((ys_1, ys_2), axis = 0)
        return xs, ys
if __name__ == '__main__':
    # Smoke test: stream 10000 batches of 500 rows and report their shapes.
    demo_feeder = Feeder('./data/cleanData_small.csv')
    for batch_no in range(10000):
        features, labels = demo_feeder.next_batch(500)
        print(batch_no, features.shape, labels.shape)