-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsplit_and_summarize.py
executable file
·350 lines (277 loc) · 12.7 KB
/
split_and_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/usr/bin/env python3
"""split_and_summarize.py
Derived from: https://colab.research.google.com/drive/1aAU9sik9ymXQrA_JNS0sNmU4i0YuHu4q
"""
import os
import glob
import random
import shutil
import xml.etree.ElementTree as ET
import lxml.etree as ETT
import cv2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from config import *
### Harmonize image filenames
#
def jpeg_to_jpg_xml(path):
    """Rename every ``*.jpeg`` image to ``*.jpg`` and update the annotations to match.

    Args:
        path: Dataset directory, ending in a path separator, that contains an
            ``images/`` and an ``annotations/`` subdirectory.

    Side effects:
        Renames files in ``images/``, rewrites every ``*.xml`` file in
        ``annotations/`` in place and prints the path of each rewritten file.
    """
    old_ext, new_ext = '.jpeg', '.jpg'

    def _swap_extension(text):
        # Only a trailing '.jpeg' is an extension; the same characters in the
        # middle of a name or path are left untouched.
        if text.endswith(old_ext):
            return text[:-len(old_ext)] + new_ext
        return text

    image_dir = path + 'images/'
    for image_name in os.listdir(image_dir):
        if image_name.endswith(old_ext):
            os.rename(image_dir + image_name, image_dir + _swap_extension(image_name))

    for xml_file in glob.glob(path + 'annotations' + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for tag in ('filename', 'path'):
            node = root.find(tag)
            # A missing or empty element is skipped instead of raising
            # AttributeError on ``None``.
            if node is not None and node.text:
                node.text = _swap_extension(node.text)
        tree.write(xml_file)
        print(xml_file)
# Harmonize the full data set ('Alles/') every time the script runs.
# DATA_DIR is not defined in this file -- presumably it comes from `config`.
jpeg_to_jpg_xml(DATA_DIR + 'Alles/')
### Split files into train, test and validation set
#
def split_filenames(DATA_DIR, fold = 0.2, random_state = None):
    """Split the image/annotation paths of ``Alles`` into train, validation and test sets.

    ``fold`` is applied twice: first to hold out the test set from all files,
    then to hold out the validation set from the remaining training files.

    Args:
        DATA_DIR: Data root ending in a path separator; must contain
            ``Alles/images``.
        fold: Fraction passed as ``test_size`` to both splits.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` keeps the split random.

    Returns:
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` where the
        ``X_*`` values are DataFrames with the columns ``image`` and
        ``annotation`` and the ``y_*`` values are the matching ``annotation``
        columns.

    Raises:
        Exception: If an ``X_*`` / ``y_*`` pair differs in length.
    """
    image_dir = DATA_DIR + 'Alles/images/'
    annotation_dir = DATA_DIR + 'Alles/annotations/'
    # Sorted so that a fixed random_state gives the same split on every run;
    # os.listdir returns entries in arbitrary order.
    file_names = sorted(os.listdir(DATA_DIR + "Alles/images"))
    # Derive the annotation name from the stem so images whose extension is
    # not exactly '.jpg' still map to '<stem>.xml'.
    data_names_df = pd.DataFrame(
        {
            'image': [image_dir + name for name in file_names],
            'annotation': [annotation_dir + os.path.splitext(name)[0] + '.xml'
                           for name in file_names],
        },
        columns=['image', 'annotation'],
    )
    # split into test and train data
    X_train, X_test, y_train, y_test = train_test_split(
        data_names_df, data_names_df['annotation'],
        test_size=fold, random_state=random_state)
    # split the remaining train data into validation and train data
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, X_train['annotation'],
        test_size=fold, random_state=random_state)
    # print the directory sizes:
    print('Supposed sizes of directories (images : annotations):')
    print('Train: ' + str(X_train.shape[0]) + " : " + str(y_train.shape[0]))
    print('Test: ' + str(X_test.shape[0]) + " : " + str(y_test.shape[0]))
    print('Validation: ' + str(X_valid.shape[0]) + " : " + str(y_valid.shape[0]))
    sizes_match = (
        X_train.shape[0] == y_train.shape[0]
        and X_test.shape[0] == y_test.shape[0]
        and X_valid.shape[0] == y_valid.shape[0]
    )
    if not sizes_match:
        raise Exception('Directory sizes are not equal')
    return X_train, X_valid, X_test, y_train, y_valid, y_test
def write_df_to_csv(dir, df):
    """Write the image and annotation paths of one split to two list files.

    Creates ``DATA_DIR/<dir>/<dir>_img.txt`` and ``DATA_DIR/<dir>/<dir>_anno.txt``
    with one path per line, replacing any existing list files.

    Args:
        dir: Split name; one of ``'train'``, ``'test'`` or ``'validation'``.
        df: DataFrame with the columns ``image`` and ``annotation``.

    Raises:
        Exception: If ``dir`` is not a known split name.
    """
    path = DATA_DIR + dir
    # Validate before touching the file system so a bad name leaves no
    # stray directory behind.
    if dir not in ('validation', 'train', 'test'):
        raise Exception('Dir str not train, test or validation')
    file_path = path + '/' + dir
    # exist_ok: the split directory already exists on every run after the
    # first one; a plain makedirs would raise FileExistsError.
    os.makedirs(path, exist_ok=True)
    # mode='w' truncates, which replaces the former remove-then-append.
    df.image.to_csv(file_path + '_img.txt', header=None, index=None, sep=' ', mode='w')
    df.annotation.to_csv(file_path + '_anno.txt', header=None, index=None, sep=' ', mode='w')
def split_and_write(DATA_DIR, fold):
    """Split the file names and write one pair of list files per split.

    Args:
        DATA_DIR: Data root forwarded to ``split_filenames``.
        fold: Hold-out fraction forwarded to ``split_filenames``.
    """
    # Only the X frames are written; the y series repeat their annotation column.
    X_train, X_valid, X_test, _y_train, _y_valid, _y_test = split_filenames(DATA_DIR, fold)
    frames_in_write_order = (
        ('train', X_train),
        ('test', X_test),
        ('validation', X_valid),
    )
    for split_name, frame in frames_in_write_order:
        write_df_to_csv(dir=split_name, df=frame)
# Create a fresh split only when enabled. `split_and_csv_bool` is not defined
# in this file -- presumably it comes from `config`.
if split_and_csv_bool:
    split_and_write(DATA_DIR=DATA_DIR, fold=0.2)
### Write files to directories
#
def clear_img_anno(dir):
    """Reset the ``images`` and ``annotations`` directories of one split.

    Both directories under ``DATA_DIR/<dir>`` are deleted if present and then
    created empty.

    Args:
        dir: Split name; one of ``'train'``, ``'test'`` or ``'validation'``.

    Raises:
        Exception: If ``dir`` is not a known split name.
    """
    split_root = DATA_DIR + dir
    if dir not in ('validation', 'train', 'test'):
        raise Exception('Dir str not train, test or validation')

    subdirs = ('/images', '/annotations')
    # First pass: drop whatever is there.
    for sub in subdirs:
        if os.path.exists(split_root + sub):
            shutil.rmtree(split_root + sub + '/')
    # Second pass: recreate the empty directories.
    for sub in subdirs:
        if not os.path.exists(split_root + sub):
            os.makedirs(split_root + sub)
def _strip_known_extension(name, extensions):
    """Return ``name`` without its trailing extension if that is one of ``extensions``."""
    for ext in extensions:
        if name.endswith(ext):
            return name[:-len(ext)]
    return name


def check_missing_files(dir):
    """Verify that every image has an annotation and vice versa.

    Compares the file stems in ``DATA_DIR/<dir>/images/`` with those in
    ``DATA_DIR/<dir>/annotations/`` and prints both differences.

    Args:
        dir: ``'train'``, ``'test'``, ``'validation'``, ``'Alles'`` or
            ``'alles'`` (treated as ``'Alles'``).

    Raises:
        Exception: If ``dir`` is not one of the names above, or if a stem
            exists in only one of the two directories.
    """
    if dir == 'alles':
        dir = 'Alles'
    if dir not in ('validation', 'train', 'test', 'Alles'):
        raise Exception('Dir str not Alles, train, test or validation')

    # Strip only a trailing extension; removing the substring anywhere in the
    # name would mangle names that merely contain '.jpg' or '.xml'.
    image_exts = ('.jpg', '.jpeg', '.JPG', '.JPEG')
    imgs = {_strip_known_extension(w, image_exts)
            for w in os.listdir(DATA_DIR + dir + '/images/')}
    annos = {_strip_known_extension(w, ('.xml',))
             for w in os.listdir(DATA_DIR + dir + '/annotations/')}

    # Compute each difference once; sorted for stable output.
    only_images = sorted(imgs - annos)
    only_annos = sorted(annos - imgs)
    print('Files that don\'t appear in both the images and annotations directory of '
          + dir + ': (Should be two empty lists)')
    print(only_images)
    print(only_annos)
    if only_images:
        raise Exception('There exist files in images that do not exist in annotations, namely: '
                        + str(only_images))
    if only_annos:
        raise Exception('There exist files in annotations that do not exist in images, namely: '
                        + str(only_annos))
def _read_path_list(list_file):
    """Return the paths stored one per line in ``list_file``."""
    # Local import so this block does not depend on the file's import header.
    import csv
    # The list files are written by pandas with sep=' ', which quotes a path
    # that contains a space; csv.reader undoes that quoting.
    with open(list_file, newline='') as handle:
        return [row[0] for row in csv.reader(handle, delimiter=' ') if row]


def write_split_to_dir(dir, data_dir):
    """Fill the ``images`` and ``annotations`` directories of one split.

    Both directories are cleared first. Every path listed in
    ``<dir>_img.txt`` is then hard-linked into ``images/`` and every path
    listed in ``<dir>_anno.txt`` is copied into ``annotations/``. Finally the
    two directories are checked against each other.

    Args:
        dir: Split name (``'train'``, ``'test'`` or ``'validation'``).
        data_dir: Data root ending in a path separator.

    Raises:
        Exception: Propagated from ``clear_img_anno`` / ``check_missing_files``.
    """
    print(dir)
    dir_path = data_dir + dir + '/'
    IMPATH = dir_path + 'images/'
    ANNOPATH = dir_path + 'annotations/'
    IMTXT = dir_path + dir + '_img.txt'
    ANNOTXT = dir_path + dir + '_anno.txt'
    # clear out image and annotations directories
    # NOTE(review): clear_img_anno resolves the split below the global
    # DATA_DIR rather than `data_dir` -- confirm callers always pass DATA_DIR.
    clear_img_anno(dir)
    print('Files that appear in both image or annotation directory: (Should be nothing)')
    print('Files that appear in both image or test directory: (Should be nothing)')
    for folder in (IMPATH, ANNOPATH):
        for entry in sorted(os.listdir(folder)):
            print(entry)
    print('Total amount of files in images and annotations in the %s directory:' % dir)
    for folder in (IMPATH, ANNOPATH):
        print(len(os.listdir(folder)))
    # copy files to directory: images as hard links, annotations as copies
    for source in _read_path_list(IMTXT):
        try:
            os.link(source, os.path.join(IMPATH, os.path.basename(source)))
        except OSError as err:
            # Report and continue; check_missing_files below raises if an
            # image ends up without its annotation or vice versa.
            print('Could not link %s: %s' % (source, err))
    for source in _read_path_list(ANNOTXT):
        try:
            shutil.copy(source, ANNOPATH)
        except OSError as err:
            print('Could not copy %s: %s' % (source, err))
    # test if every image has an annotation
    check_missing_files(dir)
# Rebuild the three split directories from their list files when enabled.
# `reassign_csv_bool` is not defined in this file -- presumably it comes from `config`.
if reassign_csv_bool :
    write_split_to_dir('train', DATA_DIR)
    write_split_to_dir('test', DATA_DIR)
    write_split_to_dir('validation', DATA_DIR)
### Filter temporary data (and output results to see if it worked)
#
# Function that translate xml file to dataframe
def xml_to_csv(path):
    """Collect every annotated object below ``path`` into one DataFrame.

    Args:
        path: Directory containing the ``*.xml`` annotation files.

    Returns:
        DataFrame with one row per ``<object>`` element and the columns
        ``filename``, ``certificate``, ``xmin``, ``ymin``, ``xmax``, ``ymax``.
        It is empty, with the same columns, when no object is found.
    """
    column_name = ['filename', 'certificate', 'xmin', 'ymin', 'xmax', 'ymax']
    box_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    xml_list = []
    # Sorted so the row order does not depend on the order glob happens to return.
    for xml_file in sorted(glob.glob(path + '/*.xml')):
        root = ET.parse(xml_file).getroot()
        image_name = root.find('filename').text
        for member in root.findall('object'):
            # Look the elements up by tag, as the other functions in this file
            # do, so the result does not depend on the order of the children.
            box = member.find('bndbox')
            corners = tuple(int(box.find(tag).text) for tag in box_tags)
            xml_list.append((image_name, member.find('name').text) + corners)
    return pd.DataFrame(xml_list, columns=column_name)
# writes all xml files into one dataframe
def transformers_assemble(directory):
    """Gather the annotations of ``directory`` into a DataFrame and save it as CSV.

    The XML files are read from ``<directory>/annotations`` and the table is
    written next to that folder as ``<directory>/annotations.csv``.

    Args:
        directory: Directory that contains the ``annotations`` subdirectory.

    Returns:
        The DataFrame produced by ``xml_to_csv``.
    """
    annotations_dir = os.path.join(directory, 'annotations')
    frame = xml_to_csv(annotations_dir)
    csv_target = annotations_dir + '.csv'
    frame.to_csv(csv_target, index=None)
    return frame
def filter_labels_in_subset(path, labels):
    """Remove every annotated object whose label is not in ``labels``.

    Each ``*.xml`` file in ``path`` is rewritten in place.

    Args:
        path: Directory containing the annotation files.
        labels: Container of label names to keep.
    """
    for xml_file in glob.glob(path + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        # findall returns a list, so removing while looping is safe.
        for member in root.findall('object'):
            if member.find('name').text not in labels:
                root.remove(member)
        # The context manager closes the handle (it used to be leaked). UTF-8 is
        # explicit because the output carries no XML declaration and is therefore
        # read back as UTF-8.
        with open(xml_file, 'w', encoding='utf-8') as handle:
            tree.write(handle, encoding='unicode')
def filter_labels_everywhere(path, labels):
    """Apply ``filter_labels_in_subset`` to the train, test and validation annotations.

    Args:
        path: Data root ending in a path separator.
        labels: Container of label names to keep.
    """
    split_names = ('train', 'test', 'validation')
    for split_name in split_names:
        annotation_dir = path + split_name + '/annotations'
        filter_labels_in_subset(annotation_dir, labels)
# Runs unconditionally, unlike the steps guarded by a *_bool flag.
# `labels` is not defined in this file -- presumably it comes from `config`.
filter_labels_everywhere(DATA_DIR, labels)
### Balance dataset by crossing out labels
#
def draw_over_label_and_alter_xml(path, filename, label):
    """Paint over every ``label`` box in one image and drop it from the annotation.

    Each bounding box annotated with ``label`` is covered by a filled
    rectangle in colour ``(125, 0, 125)``, and the matching ``<object>`` is
    removed from the XML file. Image and XML are both overwritten in place.

    Args:
        path: Directory directly above the ``images/`` and ``annotations/``
            subdirectories, ending in a path separator.
        filename: Name of the annotation file (``*.xml``); the image is
            expected under the same stem with the extension ``.jpg``.
        label: Label whose boxes are painted over.

    Raises:
        OSError: If the image cannot be read.
    """
    image_name = path + 'images/' + os.path.splitext(filename)[0] + '.jpg'
    xml_name = path + 'annotations/' + filename
    # open picture in OpenCV
    img = cv2.imread(image_name)
    if img is None:
        # cv2.imread signals failure by returning None; stop before the
        # annotation is modified so image and XML stay consistent.
        raise OSError('Could not read image: ' + image_name)
    # read out coordinates of certificate location and draw over it
    tree = ET.parse(xml_name)
    root = tree.getroot()
    for member in root.findall('object'):
        if member.find('name').text != label:
            continue
        box = member.find('bndbox')
        pt_min = (int(box.find('xmin').text), int(box.find('ymin').text))
        pt_max = (int(box.find('xmax').text), int(box.find('ymax').text))
        img = cv2.rectangle(img=img, pt1=pt_min, pt2=pt_max, color=(125, 0, 125), thickness=-1)
        root.remove(member)
    # Context manager closes the handle (it used to be leaked); UTF-8 because
    # the output carries no XML declaration.
    with open(xml_name, 'w', encoding='utf-8') as handle:
        tree.write(handle, encoding='unicode')
    # NOTE(review): write_split_to_dir hard-links the split images to the
    # source set, so overwriting here may also alter the source image -- confirm.
    cv2.imwrite(image_name, img)
def list_only_label(path, label):
    """List the annotation files that contain at least one object labelled ``label``.

    Args:
        path: Annotations directory, ending in a path separator.
        label: Label to look for.

    Returns:
        Sorted list of annotation file names (``*.xml``, without directory).
        Each file appears once, however many matching objects it holds.
    """
    files_with_label = []
    for xml_file in sorted(glob.glob(path+'*.xml')):
        root = ET.parse(xml_file).getroot()
        has_label = any(member.find('name').text == label
                        for member in root.findall('object'))
        if has_label:
            # Use the on-disk name: the caller opens 'annotations/<name>', and
            # the <filename> field inside the XML need not match it.
            files_with_label.append(os.path.basename(xml_file))
    return files_with_label
def remove_perc_labels_in_dir(path, label, percentage_to_remove):
    """Paint over ``label`` in a random share of the files that contain it.

    Args:
        path: Directory directly above the ``images/`` and ``annotations/``
            subdirectories, ending in a path separator.
        label: Label to remove.
        percentage_to_remove: Fraction of the files returned by
            ``list_only_label`` to alter; the count is truncated to an int.
    """
    candidates = list_only_label(path + 'annotations/', label)
    sample_size = int(percentage_to_remove*len(candidates))
    for chosen in random.sample(candidates, sample_size):
        draw_over_label_and_alter_xml(path, chosen, label)
def remove_perc_everywhere(path, label, percentage_to_remove):
    """Run ``remove_perc_labels_in_dir`` on the train, test and validation splits.

    Args:
        path: Data root ending in a path separator.
        label: Label to remove.
        percentage_to_remove: Fraction forwarded unchanged for every split.
    """
    split_dirs = ('train/', 'test/', 'validation/')
    for split_dir in split_dirs:
        remove_perc_labels_in_dir(path + split_dir, label, percentage_to_remove)
# Thin out over-represented labels when enabled; each pair is (label, share of
# the files containing that label to alter).
if balance_certificates_bool:
    for label, percentage_to_remove in (('organic', 0.55), ('fairtrade', 0.61)):
        remove_perc_everywhere(path=DATA_DIR, label=label, percentage_to_remove=percentage_to_remove)
### Preprocessing
#
def prepr_dir(dir, ks = 0):
    """Replace every ``*.jpg`` in ``dir`` by the image minus its Gaussian blur.

    Args:
        dir: Directory ending in a path separator.
        ks: Gaussian kernel size used for both dimensions. With ``0`` OpenCV
            derives the kernel size from sigma.

    Raises:
        OSError: If an image cannot be read.
    """
    for file in glob.glob(dir+"*.jpg"):
        img = cv2.imread(file)
        if img is None:
            # cv2.imread signals failure by returning None.
            raise OSError('Could not read image: ' + file)
        # The third positional argument of GaussianBlur is sigmaX, not the
        # border mode. The keyword keeps the value that was always passed
        # there and makes its role explicit.
        blur = cv2.GaussianBlur(img, (ks, ks), sigmaX=cv2.BORDER_DEFAULT)
        # cv2.subtract saturates at 0; a plain `img - blur` on unsigned 8-bit
        # arrays wraps around, turning slightly darker pixels near-white.
        cv2.imwrite(file, cv2.subtract(img, blur))
def preprocess(DATA_DIR):
    """Run ``prepr_dir`` on the images of the train, test and validation splits.

    Args:
        DATA_DIR: Data root ending in a path separator.
    """
    image_dirs = [DATA_DIR + split_name + '/images/'
                  for split_name in ('train', 'test', 'validation')]
    for image_dir in image_dirs:
        prepr_dir(image_dir)
# Optional steps; both flags are not defined in this file -- presumably they
# come from `config`.
if preproces_bool:
    preprocess(DATA_DIR)
# NOTE(review): visualise_data_for_label is not defined in the visible part of
# this file -- confirm it is provided by `config` before enabling the flag.
if visualisation_bool:
    visualise_data_for_label(DATA_DIR, 'beterleven3')
### Check splits
#
# Returns for any directory how many of each certificate are present
def certificates_in_dir(dir):
    """Count the annotated objects per certificate in one data directory.

    Args:
        dir: ``'Alles'`` / ``'alles'`` for the full set, or ``'train'``,
            ``'test'`` or ``'validation'``.

    Returns:
        DataFrame indexed by certificate with a ``('filename', 'count')``
        column and a ``percentage`` column (count * 100 / number of rows).

    Raises:
        Exception: If ``dir`` is not one of the names above.
    """
    if dir in ('Alles', 'alles'):
        path = DATA_DIR + 'Alles/'
    elif dir in ('validation', 'train', 'test'):
        path = DATA_DIR + dir + '/'
    else:
        raise Exception('Split value not train, test or validation')

    annotations_df = transformers_assemble(path)
    total_rows = annotations_df.shape[0]
    classes = annotations_df['certificate'].unique()
    # NOTE: the value printed here is the shape of the annotation table
    # (one row per annotated object), not a count of files.
    print('Nr of files in ' + dir + ' directory: ' + str(annotations_df.shape))
    print('Nr of certificate types in ' + dir + ': ' + str(len(classes)))

    # count how many rows there are per certificate
    per_certificate = annotations_df[['certificate', 'filename']]
    distrib = per_certificate.groupby(['certificate']).agg(['count'])
    distrib['percentage'] = distrib['filename']['count']*100/total_rows
    return distrib
# Print the per-certificate distribution of the full set and of each split,
# separated by blank lines. Runs unconditionally.
print(certificates_in_dir('Alles'))
print('\n')
print(certificates_in_dir('train'))
print('\n')
print(certificates_in_dir('test'))
print('\n')
print(certificates_in_dir('validation'))