Skip to content

Commit a720622

Browse files
authored
extract ImageNet training '.tar' files
extract ImageNet training '.tar' files
1 parent 3fa8be4 commit a720622

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

untarFiles.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
"""
2+
untarFiles.py
3+
20220309
4+
"""
5+
import tarfile
6+
import os
7+
import scipy.io as sio
8+
from tqdm import tqdm
9+
10+
11+
def un_tar(file_name, output_root_path='../train', new_folder_name=None):
    """
    Extract every member of a '.tar' archive into output_root_path/<folder>.

    <folder> is new_folder_name when given; otherwise it is the archive's base
    name up to its first dot. The output is therefore organized as
    output_root_path/<folder>/image files. The destination folder is created
    if it does not exist yet.

    :param file_name: str, path of the tar file, must end with '.tar'.
    :param output_root_path: str, the upper folder that will contain the extracted folder.
    :param new_folder_name: str or None, name of the folder to extract into. None means the
        folder is named after the tar file.
    :return: True
    :raises AssertionError: if file_name does not exist or does not end with '.tar'.
    """
    assert os.path.exists(file_name)
    assert file_name[-4:] == '.tar'

    # The context manager closes the archive even when extraction fails
    # part-way, so the file handle is never leaked.
    with tarfile.open(file_name) as tar:
        if new_folder_name is None:
            # Default folder name: the archive's base name up to its first dot.
            new_folder_name = os.path.basename(file_name).split('.')[0]
        extract_dir = os.path.join(output_root_path, new_folder_name)
        os.makedirs(extract_dir, exist_ok=True)

        # Hand the member objects straight to extract() instead of their
        # names, so each name does not have to be resolved back to its member.
        # NOTE(review): member paths are not sanitized here; only use this on
        # archives from a trusted source.
        for member in tar.getmembers():
            tar.extract(member, extract_dir)
    return True
39+
40+
41+
def untar_train_images(train_path, output_root_path='../train', meta_data_for_name_mapping=None):
    """
    Untar all the '.tar' files directly under train_path into output_root_path.

    Each tar file is extracted into its own folder. Without meta data the folder is named
    after the tar file's 'WNID' (the file name up to its first dot); with meta data it is
    named after the 'words' entry that corresponds to that 'WNID'.
    So the output is organized in output_root_path/'WNID' or 'words'/image files.

    :param train_path: str, the path that contains the train 'tar' files named by 'WNID'.
    :param output_root_path: str, the upper folder to save the extracted folders.
    :param meta_data_for_name_mapping: str or None, full path to 'meta.mat'. This mat contains an
        array named 'synsets', and each item is a cell with fields:
        "ILSVRC2012_ID, WNID, words, gloss, num_children, children, wordnet_height, num_train_images"
        If this parameter is given, the files in each tar file under train_path are extracted to a
        folder named by the corresponding 'words'.
    :return: True
    :raises AssertionError: if train_path is not an existing directory, or the meta data path
        does not end with '.mat'.
    :raises KeyError: if meta data is given and a tar file's 'WNID' is not listed in it.
    """
    assert os.path.exists(train_path)
    assert os.path.isdir(train_path)

    # WNID -> 'words' lookup. It stays None when no meta data is supplied, so
    # the loop below can fall back to the WNID itself. (The previous check
    # referenced a variable that only existed when meta data was given and
    # raised NameError otherwise.)
    meta_data_dict = None
    if meta_data_for_name_mapping is not None:
        assert meta_data_for_name_mapping[-4:] == '.mat'
        meta_data = sio.loadmat(meta_data_for_name_mapping)['synsets']
        # Within each record, field 1 is used as the key (WNID) and field 2 as
        # the value (words), following the field order listed in the docstring.
        meta_data_dict = {entry[0][1].item(): entry[0][2].item() for entry in meta_data}

    # The context manager releases the directory iterator once the scan ends.
    with os.scandir(train_path) as entries:
        pbar = tqdm(entries)
        for tar_file in pbar:
            if not (tar_file.is_file() and tar_file.name[-4:] == '.tar'):
                continue
            wnid = tar_file.name.split('.')[0]
            new_folder_name = meta_data_dict[wnid] if meta_data_dict is not None else wnid
            destination = os.path.join(output_root_path, new_folder_name)
            os.makedirs(destination, exist_ok=True)
            pbar.set_description(f'Extracting {tar_file.path} to {destination}')
            un_tar(
                file_name=tar_file.path,
                output_root_path=output_root_path,
                new_folder_name=new_folder_name
            )
    return True
78+
79+
80+
if __name__ == '__main__':
    # Script entry point: extract the training archives using fixed local
    # Windows paths. Edit these three paths before running on another machine.
    tar_dir = r'D:\tmp_dataset'
    extracted_dir = r'D:\tmp_dataset\imagenet_train'
    meta_mat_path = r'E:\datasets\ImageNet\ILSVRC2012_devkit_t12\data\meta.mat'
    untar_train_images(
        train_path=tar_dir,
        output_root_path=extracted_dir,
        meta_data_for_name_mapping=meta_mat_path,
    )
85+
86+
87+

0 commit comments

Comments
 (0)