|
"""
 untarFiles.py
 20220309
"""
| 5 | +import tarfile |
| 6 | +import os |
| 7 | +import scipy.io as sio |
| 8 | +from tqdm import tqdm |
| 9 | + |
| 10 | + |
def un_tar(file_name, output_root_path='../train', new_folder_name=None):
    """
    Extract every member of a tar archive into output_root_path/<folder>.

    <folder> is new_folder_name when it is given; otherwise it is the archive's base name
    with the trailing '.tar' removed (so 'a.b.tar' is extracted into 'a.b').
    The destination folder is created when it does not exist yet, so the output is
    organized as output_root_path/<folder>/image files.

    NOTE(security): members are extracted exactly as stored in the archive. Only use this on
    trusted archives; a hostile tar can contain paths that escape the destination folder
    (on Python 3.12+ consider passing filter='data' to extractall).

    :param file_name: str, tar filename, end with '.tar'.
    :param output_root_path: str, the upper folder to save the extracted folder.
    :param new_folder_name: str or None, None means the extracted files will be saved in a
        folder whose name is same as the tar file's name, otherwise the folder is named
        new_folder_name.
    :return: True
    :raises AssertionError: if file_name does not exist or does not end with '.tar'.
    """
    assert os.path.exists(file_name), f'tar file not found: {file_name}'
    assert file_name[-4:] == '.tar', f"expected a '.tar' file, got: {file_name}"

    if new_folder_name is None:
        # Strip only the '.tar' suffix so dots inside the archive name are preserved.
        folder_name = os.path.basename(file_name)[:-4]
    else:
        folder_name = new_folder_name
    extract_dir = os.path.join(output_root_path, folder_name)
    os.makedirs(extract_dir, exist_ok=True)

    # The context manager closes the archive even when extraction fails part-way, and
    # extractall walks the member list once instead of looking every name up again.
    with tarfile.open(file_name) as tar:
        tar.extractall(extract_dir)
    return True
| 39 | + |
| 40 | + |
def untar_train_images(train_path, output_root_path='../train', meta_data_for_name_mapping=None):
    """
    Untar all the '.tar' files found directly in train_path into output_root_path.

    Each archive is extracted into its own folder. Without meta data the folder is named after
    the archive (its 'WNID'); when meta data is given, the folder is named with the 'words'
    entry that corresponds to that 'WNID'.
    So the output is organized in output_root_path/'WNID' or 'words'/image files.

    :param train_path: str, the path that contains the train 'tar' files named by 'WNID'.
    :param output_root_path: str, the upper folder to save the extracted folder.
    :param meta_data_for_name_mapping: str or None, fullpath to 'meta.mat'. This mat contains an
        array named 'synsets', and each item is a cell with fields:
        "ILSVRC2012_ID, WNID, words, gloss, num_children, children, wordnet_height, num_train_images".
        If this parameter is given, the files in each tar file under train_path will be
        extracted to a folder named by its corresponding 'words'.
    :return: True
    :raises AssertionError: if train_path is not an existing directory, or the meta data path
        does not end with '.mat'.
    :raises KeyError: if meta data is given but has no entry for an archive's 'WNID'.
    """
    assert os.path.exists(train_path), f'train path not found: {train_path}'
    assert os.path.isdir(train_path), f'train path is not a directory: {train_path}'

    # Stays None when no meta data is supplied, so the lookup below can be skipped safely
    # (the mapping used to be referenced even when it had never been created).
    wnid_to_words = None
    if meta_data_for_name_mapping is not None:
        assert meta_data_for_name_mapping[-4:] == '.mat', \
            f"expected a '.mat' file, got: {meta_data_for_name_mapping}"
        synsets = sio.loadmat(meta_data_for_name_mapping)['synsets']
        # Field 1 of each record is the WNID, field 2 is its 'words' description.
        wnid_to_words = {row[0][1].item(): row[0][2].item() for row in synsets}

    # The context manager releases the directory handle even if an extraction fails.
    with os.scandir(train_path) as entries:
        pbar = tqdm(entries)
        for tar_file in pbar:
            if not (tar_file.is_file() and tar_file.name[-4:] == '.tar'):
                continue
            wnid = tar_file.name[:-4]
            new_folder_name = wnid_to_words[wnid] if wnid_to_words is not None else wnid
            destination = os.path.join(output_root_path, new_folder_name)
            os.makedirs(destination, exist_ok=True)
            pbar.set_description(f'Extracting {tar_file.path} to {destination}')
            un_tar(
                file_name=tar_file.path,
                output_root_path=output_root_path,
                new_folder_name=new_folder_name
            )
    return True
| 78 | + |
| 79 | + |
if __name__ == '__main__':
    # Script entry point: the locations below are hard-coded for the author's machine.
    tar_source_dir = r'D:\tmp_dataset'
    extracted_root = r'D:\tmp_dataset\imagenet_train'
    meta_mat_path = r'E:\datasets\ImageNet\ILSVRC2012_devkit_t12\data\meta.mat'

    untar_train_images(
        train_path=tar_source_dir,
        output_root_path=extracted_root,
        meta_data_for_name_mapping=meta_mat_path,
    )
| 85 | + |
| 86 | + |
| 87 | + |
0 commit comments