|
| 1 | +import tensorflow as tf |
| 2 | +import numpy as np |
| 3 | +from tensorflow import keras |
| 4 | +from keras.utils.np_utils import to_categorical |
| 5 | +import json |
| 6 | +import h5py |
| 7 | +import os |
| 8 | + |
| 9 | +############################################ |
| 10 | + |
def right_align(seq, lengths):
    """Right-align each row of `seq`, zero-padding on the left.

    Row i of the result has its last `lengths[i]` slots filled with the
    first `lengths[i]` entries of seq[i]; all other slots stay zero.
    Returns a float array of the same shape as `seq`.
    """
    aligned = np.zeros(np.shape(seq))
    width = np.shape(seq)[1]
    for row_idx in range(np.shape(seq)[0]):
        n = lengths[row_idx]
        aligned[row_idx][width - n:width] = seq[row_idx][0:n]
    return aligned
| 18 | + |
| 19 | +############################################# |
| 20 | + |
def read_data(data_img, data_prepro, data_limit):
    """Load up to `data_limit` training image features and questions.

    Args:
        data_img: path to an HDF5 file holding 'images_train'
            (per-image feature vectors; presumably fc7 features — TODO confirm).
        data_prepro: path to an HDF5 file with preprocessed question data
            ('img_pos_train', 'ques_train', 'ques_length_train', 'answers').
        data_limit: maximum number of training samples to read.

    Returns:
        (train_X, train_y) where train_X = [image_features, questions]
        with questions right-aligned, and train_y is a one-hot label matrix.
    """
    print("Reading Data...")
    # Context managers so both HDF5 handles are closed even on error
    # (the original leaked both file objects).
    with h5py.File(data_img, 'r') as img_file, h5py.File(data_prepro, 'r') as ques_data:
        img_data = np.array(img_file['images_train'])
        # img_pos_train holds 1-based indices into images_train.
        img_pos_train = ques_data['img_pos_train'][:data_limit]
        train_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_train])

        # L2-normalize each image feature vector. keepdims broadcasting
        # replaces the original tile(..., (4096, 1)) and works for any
        # feature dimension, not just 4096.
        norms = np.sqrt(np.sum(train_img_data * train_img_data, axis=1, keepdims=True))
        train_img_data = train_img_data / norms

        # Shift question padding from the right side to the left.
        ques_train = np.array(ques_data['ques_train'])[:data_limit, :]
        ques_length_train = np.array(ques_data['ques_length_train'])[:data_limit]
        ques_train = right_align(ques_train, ques_length_train)

        train_X = [train_img_data, ques_train]

        # One-hot over ALL answers first so the class count spans the full
        # label space, then slice down to the data limit. (Per the original
        # note: validation answers absent from training are labelled 1.)
        train_y = to_categorical(ques_data['answers'])[:data_limit, :]

    return train_X, train_y
| 46 | + |
| 47 | +######################################## |
| 48 | + |
def get_val_data(val_annotations_path, data_img, data_prepro, data_prepro_meta):
    """Load validation image features, questions, and answer labels.

    Args:
        val_annotations_path: path to the VQA validation annotations JSON.
        data_img: HDF5 file holding 'images_test' image features.
        data_prepro: HDF5 file with preprocessed test-question data.
        data_prepro_meta: path to the preprocessing metadata JSON.

    Returns:
        (val_X, abs_val_y, multi_val_y): val_X = [image_features, questions];
        abs_val_y is a one-hot matrix of each question's single
        'multiple_choice_answer'; multi_val_y is, per question, the list of
        label indices of all annotator answers.
    """
    metadata = get_metadata(data_prepro_meta)
    with open(val_annotations_path, 'r') as an_file:
        annotations = json.loads(an_file.read())

    # Context managers so both HDF5 handles are closed even on error
    # (the original leaked both file objects).
    with h5py.File(data_img, 'r') as img_file, h5py.File(data_prepro, 'r') as ques_data:
        img_data = np.array(img_file['images_test'])
        # img_pos_test holds 1-based indices into images_test.
        img_pos_test = ques_data['img_pos_test']
        val_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_test])

        # L2-normalize each image feature vector. keepdims broadcasting
        # replaces the original tile(..., (4096, 1)) and works for any
        # feature dimension.
        norms = np.sqrt(np.sum(val_img_data * val_img_data, axis=1, keepdims=True))
        val_img_data = val_img_data / norms

        # Shift question padding from the right side to the left.
        ques_test = np.array(ques_data['ques_test'])
        ques_length_test = np.array(ques_data['ques_length_test'])
        ques_test = right_align(ques_test, ques_length_test)

        # Map word index 12602 to 0 — per the original note, the embeddings
        # were built with the last index folded into 0 (TODO confirm against
        # the preprocessing pipeline).
        for row in ques_test:
            if 12602 in row:
                row[row == 12602] = 0

        val_X = [val_img_data, ques_test]

        # Invert ix_to_ans into answer-string -> label-index.
        ans_to_ix = {str(ans): int(i) for i, ans in metadata['ix_to_ans'].items()}
        ques_annotations = {}
        for ann in annotations['annotations']:
            idx = ans_to_ix.get(ann['multiple_choice_answer'].lower())
            # Answers outside the training label space (None) or at the
            # sentinel index 1000 are mapped to label 1.
            ann['multiple_choice_answer_idx'] = 1 if idx in [None, 1000] else idx
            ques_annotations[ann['question_id']] = ann

        # Read the question-id dataset once instead of iterating it twice.
        question_ids = np.array(ques_data['question_id_test'])

    abs_val_y = [ques_annotations[qid]['multiple_choice_answer_idx'] for qid in question_ids]
    abs_val_y = to_categorical(np.array(abs_val_y))

    multi_val_y = [
        list(set(ans_to_ix.get(a['answer'].lower()) for a in ques_annotations[qid]['answers']))
        for qid in question_ids
    ]
    for i, answers in enumerate(multi_val_y):
        multi_val_y[i] = [1 if ans in [None, 1000] else ans for ans in answers]

    return val_X, abs_val_y, multi_val_y
| 88 | + |
| 89 | +############################################### |
| 90 | + |
def get_metadata(data_prepro_meta):
    """Load the preprocessing metadata JSON.

    NOTE: despite the key name, 'ix_to_word' is inverted in place into a
    word -> int(index) mapping; the key is kept for compatibility with
    existing callers (e.g. prepare_embeddings iterates it as word, index).

    Args:
        data_prepro_meta: path to the metadata JSON file.

    Returns:
        The metadata dict with 'ix_to_word' replaced by the inverted mapping.
    """
    # Use a context manager: the original json.load(open(...)) never
    # closed the file handle.
    with open(data_prepro_meta, 'r') as meta_file:
        meta_data = json.load(meta_file)
    meta_data['ix_to_word'] = {str(word): int(i) for i, word in meta_data['ix_to_word'].items()}
    return meta_data
| 95 | + |
| 96 | +############################################### |
| 97 | + |
def prepare_embeddings(num_words, embedding_dim, metadata, glove_path, train_questions_path, embedding_matrix_filename):
    """Build (or load a cached) GloVe embedding matrix for the vocabulary.

    Args:
        num_words: number of rows in the embedding matrix.
        embedding_dim: embedding vector size (must match the GloVe file).
        metadata: dict from get_metadata(); 'ix_to_word' maps word -> index.
        glove_path: path to a GloVe text file ("word v1 v2 ..." per line).
        train_questions_path: path to the training questions JSON.
        embedding_matrix_filename: HDF5 cache file for the computed matrix.

    Returns:
        A (num_words, embedding_dim) float matrix; rows for words missing
        from GloVe stay all-zero.
    """
    # Fast path: reuse a previously built matrix.
    if os.path.exists(embedding_matrix_filename):
        # Explicit read mode; a bare h5py.File(name) call is deprecated.
        with h5py.File(embedding_matrix_filename, 'r') as f:
            return np.array(f['embedding_matrix'])

    print("Embedding Data...")
    with open(train_questions_path, 'r') as qs_file:
        questions = json.loads(qs_file.read())
    # NOTE(review): built but never used below — kept so the question file
    # is still read/validated exactly as before.
    texts = [str(q['question']) for q in questions['questions']]

    # GloVe distributions are UTF-8; be explicit so decoding does not
    # depend on the platform default encoding.
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((num_words, embedding_dim))
    # Despite its key name, this maps word -> index (see get_metadata).
    word_index = metadata['ix_to_word']
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Cache the result for the fast path above.
    with h5py.File(embedding_matrix_filename, 'w') as f:
        f.create_dataset('embedding_matrix', data=embedding_matrix)

    return embedding_matrix
| 128 | + |
| 129 | + |
0 commit comments