diff --git a/ann_class2/batch_norm_tf.py b/ann_class2/batch_norm_tf.py index de25cd3b..31d9a351 100644 --- a/ann_class2/batch_norm_tf.py +++ b/ann_class2/batch_norm_tf.py @@ -4,13 +4,15 @@ # sudo pip install -U future import numpy as np -import pandas as pd +#import pandas as pd import matplotlib.pyplot as plt import tensorflow as tf from sklearn.utils import shuffle -from sklearn.model_selection import train_test_split +#from sklearn.model_selection import train_test_split from util import get_normalized_data +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() def init_weight(M1, M2): return np.random.randn(M1, M2) * np.sqrt(2.0 / M1) @@ -38,13 +40,11 @@ def forward(self, X, is_training, decay=0.9): activation = tf.matmul(X, self.W) if is_training: batch_mean, batch_var = tf.nn.moments(activation, [0]) - update_running_mean = tf.assign( - self.running_mean, - self.running_mean * decay + batch_mean * (1 - decay) + update_running_mean = self.running_mean.assign( + self.running_mean * decay + batch_mean * (1 - decay) ) - update_running_var = tf.assign( - self.running_var, - self.running_var * decay + batch_var * (1 - decay) + update_running_var = self.running_var.assign( + self.running_var * decay + batch_var * (1 - decay) ) with tf.control_dependencies([update_running_mean, update_running_var]): @@ -115,8 +115,8 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo # for train and test (prediction) # set up theano functions and variables - tfX = tf.placeholder(tf.float32, shape=(None, D), name='X') - tfY = tf.placeholder(tf.int32, shape=(None,), name='Y') + tfX = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='X') + tfY = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='Y') # for later use self.tfX = tfX @@ -131,7 +131,7 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo ) # train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost) # train_op = tf.train.RMSPropOptimizer(learning_rate, decay=0.99, momentum=0.9).minimize(cost) - train_op = tf.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cost) + train_op = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=0.9, use_nesterov=True).minimize(cost) # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) # for testing @@ -141,7 +141,7 @@ def fit(self, X, Y, Xtest, Ytest, activation=tf.nn.relu, learning_rate=1e-2, epo # accuracy = tf.reduce_mean(1.0*(tfY == tf.argmax(logits, 1))) # init the variables - self.session.run(tf.global_variables_initializer()) + self.session.run(tf.compat.v1.global_variables_initializer()) n_batches = N // batch_sz costs = [] @@ -187,7 +187,7 @@ def main(): ann = ANN([500, 300]) - session = tf.InteractiveSession() + session = tf.compat.v1.InteractiveSession() ann.set_session(session) ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True) diff --git a/ann_class2/dropout_tensorflow.py b/ann_class2/dropout_tensorflow.py index b20c44fb..c2186e57 100644 --- a/ann_class2/dropout_tensorflow.py +++ b/ann_class2/dropout_tensorflow.py @@ -13,6 +13,8 @@ from util import get_normalized_data from sklearn.utils import shuffle +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() class HiddenLayer(object): def __init__(self, M1, M2): @@ -59,8 +61,8 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch self.params += h.params # set up theano functions and variables - inputs = tf.placeholder(tf.float32, 
shape=(None, D), name='inputs') - labels = tf.placeholder(tf.int64, shape=(None,), name='labels') + inputs = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='inputs') + labels = tf.compat.v1.placeholder(tf.int64, shape=(None,), name='labels') logits = self.forward(inputs) cost = tf.reduce_mean( @@ -69,7 +71,7 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch labels=labels ) ) - train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost) + train_op = tf.compat.v1.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost) # train_op = tf.train.MomentumOptimizer(lr, momentum=mu).minimize(cost) # train_op = tf.train.AdamOptimizer(lr).minimize(cost) prediction = self.predict(inputs) @@ -85,8 +87,8 @@ def fit(self, X, Y, Xvalid, Yvalid, lr=1e-4, mu=0.9, decay=0.9, epochs=15, batch n_batches = N // batch_sz costs = [] - init = tf.global_variables_initializer() - with tf.Session() as session: + init = tf.compat.v1.global_variables_initializer() + with tf.compat.v1.Session() as session: session.run(init) for i in range(epochs): print("epoch:", i, "n_batches:", n_batches) diff --git a/ann_class2/keras_functional.py b/ann_class2/keras_functional.py index 265d3f9b..14e5e955 100644 --- a/ann_class2/keras_functional.py +++ b/ann_class2/keras_functional.py @@ -5,8 +5,8 @@ # Note: you may need to update your version of future # sudo pip install -U future -from keras.models import Model -from keras.layers import Dense, Input +from tensorflow.keras.models import Model #type: ignore +from tensorflow.keras.layers import Dense, Input #type: ignore from util import get_normalized_data, y2indicator import matplotlib.pyplot as plt diff --git a/ann_class2/pytorch_batchnorm.py b/ann_class2/pytorch_batchnorm.py index 766dc805..c3fb30ad 100644 --- a/ann_class2/pytorch_batchnorm.py +++ b/ann_class2/pytorch_batchnorm.py @@ -37,9 +37,11 @@ model.add_module("dense1", torch.nn.Linear(D, 500)) model.add_module("bn1", torch.nn.BatchNorm1d(500)) model.add_module("relu1", torch.nn.ReLU()) +model.add_module("dropout1", torch.nn.Dropout(p=0.2)) model.add_module("dense2", torch.nn.Linear(500, 300)) model.add_module("bn2", torch.nn.BatchNorm1d(300)) model.add_module("relu2", torch.nn.ReLU()) +model.add_module("dropout2", torch.nn.Dropout(p=0.2)) model.add_module("dense3", torch.nn.Linear(300, K)) # Note: no final softmax! 
# just like Tensorflow, it's included in cross-entropy function diff --git a/ann_class2/tensorflow2.py b/ann_class2/tensorflow2.py index a07f0104..00bd7746 100644 --- a/ann_class2/tensorflow2.py +++ b/ann_class2/tensorflow2.py @@ -12,11 +12,12 @@ import numpy as np import tensorflow as tf - import matplotlib.pyplot as plt - from util import get_normalized_data, y2indicator +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() + def error_rate(p, t): return np.mean(p != t) @@ -31,7 +32,7 @@ def main(): print_period = 50 lr = 0.00004 - reg = 0.01 + #reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) @@ -53,8 +54,8 @@ def main(): # define variables and expressions - X = tf.placeholder(tf.float32, shape=(None, D), name='X') - T = tf.placeholder(tf.float32, shape=(None, K), name='T') + X = tf.compat.v1.placeholder(tf.float32, shape=(None, D), name='X') + T = tf.compat.v1.placeholder(tf.float32, shape=(None, K), name='T') W1 = tf.Variable(W1_init.astype(np.float32)) b1 = tf.Variable(b1_init.astype(np.float32)) W2 = tf.Variable(W2_init.astype(np.float32)) @@ -70,19 +71,19 @@ def main(): # softmax_cross_entropy_with_logits take in the "logits" # if you wanted to know the actual output of the neural net, # you could pass "Yish" into tf.nn.softmax(logits) - cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T)) + cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T)) # we choose the optimizer but don't implement the algorithm ourselves # let's go with RMSprop, since we just learned about it. # it includes momentum! - train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost) + train_op = tf.compat.v1.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost) # we'll use this to calculate the error rate predict_op = tf.argmax(Yish, 1) costs = [] - init = tf.global_variables_initializer() - with tf.Session() as session: + init = tf.compat.v1.global_variables_initializer() + with tf.compat.v1.Session() as session: session.run(init) for i in range(max_iter): diff --git a/ann_class2/util.py b/ann_class2/util.py index 5c8ad934..20bba18d 100644 --- a/ann_class2/util.py +++ b/ann_class2/util.py @@ -15,7 +15,7 @@ import pandas as pd import matplotlib.pyplot as plt from sklearn.decomposition import PCA -from sklearn.linear_model import LogisticRegression +#from sklearn.linear_model import LogisticRegression def get_clouds(): @@ -70,14 +70,14 @@ def get_spiral(): def get_transformed_data(): print("Reading in and transforming data...") - if not os.path.exists('../large_files/train.csv'): - print('Looking for ../large_files/train.csv') + if not os.path.exists('.\\large_files\\digit-recognizer\\train.csv'): + print('Looking for .\\large_files\\digit-recognizer\\train.csv') print('You have not downloaded the data and/or not placed the files in the correct location.') print('Please get the data from: https://www.kaggle.com/c/digit-recognizer') print('Place train.csv in the folder large_files adjacent to the class folder') exit() - df = pd.read_csv('../large_files/train.csv') + df = pd.read_csv('.\\large_files\\digit-recognizer\\train.csv') data = df.values.astype(np.float32) np.random.shuffle(data) @@ -117,14 +117,14 @@ def get_transformed_data(): def get_normalized_data(): print("Reading in and transforming data...") - if not os.path.exists('../large_files/train.csv'): - print('Looking for ../large_files/train.csv') + if not 
os.path.exists('.\\large_files\\digit-recognizer\\train.csv'):
+        print('Looking for .\\large_files\\digit-recognizer\\train.csv')
print('You have not downloaded the data and/or not placed the files in the correct location.')
print('Please get the data from: https://www.kaggle.com/c/digit-recognizer')
print('Place train.csv in the folder large_files adjacent to the class folder')
exit()
-    df = pd.read_csv('../large_files/train.csv')
+    df = pd.read_csv('.\\large_files\\digit-recognizer\\train.csv')
data = df.values.astype(np.float32)
np.random.shuffle(data)
X = data[:, 1:]
diff --git a/cnn_class2/class_activation_maps.py b/cnn_class2/class_activation_maps.py
index 19033ff5..ec2af568 100644
--- a/cnn_class2/class_activation_maps.py
+++ b/cnn_class2/class_activation_maps.py
@@ -6,9 +6,9 @@
# Note: you may need to update your version of future
# sudo pip install -U future
-from keras.models import Model
-from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
-from keras.preprocessing import image
+from tensorflow.keras.models import Model #type: ignore
+from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions #type: ignore
+from tensorflow.keras.preprocessing import image #type: ignore
import numpy as np
import scipy as sp
@@ -19,10 +19,10 @@
# get the image files
-# http://www.vision.caltech.edu/Image_Datasets/Caltech101/
-# http://www.vision.caltech.edu/Image_Datasets/Caltech256/
-image_files = glob('../large_files/256_ObjectCategories/*/*.jp*g')
-image_files += glob('../large_files/101_ObjectCategories/*/*.jp*g')
+# http://www.vision.caltech.edu/datasets/Caltech101
+# http://www.vision.caltech.edu/datasets/Caltech256/
+image_files = glob('.\\large_files\\256_ObjectCategories\\*\\*.jp*g')
+image_files += glob('.\\large_files\\101_ObjectCategories\\*\\*.jp*g')
@@ -39,13 +39,13 @@
resnet.summary()
# make a model to get output before flatten
-activation_layer = resnet.get_layer('activation_49')
+activation_layer = resnet.get_layer('conv5_block3_out')
# create a model object
model = Model(inputs=resnet.input, outputs=activation_layer.output)
# get the feature map weights
-final_dense = resnet.get_layer('fc1000')
+final_dense = resnet.get_layer('predictions')
W = final_dense.get_weights()[0]
diff --git a/cnn_class2/make_limited_datasets.py b/cnn_class2/make_limited_datasets.py
index 911c4563..51b57c95 100644
--- a/cnn_class2/make_limited_datasets.py
+++ b/cnn_class2/make_limited_datasets.py
@@ -10,7 +10,7 @@ def link(src, dst):
if not os.path.exists(dst):
os.symlink(src, dst, target_is_directory=True)
-mkdir('../large_files/fruits-360-small')
+mkdir('.\\large_files\\fruits-360-small')
classes = [
@@ -24,16 +24,16 @@ def link(src, dst):
'Raspberry'
]
-train_path_from = os.path.abspath('../large_files/fruits-360/Training')
-valid_path_from = os.path.abspath('../large_files/fruits-360/Validation')
+train_path_from = os.path.abspath('.\\large_files\\fruits-360\\Training')
+valid_path_from = os.path.abspath('.\\large_files\\fruits-360\\Validation')
-train_path_to = os.path.abspath('../large_files/fruits-360-small/Training')
-valid_path_to = os.path.abspath('../large_files/fruits-360-small/Validation')
+train_path_to = os.path.abspath('.\\large_files\\fruits-360-small\\Training')
+valid_path_to = os.path.abspath('.\\large_files\\fruits-360-small\\Validation')
mkdir(train_path_to)
mkdir(valid_path_to)
for c in classes:
-  link(train_path_from + '/' + c, train_path_to + '/' + c)
-  link(valid_path_from + '/' + c, valid_path_to + '/' + c)
\ No newline at end of file
+  link(train_path_from + '\\' + c, train_path_to + '\\' + c)
+  link(valid_path_from + '\\' + c, valid_path_to + '\\' + c)
\ No newline at end of file
diff --git a/cnn_class2/siamese.py b/cnn_class2/siamese.py
index 4c43f163..1df2c634 100644
--- a/cnn_class2/siamese.py
+++ b/cnn_class2/siamese.py
@@ -4,21 +4,23 @@
# Note: you may need to update your version of future
# sudo pip install -U future
-from keras.layers import Input, Lambda, Dense, Flatten, Conv2D, BatchNormalization, Activation, MaxPooling2D
-from keras.models import Model
-from keras.preprocessing import image
+from tensorflow.keras.layers import (Input, Lambda, Dense, Flatten, Conv2D, #type: ignore
+    BatchNormalization, Activation, MaxPooling2D)
+from tensorflow.keras.models import Model #type: ignore
+from tensorflow.keras.preprocessing import image #type: ignore
-import keras.backend as K
+import tensorflow.keras.backend as K #type: ignore
import numpy as np
import matplotlib.pyplot as plt
+import tensorflow as tf
from glob import glob
from collections import Counter
# get the data from: http://vision.ucsd.edu/content/yale-face-database
-files = glob('../large_files/yalefaces/subject*')
+files = glob('.\\large_files\\yalefaces\\subject*')
# easier to randomize later
np.random.shuffle(files)
@@ -54,7 +56,7 @@ def load_img(filepath):
# all the filenames are something like 'subject13.happy'
labels = np.zeros(N)
for i, f in enumerate(files):
-  filename = f.rsplit('/', 1)[-1]
+  filename = f.rsplit('\\', 1)[-1]
subject_num = filename.split('.', 1)[0]
# subtract 1 since the filenames start from 1
@@ -229,6 +231,32 @@ def test_generator():
yield [x1, x2], y
+train_dataset = tf.data.Dataset.from_generator(
+  train_generator,
+  output_signature=(
+    (
+      tf.TensorSpec(shape=(None, *img.shape), dtype=tf.float32),  # x_batch_1
+      tf.TensorSpec(shape=(None, *img.shape), dtype=tf.float32)   # x_batch_2
+    ),
+    tf.TensorSpec(shape=(None,), dtype=tf.float32)  # y_batch
+  )
+)
+
+test_dataset = tf.data.Dataset.from_generator(
+  test_generator,
+  output_signature=(
+    (
+      tf.TensorSpec(shape=(None, *img.shape), dtype=tf.float32),  # x_batch_1
+      tf.TensorSpec(shape=(None, *img.shape), dtype=tf.float32)   # x_batch_2
+    ),
+    tf.TensorSpec(shape=(None,), dtype=tf.float32)  # y_batch
+  )
+)
+
+# the generators already yield whole batches (hence the leading None dims),
+# so don't batch again here; just prefetch for performance
+train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
+test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)
+
# build the base neural network
@@ -426,10 +454,10 @@ def get_test_accuracy(threshold=0.85):
# fit the model
r = model.fit(
-  train_generator(),
+  train_dataset,
steps_per_epoch=train_steps,
epochs=20,
-  validation_data=test_generator(),
+  validation_data=test_dataset,
validation_steps=valid_steps,
)
diff --git a/cnn_class2/style_transfer1.py b/cnn_class2/style_transfer1.py
index 421a0a19..1ec6ae0c 100644
--- a/cnn_class2/style_transfer1.py
+++ b/cnn_class2/style_transfer1.py
@@ -9,15 +9,12 @@
# In this script, we will focus on generating the content
# E.g. given an image, can we recreate the same image
-from keras.layers import Input, Lambda, Dense, Flatten
-from keras.layers import AveragePooling2D, MaxPooling2D
-from keras.layers.convolutional import Conv2D
-from keras.models import Model, Sequential
-from keras.applications.vgg16 import VGG16
-from keras.applications.vgg16 import preprocess_input
-from keras.preprocessing import image
-
-import keras.backend as K
+from tensorflow.keras.layers import AveragePooling2D, MaxPooling2D, Conv2D #type: ignore
+from tensorflow.keras.models import Model, clone_model #type: ignore
+from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input #type: ignore
+from tensorflow.keras.preprocessing import image #type: ignore
+
+import tensorflow.keras.backend as K #type: ignore
import numpy as np
import matplotlib.pyplot as plt
@@ -25,14 +22,15 @@
import tensorflow as tf
-if tf.__version__.startswith('2'):
-  tf.compat.v1.disable_eager_execution()
+#if tf.__version__.startswith('2'):
+#  tf.compat.v1.disable_eager_execution()
def VGG16_AvgPool(shape):
# we want to account for features across the entire image
# so get rid of the maxpool which throws away information
vgg = VGG16(input_shape=shape, weights='imagenet', include_top=False)
+  # clone VGG16 while swapping every max-pool for average pooling, then
+  # copy over the pretrained weights (pooling layers have no weights,
+  # so the weight lists still line up)
+  def swap_pool(layer):
+    if isinstance(layer, MaxPooling2D):
+      return AveragePooling2D(pool_size=layer.pool_size, strides=layer.strides, padding=layer.padding)
+    return layer.__class__.from_config(layer.get_config())
+  vgg_clone = clone_model(vgg, clone_function=swap_pool)
+  vgg_clone.set_weights(vgg.get_weights())
# new_model = Sequential()
# for layer in vgg.layers:
@@ -42,16 +40,17 @@ def VGG16_AvgPool(shape):
#   else:
#     new_model.add(layer)
-  i = vgg.input
-  x = i
-  for layer in vgg.layers:
-    if layer.__class__ == MaxPooling2D:
-      # replace it with average pooling
-      x = AveragePooling2D()(x)
-    else:
-      x = layer(x)
-  return Model(i, x)
+  return vgg_clone
def VGG16_AvgPool_CutOff(shape, num_convs):
# there are 13 convolutions in total
@@ -98,12 +97,13 @@ def scale_img(x):
return x
+
if __name__ == '__main__':
# open an image
# feel free to try your own
# path = '../large_files/caltech101/101_ObjectCategories/elephant/image_0002.jpg'
-  path = 'content/elephant.jpg'
+  path = '.\\cnn_class2\\content\\elephant.jpg'
img = image.load_img(path)
# convert image to array and preprocess for vgg
@@ -116,8 +116,8 @@ def scale_img(x):
shape = x.shape[1:]
# see the image
-  # plt.imshow(img)
-  # plt.show()
+  plt.imshow(img)
+  plt.show()
# make a content model
@@ -131,16 +131,17 @@ def scale_img(x):
# try to match the image
# define our loss in keras
-  loss = K.mean(K.square(target - content_model.output))
-
-  # gradients which are needed by the optimizer
-  grads = K.gradients(loss, content_model.input)
-
-  # just like theano.function
-  get_loss_and_grads = K.function(
-    inputs=[content_model.input],
-    outputs=[loss] + grads
-  )
+  #loss_layer = Lambda(lambda inputs: K.mean(K.square(inputs[0] - inputs[1])))
+  #loss = loss_layer([target, content_model.output])
+
+  def get_loss_and_grads(inputs):
+    with tf.GradientTape() as tape:
+      tape.watch(inputs)
+      # Compute the loss as the mean squared difference between target and model output
+      loss_value = K.mean(K.square(target - content_model(inputs)))
+    # Compute the gradient of the loss with respect to the inputs
+    grads_value = tape.gradient(loss_value, inputs)
+    return loss_value, grads_value
def get_loss_and_grads_wrapper(x_vec):
@@ -155,10 +156,10 @@ def get_loss_and_grads_wrapper(x_vec):
# gradient must also be a 1-D array
# and both loss and gradient must be np.float64
# will get an error otherwise
-
-    l, g = 
get_loss_and_grads([x_vec.reshape(*batch_shape)]) - return l.astype(np.float64), g.flatten().astype(np.float64) - + x_tensor = tf.convert_to_tensor(x_vec.reshape(*batch_shape), dtype=tf.float32) + l, g = get_loss_and_grads(x_tensor) + #l, g = get_loss_and_grads(x_vec.reshape(*batch_shape)) + return l.numpy().astype(np.float64), g.numpy().flatten().astype(np.float64) from datetime import datetime diff --git a/cnn_class2/style_transfer2.py b/cnn_class2/style_transfer2.py index f385ad53..fb05340e 100644 --- a/cnn_class2/style_transfer2.py +++ b/cnn_class2/style_transfer2.py @@ -11,10 +11,12 @@ # But NOT the same content. # It should capture only the essence of the style. -from keras.models import Model, Sequential -from keras.applications.vgg16 import preprocess_input -from keras.preprocessing import image -from keras.applications.vgg16 import VGG16 +from tensorflow.keras.models import Model #type: ignore +from tensorflow.keras.applications.vgg16 import preprocess_input #type: ignore +from tensorflow.keras.preprocessing import image #type: ignore +#from keras.applications.vgg16 import VGG16 +import tensorflow as tf +from tensorflow.keras.layers import Layer, Lambda #type:ignore from style_transfer1 import VGG16_AvgPool, unpreprocess, scale_img # from skimage.transform import resize @@ -23,24 +25,41 @@ import numpy as np import matplotlib.pyplot as plt -import keras.backend as K +import tensorflow.keras.backend as K #type: ignore -def gram_matrix(img): - # input is (H, W, C) (C = # feature maps) - # we first need to convert it to (C, H*W) - X = K.batch_flatten(K.permute_dimensions(img, (2, 0, 1))) +# def gram_matrix(img): +# # input is (H, W, C) (C = # feature maps) +# # we first need to convert it to (C, H*W) +# X = K.batch_flatten(K.permute_dimensions(img, (2, 0, 1))) - # now, calculate the gram matrix - # gram = XX^T / N - # the constant is not important since we'll be weighting these - G = K.dot(X, K.transpose(X)) / img.get_shape().num_elements() - return G +# # now, calculate the gram matrix +# # gram = XX^T / N +# # the constant is not important since we'll be weighting these +# G = K.dot(X, K.transpose(X))/img.get_shape().num_elements() +# return G + +class GramMatrixLayer(Layer): + def call(self, inputs): + # Input shape is expected to be (H, W, C) + # Permute dimensions to (C, H, W) + permuted_img = tf.transpose(inputs, perm=[2, 0, 1]) # (C, H, W) + + # Flatten the permuted image to (C, H*W) + flattened_img = tf.reshape(permuted_img, (tf.shape(permuted_img)[0], -1)) # (C, H*W) + + # Calculate the Gram matrix + num_elements = tf.cast(tf.reduce_prod(K.int_shape(inputs)[1:]), tf.float32) + G = K.dot(flattened_img, K.transpose(flattened_img)) / num_elements + return G + +def gram_matrix(img): + return GramMatrixLayer()(img) def style_loss(y, t): - return K.mean(K.square(gram_matrix(y) - gram_matrix(t))) + return Lambda(lambda x: K.mean(K.square(gram_matrix(x[0]) - gram_matrix(x[1]))))([y, t]) # let's generalize this and put it into a function @@ -69,7 +88,7 @@ def minimize(fn, epochs, batch_shape): if __name__ == '__main__': # try these, or pick your own! 
-  path = 'styles/starrynight.jpg'
+  path = '.\\cnn_class2\\styles\\starrynight.jpg'
# path = 'styles/flowercarrier.jpg'
# path = 'styles/monalisa.jpg'
# path = 'styles/lesdemoisellesdavignon.jpg'
@@ -82,8 +101,8 @@ def minimize(fn, epochs, batch_shape):
x = image.img_to_array(img)
# look at the image
-  # plt.imshow(x)
-  # plt.show()
+  plt.imshow(x)
+  plt.show()
# make it (1, H, W, C)
x = np.expand_dims(x, axis=0)
@@ -103,7 +122,7 @@ def minimize(fn, epochs, batch_shape):
-  # Note: need to select output at index 1, since outputs at
-  # index 0 correspond to the original vgg with maxpool
+  # Note: the cloned avg-pool model's layers each have a single output
+  # node, so we can simply take layer.output here
symbolic_conv_outputs = [
-    layer.get_output_at(1) for layer in vgg.layers \
+    vgg.get_layer(layer.name).output for layer in vgg.layers
if layer.name.endswith('conv1')
]
@@ -120,24 +139,31 @@ def minimize(fn, epochs, batch_shape):
style_layers_outputs = [K.variable(y) for y in multi_output_model.predict(x)]
# calculate the total style loss
-  loss = 0
-  for symbolic, actual in zip(symbolic_conv_outputs, style_layers_outputs):
-    # gram_matrix() expects a (H, W, C) as input
-    loss += style_loss(symbolic[0], actual[0])
-
-  grads = K.gradients(loss, multi_output_model.input)
-
-  # just like theano.function
-  get_loss_and_grads = K.function(
-    inputs=[multi_output_model.input],
-    outputs=[loss] + grads
-  )
+  def get_loss_and_grads(inputs):
+    inputs = tf.convert_to_tensor(inputs, dtype=tf.float32)  # Ensure it's a tensor
+    with tf.GradientTape() as tape:
+      tape.watch(inputs)
+      # run the multi-output model on the current image so the style losses
+      # (and their gradients) are computed eagerly
+      outputs = multi_output_model(inputs)
+      # Calculate the total style loss
+      loss_value = 0
+      for output, actual in zip(outputs, style_layers_outputs):
+        loss_value += style_loss(output[0], actual[0])
+    # Compute gradients
+    grads_value = tape.gradient(loss_value, inputs)
+    return loss_value, grads_value
def get_loss_and_grads_wrapper(x_vec):
-    l, g = get_loss_and_grads([x_vec.reshape(*batch_shape)])
-    return l.astype(np.float64), g.flatten().astype(np.float64)
-
+    # Convert the 1-D array back to the appropriate tensor shape
+    x_tensor = tf.convert_to_tensor(x_vec.reshape(*batch_shape), dtype=tf.float32)
+
+    # Get the loss and gradients
+    l, g = get_loss_and_grads(x_tensor)
+
+    # Return the loss and the gradients as required by the optimizer
+    return l.numpy().astype(np.float64), g.numpy().flatten().astype(np.float64)
final_img = minimize(get_loss_and_grads_wrapper, 10, batch_shape)
plt.imshow(scale_img(final_img))
diff --git a/cnn_class2/style_transfer3.py b/cnn_class2/style_transfer3.py
index 8f383d1d..0891ec12 100644
--- a/cnn_class2/style_transfer3.py
+++ b/cnn_class2/style_transfer3.py
@@ -13,22 +13,23 @@
# We accomplish this by balancing the content loss
# and style loss simultaneously. 
-from keras.layers import Input, Lambda, Dense, Flatten -from keras.layers import AveragePooling2D, MaxPooling2D -from keras.layers.convolutional import Conv2D -from keras.models import Model, Sequential -from keras.applications.vgg16 import VGG16 -from keras.applications.vgg16 import preprocess_input -from keras.preprocessing import image -from skimage.transform import resize - -import keras.backend as K +from tensorflow.keras.layers import Layer #type: ignore #Input, Lambda, Dense, Flatten +# from keras.layers import AveragePooling2D, MaxPooling2D +# from keras.layers.convolutional import Conv2D +from tensorflow.keras.models import Model #type: ignore +# from keras.applications.vgg16 import VGG16 +from tensorflow.keras.applications.vgg16 import preprocess_input #type: ignore +from tensorflow.keras.preprocessing import image #type: ignore +#from skimage.transform import resize + +import tensorflow.keras.backend as K #type: ignore +import tensorflow as tf import numpy as np import matplotlib.pyplot as plt -from style_transfer1 import VGG16_AvgPool, VGG16_AvgPool_CutOff, unpreprocess, scale_img -from style_transfer2 import gram_matrix, style_loss, minimize -from scipy.optimize import fmin_l_bfgs_b +from style_transfer1 import VGG16_AvgPool, scale_img +from style_transfer2 import style_loss, minimize +#from scipy.optimize import fmin_l_bfgs_b # load the content image @@ -47,7 +48,7 @@ def load_img_and_preprocess(path, shape=None): content_img = load_img_and_preprocess( # '../large_files/caltech101/101_ObjectCategories/elephant/image_0002.jpg', # 'batman.jpg', - 'content/sydney.jpg', + '.\\cnn_class2\\content\\sydney.jpg', # (225, 300), ) @@ -58,7 +59,7 @@ def load_img_and_preprocess(path, shape=None): # 'styles/starrynight.jpg', # 'styles/flowercarrier.jpg', # 'styles/monalisa.jpg', - 'styles/lesdemoisellesdavignon.jpg', + '.\\cnn_class2\\styles\\lesdemoisellesdavignon.jpg', (h, w) ) @@ -78,16 +79,16 @@ def load_img_and_preprocess(path, shape=None): # we only want 1 output # remember you can call vgg.summary() to see a list of layers # 1,2,4,5,7-9,11-13,15-17 -content_model = Model(vgg.input, vgg.layers[13].get_output_at(0)) -content_target = K.variable(content_model.predict(content_img)) +content_model = Model(vgg.input, vgg.layers[13].output) +content_target = tf.Variable(content_model.predict(content_img)) # create the style model # we want multiple outputs # we will take the same approach as in style_transfer2.py symbolic_conv_outputs = [ - layer.get_output_at(1) for layer in vgg.layers \ - if layer.name.endswith('conv1') + vgg.get_layer(layer.name).output for layer in vgg.layers + if layer.name.endswith('conv1') ] # make a big model that outputs multiple layers' outputs @@ -103,7 +104,22 @@ def load_img_and_preprocess(path, shape=None): # create the total loss which is the sum of content + style loss -loss = K.mean(K.square(content_model.output - content_target)) +#loss = K.mean(K.square(content_model.output - content_target)) + +class ContentLossLayer(Layer): + def __init__(self, content_target, **kwargs): + super(ContentLossLayer, self).__init__(**kwargs) + self.content_target = content_target + + def call(self, inputs): + return tf.reduce_mean(tf.square(inputs - self.content_target)) + +with tf.GradientTape() as tape: + # Instantiate the content loss layer + content_loss_layer = ContentLossLayer(content_target) + + # Now compute the loss + loss = content_loss_layer(content_model.output) for w, symbolic, actual in zip(style_weights, symbolic_conv_outputs, style_layers_outputs): # 
gram_matrix() expects a (H, W, C) as input @@ -113,7 +129,7 @@ def load_img_and_preprocess(path, shape=None): # once again, create the gradients and loss + grads function # note: it doesn't matter which model's input you use # they are both pointing to the same keras Input layer in memory -grads = K.gradients(loss, vgg.input) +grads = tape.gradient(loss, vgg.input) # just like theano.function get_loss_and_grads = K.function( diff --git a/cnn_class2/tf_resnet.py b/cnn_class2/tf_resnet.py index 4a3c5fa2..43129c9a 100644 --- a/cnn_class2/tf_resnet.py +++ b/cnn_class2/tf_resnet.py @@ -11,14 +11,14 @@ # compared to keras import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt import keras -from keras.applications.resnet50 import ResNet50 -from keras.models import Model -from keras.preprocessing import image -from keras.layers import Dense -from keras.applications.resnet50 import preprocess_input, decode_predictions +from tensorflow.keras.applications.resnet50 import ResNet50 +from tensorflow.keras.models import Model +#from tensorflow.keras.preprocessing import image +from tensorflow.keras.layers import Dense +#from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions from tf_resnet_convblock import ConvLayer, BatchNormLayer, ConvBlock from tf_resnet_identity_block import IdentityBlock diff --git a/cnn_class2/tf_resnet_convblock.py b/cnn_class2/tf_resnet_convblock.py index 397f160b..5619bf67 100644 --- a/cnn_class2/tf_resnet_convblock.py +++ b/cnn_class2/tf_resnet_convblock.py @@ -17,6 +17,7 @@ def init_filter(d, mi, mo, stride): class ConvLayer: def __init__(self, d, mi, mo, stride=2, padding='VALID'): + super().__init__() self.W = tf.Variable(init_filter(d, mi, mo, stride)) self.b = tf.Variable(np.zeros(mo, dtype=np.float32)) self.stride = stride @@ -50,6 +51,7 @@ def get_params(self): class BatchNormLayer: def __init__(self, D): + super().__init__() self.running_mean = tf.Variable(np.zeros(D, dtype=np.float32), trainable=False) self.running_var = tf.Variable(np.ones(D, dtype=np.float32), trainable=False) self.gamma = tf.Variable(np.ones(D, dtype=np.float32)) @@ -82,6 +84,7 @@ def get_params(self): class ConvBlock: def __init__(self, mi, fm_sizes, stride=2, activation=tf.nn.relu): + super().__init__() # conv1, conv2, conv3 # note: # feature maps shortcut = # feauture maps conv 3 assert(len(fm_sizes) == 3) diff --git a/cnn_class2/tf_resnet_first_layers.py b/cnn_class2/tf_resnet_first_layers.py index 9157b65c..188ee731 100644 --- a/cnn_class2/tf_resnet_first_layers.py +++ b/cnn_class2/tf_resnet_first_layers.py @@ -11,13 +11,13 @@ # compared to keras import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt import keras -from keras.applications.resnet50 import ResNet50 -from keras.models import Model -from keras.preprocessing import image -from keras.applications.resnet50 import preprocess_input, decode_predictions +from tensorflow.keras.applications.resnet50 import ResNet50 +from tensorflow.keras.models import Model +#from keras.preprocessing import image +#from keras.applications.resnet50 import preprocess_input, decode_predictions from tf_resnet_convblock import ConvLayer, BatchNormLayer, ConvBlock @@ -53,6 +53,7 @@ def get_params(self): class MaxPoolLayer: def __init__(self, dim): + super().__init__() self.dim = dim def forward(self, X): @@ -68,6 +69,7 @@ def get_params(self): class PartialResNet: def __init__(self): + super().__init__() self.layers = [ # 
before conv block ConvLayer(d=7, mi=3, mo=64, stride=2, padding='SAME'), diff --git a/cnn_class2/tf_resnet_identity_block.py b/cnn_class2/tf_resnet_identity_block.py index 3e30d30c..b8569f9c 100644 --- a/cnn_class2/tf_resnet_identity_block.py +++ b/cnn_class2/tf_resnet_identity_block.py @@ -8,13 +8,14 @@ import tensorflow as tf import numpy as np -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt from tf_resnet_convblock import ConvLayer, BatchNormLayer class IdentityBlock: def __init__(self, mi, fm_sizes, activation=tf.nn.relu): + super().__init__() # conv1, conv2, conv3 # note: # feature maps shortcut = # feauture maps conv 3 assert(len(fm_sizes) == 3) diff --git a/cnn_class2/use_pretrained_weights_resnet.py b/cnn_class2/use_pretrained_weights_resnet.py index 8f3aae71..41ffacce 100644 --- a/cnn_class2/use_pretrained_weights_resnet.py +++ b/cnn_class2/use_pretrained_weights_resnet.py @@ -6,12 +6,12 @@ # Note: you may need to update your version of future # sudo pip install -U future -from keras.layers import Input, Lambda, Dense, Flatten -from keras.models import Model -from keras.applications.resnet import ResNet50, preprocess_input +from tensorflow.keras.layers import Dense, Flatten +from tensorflow.keras.models import Model +from tensorflow.keras.applications.resnet import ResNet50, preprocess_input # from keras.applications.inception_v3 import InceptionV3, preprocess_input -from keras.preprocessing import image -from keras.preprocessing.image import ImageDataGenerator +from tensorflow.keras.preprocessing import image +from tensorflow.keras.preprocessing.image import ImageDataGenerator from sklearn.metrics import confusion_matrix import numpy as np @@ -32,17 +32,17 @@ # valid_path = '../large_files/blood_cell_images/TEST' # https://www.kaggle.com/moltean/fruits -# train_path = '../large_files/fruits-360/Training' -# valid_path = '../large_files/fruits-360/Validation' -train_path = '../large_files/fruits-360-small/Training' -valid_path = '../large_files/fruits-360-small/Validation' +train_path = '.\\large_files\\fruits-360\\Training' +valid_path = '.\\large_files\\fruits-360\\Validation' +#train_path = '.\\large_files\\fruits-360-small\\Training' +#valid_path = '.\\large_files\\fruits-360-small\\Validation' # useful for getting number of files -image_files = glob(train_path + '/*/*.jp*g') -valid_image_files = glob(valid_path + '/*/*.jp*g') +image_files = glob(train_path + '\\*\\*.jp*g') +valid_image_files = glob(valid_path + '\\*\\*.jp*g') # useful for getting number of classes -folders = glob(train_path + '/*') +folders = glob(train_path + '\\*') # look at an image for fun diff --git a/cnn_class2/use_pretrained_weights_vgg.py b/cnn_class2/use_pretrained_weights_vgg.py index 849dd9f6..0338507e 100644 --- a/cnn_class2/use_pretrained_weights_vgg.py +++ b/cnn_class2/use_pretrained_weights_vgg.py @@ -1,16 +1,15 @@ # https://deeplearningcourses.com/c/advanced-computer-vision # https://www.udemy.com/advanced-computer-vision from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -from keras.layers import Input, Lambda, Dense, Flatten -from keras.models import Model -from keras.applications.vgg16 import VGG16 -from keras.applications.vgg16 import preprocess_input -from keras.preprocessing import image -from keras.preprocessing.image import ImageDataGenerator +from tensorflow.keras.layers import Dense, Flatten +from 
tensorflow.keras.models import Model
+from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
+from tensorflow.keras.preprocessing import image
+from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix
import numpy as np
@@ -31,17 +30,17 @@
# valid_path = '../large_files/blood_cell_images/TEST'
# https://www.kaggle.com/moltean/fruits
-train_path = '../large_files/fruits-360/Training'
-valid_path = '../large_files/fruits-360/Validation'
-# train_path = '../large_files/fruits-360-small/Training'
-# valid_path = '../large_files/fruits-360-small/Validation'
+#train_path = '../large_files/fruits-360/Training'
+#valid_path = '../large_files/fruits-360/Validation'
+train_path = '.\\large_files\\fruits-360-small\\Training'
+valid_path = '.\\large_files\\fruits-360-small\\Validation'
-# useful for getting number of files
-image_files = glob(train_path + '/*/*.jp*g')
-valid_image_files = glob(valid_path + '/*/*.jp*g')
+# useful for getting number of files
+image_files = glob(train_path + '\\*\\*.jp*g')
+valid_image_files = glob(valid_path + '\\*\\*.jp*g')
# useful for getting number of classes
-folders = glob(train_path + '/*')
+folders = glob(train_path + '\\*')
# look at an image for fun
diff --git a/keras_examples/ann.py b/keras_examples/ann.py
index 08636b15..857fd95c 100644
--- a/keras_examples/ann.py
+++ b/keras_examples/ann.py
@@ -7,8 +7,8 @@
import matplotlib.pyplot as plt
from util import getKaggleMNIST
-from keras.models import Model
-from keras.layers import Dense, Activation, Input
+from tensorflow.keras.models import Model # type: ignore
+from tensorflow.keras.layers import Dense, Input # type: ignore
# get the data
@@ -58,8 +58,8 @@
plt.show()
# accuracies
-plt.plot(r.history['acc'], label='acc')
-plt.plot(r.history['val_acc'], label='val_acc')
+plt.plot(r.history['accuracy'], label='acc')
+plt.plot(r.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()
diff --git a/keras_examples/cnn.py b/keras_examples/cnn.py
index 088cc5b2..f0eee61f 100644
--- a/keras_examples/cnn.py
+++ b/keras_examples/cnn.py
@@ -5,14 +5,14 @@
# Note: you may need to update your version of future
# sudo pip install -U future
-from keras.models import Model
-from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Input
+from tensorflow.keras.models import Model # type: ignore
+from tensorflow.keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Input # type: ignore
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
-from util import getKaggleMNIST3D, getKaggleFashionMNIST3D, getCIFAR10
+from util import getKaggleFashionMNIST3D
# get the data
@@ -73,8 +73,8 @@
plt.show()
# accuracies
-plt.plot(r.history['acc'], label='acc')
-plt.plot(r.history['val_acc'], label='val_acc')
+plt.plot(r.history['accuracy'], label='acc')
+plt.plot(r.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()
diff --git a/keras_examples/cnn_dropout_batchnorm.py b/keras_examples/cnn_dropout_batchnorm.py
index f89cd37d..44e89f72 100644
--- a/keras_examples/cnn_dropout_batchnorm.py
+++ b/keras_examples/cnn_dropout_batchnorm.py
@@ -5,14 +5,14 @@
# Note: you may need to update your version of future
# sudo pip install -U future
-from keras.models import Sequential, Model
-from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, Input
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Dense, Activation, Conv2D, MaxPooling2D, 
Flatten, Dropout, BatchNormalization, Input import matplotlib.pyplot as plt -import pandas as pd -import numpy as np +#import pandas as pd +#import numpy as np -from util import getKaggleMNIST3D, getKaggleFashionMNIST3D, getCIFAR10 +from util import getKaggleFashionMNIST3D # get the data @@ -76,8 +76,8 @@ plt.show() # accuracies -plt.plot(r.history['acc'], label='acc') -plt.plot(r.history['val_acc'], label='val_acc') +plt.plot(r.history['accuracy'], label='acc') +plt.plot(r.history['val_accuracy'], label='val_acc') plt.legend() plt.show() diff --git a/keras_examples/util.py b/keras_examples/util.py index 2e3af106..22fa6832 100644 --- a/keras_examples/util.py +++ b/keras_examples/util.py @@ -12,12 +12,12 @@ def getKaggleMNIST(): # https://www.kaggle.com/c/digit-recognizer - return getMNISTFormat('../large_files/train.csv') + return getMNISTFormat('.\\large_files\\digit-recognizer\\train.csv') def getKaggleFashionMNIST(): # https://www.kaggle.com/zalando-research/fashionmnist - return getMNISTFormat('../large_files/fashionmnist/fashion-mnist_train.csv') + return getMNISTFormat('.\\large_files\\fashionmnist\\fashion-mnist_train.csv') def getMNISTFormat(path): # MNIST data: diff --git a/nlp_class2/bow_classifier.py b/nlp_class2/bow_classifier.py index 25588e3b..70efdde6 100644 --- a/nlp_class2/bow_classifier.py +++ b/nlp_class2/bow_classifier.py @@ -32,7 +32,7 @@ def __init__(self): word2vec = {} embedding = [] idx2word = [] - with open('../large_files/glove.6B/glove.6B.50d.txt') as f: + with open('../large_files/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f: # is just a space-separated text file in the format: # word vec[0] vec[1] vec[2] ... for line in f: diff --git a/nlp_class2/cc_matrix_50.npy b/nlp_class2/cc_matrix_50.npy new file mode 100644 index 00000000..508d3a6b Binary files /dev/null and b/nlp_class2/cc_matrix_50.npy differ diff --git a/nlp_class2/glove.py b/nlp_class2/glove.py index b46c13f2..fdd9e6fa 100644 --- a/nlp_class2/glove.py +++ b/nlp_class2/glove.py @@ -20,7 +20,7 @@ import sys sys.path.append(os.path.abspath('..')) from rnn_class.util import get_wikipedia_data -from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx +from rnn_class.brown import get_sentences_with_word2idx_limit_vocab # using ALS, what's the least # files to get correct analogies? 
# use this for word2vec training to make it faster @@ -120,7 +120,7 @@ def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100, costs = [] - sentence_indexes = range(len(sentences)) + #sentence_indexes = range(len(sentences)) for epoch in range(epochs): delta = W.dot(U.T) + b.reshape(V, 1) + c.reshape(1, V) + mu - logX cost = ( fX * delta * delta ).sum() diff --git a/nlp_class2/glove_model_50.npz b/nlp_class2/glove_model_50.npz new file mode 100644 index 00000000..56e47511 Binary files /dev/null and b/nlp_class2/glove_model_50.npz differ diff --git a/nlp_class2/glove_svd.py b/nlp_class2/glove_svd.py index a0fd3c0a..a8db4f92 100644 --- a/nlp_class2/glove_svd.py +++ b/nlp_class2/glove_svd.py @@ -14,14 +14,13 @@ from sklearn.decomposition import TruncatedSVD from datetime import datetime -from sklearn.utils import shuffle from util import find_analogies import sys sys.path.append(os.path.abspath('..')) from rnn_class.util import get_wikipedia_data -from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx +from rnn_class.brown import get_sentences_with_word2idx_limit_vocab class Glove: diff --git a/nlp_class2/glove_tf.py b/nlp_class2/glove_tf.py index 9db18bb4..8986ad94 100644 --- a/nlp_class2/glove_tf.py +++ b/nlp_class2/glove_tf.py @@ -14,13 +14,13 @@ import matplotlib.pyplot as plt from datetime import datetime -from sklearn.utils import shuffle +#from sklearn.utils import shuffle from util import find_analogies import sys sys.path.append(os.path.abspath('..')) from rnn_class.util import get_wikipedia_data -from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx +from rnn_class.brown import get_sentences_with_word2idx_limit_vocab if tf.__version__.startswith('2'): tf.compat.v1.disable_eager_execution() @@ -141,7 +141,7 @@ def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100, session.run(init) costs = [] - sentence_indexes = range(len(sentences)) + #sentence_indexes = range(len(sentences)) for epoch in range(epochs): c, _ = session.run((cost, train_op), feed_dict={tfLogX: logX, tffX: fX}) print("epoch:", epoch, "cost:", c) @@ -190,7 +190,7 @@ def main(we_file, w2i_file, use_brown=True, n_files=50): V = len(word2idx) model = Glove(100, V, 10) - model.fit(sentences, cc_matrix=cc_matrix, epochs=200) + model.fit(sentences, cc_matrix=cc_matrix, epochs=10000) model.save(we_file) diff --git a/nlp_class2/glove_word2idx_50.json b/nlp_class2/glove_word2idx_50.json new file mode 100644 index 00000000..a3142b0f --- /dev/null +++ b/nlp_class2/glove_word2idx_50.json @@ -0,0 +1 @@ +{"START": 0, "END": 1, "the": 2, "of": 3, "and": 4, "in": 5, "to": 6, "a": 7, "as": 8, "is": 9, "was": 10, "for": 11, "that": 12, "by": 13, "with": 14, "on": 15, "from": 16, "his": 17, "are": 18, "it": 19, "an": 20, "at": 21, "he": 22, "or": 23, "which": 24, "be": 25, "were": 26, "this": 27, "not": 28, "have": 29, "also": 30, "had": 31, "their": 32, "has": 33, "its": 34, "but": 35, "one": 36, "first": 37, "other": 38, "they": 39, "been": 40, "such": 41, "after": 42, "who": 43, "more": 44, "new": 45, "some": 46, "most": 47, "used": 48, "can": 49, "into": 50, "two": 51, "all": 52, "when": 53, "during": 54, "there": 55, "these": 56, "may": 57, "many": 58, "than": 59, "time": 60, "between": 61, "would": 62, "only": 63, "over": 64, "while": 65, "states": 66, "about": 67, "years": 68, "world": 69, "her": 70, "later": 71, "known": 72, "no": 73, "use": 74, "war": 75, "people": 76, "however": 77, "both": 78, 
"including": 79, "united": 80, "where": 81, "made": 82, "became": 83, "him": 84, "being": 85, "city": 86, "american": 87, "under": 88, "through": 89, "century": 90, "called": 91, "early": 92, "state": 93, "since": 94, "them": 95, "system": 96, "then": 97, "three": 98, "up": 99, "government": 100, "part": 101, "number": 102, "if": 103, "out": 104, "well": 105, "often": 106, "several": 107, "because": 108, "any": 109, "work": 110, "before": 111, "i": 112, "national": 113, "she": 114, "so": 115, "against": 116, "each": 117, "could": 118, "same": 119, "year": 120, "us": 121, "film": 122, "although": 123, "until": 124, "found": 125, "second": 126, "form": 127, "according": 128, "following": 129, "example": 130, "will": 131, "around": 132, "british": 133, "include": 134, "like": 135, "name": 136, "those": 137, "different": 138, "due": 139, "did": 140, "english": 141, "among": 142, "began": 143, "major": 144, "within": 145, "another": 146, "life": 147, "large": 148, "high": 149, "based": 150, "french": 151, "series": 152, "even": 153, "language": 154, "general": 155, "group": 156, "international": 157, "much": 158, "using": 159, "population": 160, "north": 161, "power": 162, "music": 163, "south": 164, "modern": 165, "set": 166, "four": 167, "end": 168, "country": 169, "period": 170, "common": 171, "political": 172, "public": 173, "area": 174, "university": 175, "military": 176, "million": 177, "own": 178, "led": 179, "german": 180, "members": 181, "now": 182, "death": 183, "what": 184, "\u2013": 185, "1": 186, "church": 187, "history": 188, "very": 189, "party": 190, "de": 191, "still": 192, "john": 193, "great": 194, "considered": 195, "said": 196, "law": 197, "european": 198, "small": 199, "book": 200, "order": 201, "published": 202, "king": 203, "do": 204, "late": 205, "day": 206, "development": 207, "family": 208, "support": 209, "president": 210, "water": 211, "important": 212, "various": 213, "along": 214, "without": 215, "central": 216, "categories": 217, "developed": 218, "though": 219, "school": 220, "countries": 221, "control": 222, "east": 223, "human": 224, "army": 225, "west": 226, "took": 227, "place": 228, "long": 229, "term": 230, "wrote": 231, "home": 232, "included": 233, "become": 234, "times": 235, "game": 236, "established": 237, "main": 238, "given": 239, "way": 240, "local": 241, "island": 242, "theory": 243, "last": 244, "union": 245, "house": 246, "usually": 247, "age": 248, "similar": 249, "europe": 250, "held": 251, "make": 252, "force": 253, "western": 254, "back": 255, "production": 256, "ii": 257, "left": 258, "systems": 259, "less": 260, "company": 261, "air": 262, "released": 263, "popular": 264, "forces": 265, "social": 266, "roman": 267, "having": 268, "old": 269, "others": 270, "named": 271, "economic": 272, "further": 273, "groups": 274, "empire": 275, "films": 276, "original": 277, "result": 278, "region": 279, "few": 280, "thus": 281, "largest": 282, "point": 283, "role": 284, "court": 285, "case": 286, "former": 287, "described": 288, "team": 289, "march": 290, "works": 291, "written": 292, "land": 293, "five": 294, "process": 295, "service": 296, "languages": 297, "january": 298, "areas": 299, "river": 300, "produced": 301, "per": 302, "single": 303, "games": 304, "research": 305, "june": 306, "july": 307, "sometimes": 308, "december": 309, "came": 310, "2": 311, "rather": 312, "created": 313, "october": 314, "line": 315, "women": 316, "data": 317, "field": 318, "generally": 319, "does": 320, "continued": 321, "down": 322, "york": 323, "september": 324, 
"islands": 325, "received": 326, "how": 327, "should": 328, "black": 329, "france": 330, "england": 331, "played": 332, "greek": 333, "november": 334, "either": 335, "must": 336, "germany": 337, "show": 338, "april": 339, "god": 340, "best": 341, "season": 342, "total": 343, "species": 344, "see": 345, "third": 346, "kingdom": 347, "council": 348, "soviet": 349, "especially": 350, "just": 351, "science": 352, "art": 353, "study": 354, "word": 355, "10": 356, "member": 357, "august": 358, "every": 359, "free": 360, "days": 361, "light": 362, "instead": 363, "won": 364, "body": 365, "act": 366, "sea": 367, "trade": 368, "son": 369, "space": 370, "throughout": 371, "men": 372, "died": 373, "children": 374, "society": 375, "near": 376, "foreign": 377, "we": 378, "london": 379, "significant": 380, "information": 381, "version": 382, "built": 383, "energy": 384, "take": 385, "possible": 386, "northern": 387, "standard": 388, "ancient": 389, "christian": 390, "white": 391, "next": 392, "final": 393, "natural": 394, "despite": 395, "himself": 396, "addition": 397, "again": 398, "league": 399, "introduced": 400, "bc": 401, "design": 402, "upon": 403, "man": 404, "making": 405, "never": 406, "rights": 407, "movement": 408, "right": 409, "position": 410, "least": 411, "eastern": 412, "february": 413, "india": 414, "seen": 415, "traditional": 416, "battle": 417, "southern": 418, "change": 419, "education": 420, "parts": 421, "religious": 422, "terms": 423, "play": 424, "formed": 425, "followed": 426, "almost": 427, "america": 428, "influence": 429, "china": 430, "republic": 431, "once": 432, "together": 433, "3": 434, "father": 435, "culture": 436, "royal": 437, "across": 438, "evidence": 439, "television": 440, "six": 441, "chinese": 442, "civil": 443, "higher": 444, "forms": 445, "little": 446, "off": 447, "20": 448, "elements": 449, "certain": 450, "middle": 451, "office": 452, "community": 453, "level": 454, "red": 455, "means": 456, "range": 457, "available": 458, "type": 459, "increased": 460, "lost": 461, "album": 462, "official": 463, "side": 464, "15": 465, "program": 466, "particularly": 467, "numbers": 468, "short": 469, "remained": 470, "young": 471, "itself": 472, "born": 473, "above": 474, "band": 475, "computer": 476, "lower": 477, "special": 478, "present": 479, "nations": 480, "record": 481, "model": 482, "associated": 483, "head": 484, "rule": 485, "thought": 486, "2010": 487, "earth": 488, "particular": 489, "eventually": 490, "low": 491, "latin": 492, "center": 493, "rate": 494, "japanese": 495, "jewish": 496, "college": 497, "good": 498, "services": 499, "words": 500, "minister": 501, "capital": 502, "whose": 503, "2011": 504, "2020": 505, "writers": 506, "character": 507, "leading": 508, "you": 509, "story": 510, "believed": 511, "4": 512, "cities": 513, "5": 514, "announced": 515, "referred": 516, "2021": 517, "allowed": 518, "taken": 519, "located": 520, "building": 521, "went": 522, "typically": 523, "structure": 524, "2022": 525, "moved": 526, "beginning": 527, "africa": 528, "male": 529, "months": 530, "market": 531, "spanish": 532, "meaning": 533, "provided": 534, "source": 535, "food": 536, "12": 537, "writing": 538, "nature": 539, "industry": 540, "st": 541, "living": 542, "project": 543, "2023": 544, "function": 545, "live": 546, "italian": 547, "half": 548, "able": 549, "cases": 550, "effect": 551, "gave": 552, "provide": 553, "style": 554, "current": 555, "appeared": 556, "required": 557, "top": 558, "2008": 559, "served": 560, "radio": 561, "health": 562, "saw": 563, 
"started": 564, "value": 565, "title": 566, "related": 567, "election": 568, "economy": 569, "2012": 570, "aircraft": 571, "books": 572, "town": 573, "includes": 574, "strong": 575, "william": 576, "companies": 577, "lead": 578, "open": 579, "stated": 580, "average": 581, "2000": 582, "network": 583, "events": 584, "emperor": 585, "today": 586, "c": 587, "players": 588, "majority": 589, "far": 590, "outside": 591, "policy": 592, "full": 593, "view": 594, "2009": 595, "mass": 596, "complex": 597, "30": 598, "return": 599, "returned": 600, "sent": 601, "increase": 602, "2007": 603, "independent": 604, "working": 605, "person": 606, "practice": 607, "limited": 608, "rock": 609, "2019": 610, "legal": 611, "2015": 612, "russian": 613, "brought": 614, "founded": 615, "caused": 616, "features": 617, "close": 618, "individual": 619, "private": 620, "technology": 621, "characters": 622, "earlier": 623, "reported": 624, "size": 625, "indian": 626, "james": 627, "whether": 628, "business": 629, "award": 630, "might": 631, "catholic": 632, "2014": 633, "designed": 634, "class": 635, "cultural": 636, "material": 637, "2016": 638, "commonly": 639, "changes": 640, "primary": 641, "action": 642, "recorded": 643, "proposed": 644, "growth": 645, "2017": 646, "therefore": 647, "studies": 648, "types": 649, "schools": 650, "widely": 651, "japan": 652, "specific": 653, "larger": 654, "too": 655, "2013": 656, "2018": 657, "prime": 658, "cause": 659, "code": 660, "themselves": 661, "subject": 662, "mostly": 663, "african": 664, "2006": 665, "charles": 666, "interest": 667, "historical": 668, "surface": 669, "club": 670, "territory": 671, "run": 672, "11": 673, "video": 674, "base": 675, "25": 676, "media": 677, "seven": 678, "uses": 679, "physical": 680, "software": 681, "effects": 682, "students": 683, "canada": 684, "success": 685, "greater": 686, "parliament": 687, "originally": 688, "performance": 689, "names": 690, "help": 691, "away": 692, "always": 693, "defined": 694, "list": 695, "future": 696, "100": 697, "produce": 698, "likely": 699, "italy": 700, "approximately": 701, "sound": 702, "federal": 703, "county": 704, "replaced": 705, "6": 706, "david": 707, "billion": 708, "song": 709, "recent": 710, "coast": 711, "key": 712, "involved": 713, "added": 714, "george": 715, "release": 716, "mother": 717, "b": 718, "elected": 719, "via": 720, "eg": 721, "personal": 722, "below": 723, "conditions": 724, "regions": 725, "security": 726, "construction": 727, "2005": 728, "sources": 729, "numerous": 730, "concept": 731, "britain": 732, "come": 733, "division": 734, "records": 735, "attack": 736, "soon": 737, "19th": 738, "idea": 739, "killed": 740, "access": 741, "uk": 742, "longer": 743, "largely": 744, "hand": 745, "fact": 746, "successful": 747, "supported": 748, "star": 749, "remains": 750, "units": 751, "lines": 752, "real": 753, "park": 754, "love": 755, "site": 756, "my": 757, "directly": 758, "create": 759, "results": 760, "scholars": 761, "multiple": 762, "adopted": 763, "metal": 764, "already": 765, "henry": 766, "classical": 767, "association": 768, "reached": 769, "commercial": 770, "put": 771, "method": 772, "variety": 773, "whom": 774, "20th": 775, "independence": 776, "problems": 777, "towards": 778, "direct": 779, "football": 780, "native": 781, "wife": 782, "sold": 783, "need": 784, "la": 785, "tradition": 786, "points": 787, "initially": 788, "8": 789, "era": 790, "claimed": 791, "organization": 792, "medical": 793, "authority": 794, "18": 795, "shows": 796, "products": 797, "scientific": 798, 
"16": 799, "asia": 800, "14": 801, "financial": 802, "relationship": 803, "separate": 804, "centre": 805, "smaller": 806, "worked": 807, "laws": 808, "additional": 809, "performed": 810, "oil": 811, "relations": 812, "letter": 813, "estimated": 814, "fire": 815, "global": 816, "lake": 817, "highest": 818, "artists": 819, "leader": 820, "australia": 821, "peoples": 822, "musical": 823, "cells": 824, "problem": 825, "operations": 826, "report": 827, "discovered": 828, "7": 829, "met": 830, "2001": 831, "career": 832, "event": 833, "response": 834, "status": 835, "centuries": 836, "previous": 837, "methods": 838, "knowledge": 839, "active": 840, "compared": 841, "nearly": 842, "primarily": 843, "2024": 844, "levels": 845, "jews": 846, "pressure": 847, "robert": 848, "highly": 849, "complete": 850, "california": 851, "married": 852, "examples": 853, "gas": 854, "rules": 855, "allow": 856, "2004": 857, "gold": 858, "street": 859, "religion": 860, "campaign": 861, "basis": 862, "true": 863, "treaty": 864, "player": 865, "stage": 866, "placed": 867, "13": 868, "novel": 869, "bank": 870, "give": 871, "cell": 872, "ten": 873, "constitution": 874, "mainly": 875, "committee": 876, "joined": 877, "revolution": 878, "changed": 879, "influenced": 880, "parties": 881, "road": 882, "internet": 883, "agreement": 884, "argued": 885, "divided": 886, "museum": 887, "memory": 888, "better": 889, "academy": 890, "cannot": 891, "worlds": 892, "board": 893, "front": 894, "station": 895, "teams": 896, "philosophy": 897, "spain": 898, "heavy": 899, "individuals": 900, "unit": 901, "percent": 902, "basic": 903, "experience": 904, "our": 905, "congress": 906, "police": 907, "entire": 908, "training": 909, "literature": 910, "israel": 911, "rest": 912, "irish": 913, "grand": 914, "chemical": 915, "arts": 916, "suggested": 917, "money": 918, "troops": 919, "50": 920, "ever": 921, "24": 922, "female": 923, "songs": 924, "paul": 925, "text": 926, "yet": 927, "lack": 928, "relatively": 929, "color": 930, "taking": 931, "appointed": 932, "night": 933, "deaths": 934, "x": 935, "past": 936, "letters": 937, "blue": 938, "latter": 939, "eight": 940, "contains": 941, "shown": 942, "previously": 943, "analysis": 944, "2003": 945, "ended": 946, "animals": 947, "fiction": 948, "ground": 949, "account": 950, "turn": 951, "signed": 952, "marriage": 953, "go": 954, "issues": 955, "potential": 956, "resulting": 957, "whole": 958, "enough": 959, "summer": 960, "opened": 961, "appear": 962, "noted": 963, "decided": 964, "rome": 965, "peace": 966, "attempt": 967, "forced": 968, "issue": 969, "director": 970, "imperial": 971, "daughter": 972, "done": 973, "plan": 974, "paris": 975, "department": 976, "completed": 977, "prior": 978, "1980s": 979, "climate": 980, "temperature": 981, "famous": 982, "test": 983, "ireland": 984, "nuclear": 985, "regional": 986, "accepted": 987, "sense": 988, "thomas": 989, "collection": 990, "1970s": 991, "programs": 992, "powers": 993, "operation": 994, "administration": 995, "russia": 996, "property": 997, "17": 998, "1999": 999, "v": 1000, "green": 1001, "blood": 1002, "origin": 1003, "applied": 1004, "stories": 1005, "treatment": 1006, "ad": 1007, "intended": 1008, "approach": 1009, "length": 1010, "rise": 1011, "passed": 1012, "birth": 1013, "ships": 1014, "move": 1015, "speed": 1016, "functions": 1017, "objects": 1018, "wide": 1019, "hall": 1020, "hours": 1021, "activity": 1022, "difficult": 1023, "probably": 1024, "child": 1025, "date": 1026, "amount": 1027, "2002": 1028, "matter": 1029, "article": 
1030, "district": 1031, "brother": 1032, "behind": 1033, "ideas": 1034, "chief": 1035, "earliest": 1036, "navy": 1037, "exist": 1038, "degree": 1039, "machine": 1040, "opposition": 1041, "industrial": 1042, "democratic": 1043, "loss": 1044, "derived": 1045, "product": 1046, "canadian": 1047, "find": 1048, "creation": 1049, "reduced": 1050, "presence": 1051, "failed": 1052, "becoming": 1053, "simple": 1054, "properties": 1055, "makes": 1056, "m": 1057, "dutch": 1058, "get": 1059, "needed": 1060, "finally": 1061, "provides": 1062, "reference": 1063, "lived": 1064, "institute": 1065, "ability": 1066, "assembly": 1067, "annual": 1068, "me": 1069, "carried": 1070, "21": 1071, "humans": 1072, "notable": 1073, "contemporary": 1074, "declared": 1075, "ones": 1076, "represented": 1077, "playing": 1078, "cost": 1079, "composed": 1080, "appears": 1081, "washington": 1082, "1960s": 1083, "extended": 1084, "leaders": 1085, "frequently": 1086, "kings": 1087, "ie": 1088, "9": 1089, "transport": 1090, "elections": 1091, "armed": 1092, "resulted": 1093, "holy": 1094, "port": 1095, "border": 1096, "start": 1097, "increasing": 1098, "dna": 1099, "feature": 1100, "plants": 1101, "disease": 1102, "combined": 1103, "carbon": 1104, "featured": 1105, "element": 1106, "identified": 1107, "iron": 1108, "starting": 1109, "agreed": 1110, "older": 1111, "refer": 1112, "responsible": 1113, "existence": 1114, "stars": 1115, "necessary": 1116, "operating": 1117, "fall": 1118, "1998": 1119, "location": 1120, "commission": 1121, "materials": 1122, "louis": 1123, "1990s": 1124, "news": 1125, "san": 1126, "regular": 1127, "ice": 1128, "dead": 1129, "read": 1130, "except": 1131, "greatest": 1132, "mission": 1133, "internal": 1134, "1997": 1135, "conflict": 1136, "activities": 1137, "22": 1138, "spread": 1139, "e": 1140, "command": 1141, "opposed": 1142, "section": 1143, "offered": 1144, "professional": 1145, "province": 1146, "airport": 1147, "critical": 1148, "initial": 1149, "40": 1150, "reason": 1151, "efforts": 1152, "festival": 1153, "polish": 1154, "iii": 1155, "remain": 1156, "buildings": 1157, "places": 1158, "claims": 1159, "quickly": 1160, "respectively": 1161, "flight": 1162, "soldiers": 1163, "contrast": 1164, "fourth": 1165, "alternative": 1166, "launched": 1167, "occur": 1168, "management": 1169, "models": 1170, "mexico": 1171, "decision": 1172, "big": 1173, "cup": 1174, "woman": 1175, "plant": 1176, "consists": 1177, "prominent": 1178, "remaining": 1179, "definition": 1180, "presented": 1181, "exchange": 1182, "simply": 1183, "image": 1184, "asked": 1185, "applications": 1186, "claim": 1187, "currently": 1188, "clear": 1189, "wars": 1190, "paper": 1191, "workers": 1192, "believe": 1193, "australian": 1194, "risk": 1195, "continue": 1196, "1990": 1197, "egypt": 1198, "tour": 1199, "environment": 1200, "engineering": 1201, "engine": 1202, "digital": 1203, "channel": 1204, "techniques": 1205, "upper": 1206, "1996": 1207, "entered": 1208, "author": 1209, "poland": 1210, "urban": 1211, "muslim": 1212, "sports": 1213, "wanted": 1214, "values": 1215, "speech": 1216, "directed": 1217, "occurred": 1218, "sexual": 1219, "defeated": 1220, "meeting": 1221, "issued": 1222, "electric": 1223, "1992": 1224, "jesus": 1225, "allowing": 1226, "islamic": 1227, "allows": 1228, "contain": 1229, "mary": 1230, "freedom": 1231, "23": 1232, "1991": 1233, "expressed": 1234, "attacks": 1235, "object": 1236, "course": 1237, "defense": 1238, "acid": 1239, "communities": 1240, "library": 1241, "figure": 1242, "positive": 1243, "expected": 
1244, "quality": 1245, "beyond": 1246, "scale": 1247, "alexander": 1248, "poor": 1249, "magazine": 1250, "race": 1251, "governor": 1252, "unlike": 1253, "richard": 1254, "observed": 1255, "here": 1256, "subsequently": 1257, "minor": 1258, "month": 1259, "growing": 1260, "historian": 1261, "edition": 1262, "turned": 1263, "treaties": 1264, "regarded": 1265, "things": 1266, "sun": 1267, "19": 1268, "organizations": 1269, "versions": 1270, "charge": 1271, "fully": 1272, "families": 1273, "spent": 1274, "structures": 1275, "focus": 1276, "moon": 1277, "medieval": 1278, "conference": 1279, "governments": 1280, "churches": 1281, "fields": 1282, "convention": 1283, "ocean": 1284, "lord": 1285, "1995": 1286, "dynasty": 1287, "mark": 1288, "hold": 1289, "effective": 1290, "d": 1291, "20thcentury": 1292, "institutions": 1293, "distance": 1294, "1994": 1295, "reign": 1296, "orthodox": 1297, "win": 1298, "subsequent": 1299, "recognized": 1300, "helped": 1301, "victory": 1302, "inspired": 1303, "ethnic": 1304, "distinct": 1305, "told": 1306, "formation": 1307, "share": 1308, "ways": 1309, "27": 1310, "n": 1311, "28": 1312, "ship": 1313, "standards": 1314, "impact": 1315, "formal": 1316, "expansion": 1317, "labour": 1318, "critics": 1319, "26": 1320, "direction": 1321, "los": 1322, "attempted": 1323, "prevent": 1324, "f": 1325, "figures": 1326, "notes": 1327, "bands": 1328, "address": 1329, "protection": 1330, "press": 1331, "appearance": 1332, "marked": 1333, "weapons": 1334, "officially": 1335, "instance": 1336, "serve": 1337, "resources": 1338, "content": 1339, "leaving": 1340, "gods": 1341, "friend": 1342, "countrys": 1343, "golden": 1344, "develop": 1345, "negative": 1346, "nation": 1347, "j": 1348, "refused": 1349, "valley": 1350, "showed": 1351, "equal": 1352, "motion": 1353, "factors": 1354, "vote": 1355, "decades": 1356, "stone": 1357, "refers": 1358, "acts": 1359, "heart": 1360, "prince": 1361, "citizens": 1362, "reaction": 1363, "call": 1364, "arrived": 1365, "removed": 1366, "literary": 1367, "grew": 1368, "bill": 1369, "s": 1370, "faith": 1371, "unique": 1372, "sector": 1373, "car": 1374, "income": 1375, "square": 1376, "saint": 1377, "winter": 1378, "1993": 1379, "gained": 1380, "animal": 1381, "writer": 1382, "table": 1383, "double": 1384, "friends": 1385, "invasion": 1386, "distribution": 1387, "communist": 1388, "executive": 1389, "sought": 1390, "giving": 1391, "mean": 1392, "ordered": 1393, "territories": 1394, "overall": 1395, "staff": 1396, "completely": 1397, "increasingly": 1398, "nine": 1399, "justice": 1400, "expanded": 1401, "christianity": 1402, "historians": 1403, "powerful": 1404, "awarded": 1405, "1989": 1406, "specifically": 1407, "foundation": 1408, "politics": 1409, "g": 1410, "americans": 1411, "keep": 1412, "containing": 1413, "hit": 1414, "peter": 1415, "p": 1416, "supreme": 1417, "studio": 1418, "immediately": 1419, "sites": 1420, "advanced": 1421, "inside": 1422, "takes": 1423, "competition": 1424, "notably": 1425, "railway": 1426, "actions": 1427, "actually": 1428, "normal": 1429, "cross": 1430, "theatre": 1431, "h": 1432, "secretary": 1433, "michael": 1434, "pacific": 1435, "r": 1436, "deal": 1437, "rates": 1438, "attention": 1439, "question": 1440, "apollo": 1441, "users": 1442, "significantly": 1443, "understanding": 1444, "student": 1445, "running": 1446, "spoken": 1447, "principle": 1448, "occurs": 1449, "weeks": 1450, "saying": 1451, "application": 1452, "write": 1453, "fish": 1454, "mentioned": 1455, "domestic": 1456, "pope": 1457, "leadership": 1458, 
"rejected": 1459, "raised": 1460, "cold": 1461, "possibly": 1462, "resistance": 1463, "creating": 1464, "extensive": 1465, "equipment": 1466, "whereas": 1467, "theories": 1468, "face": 1469, "hydrogen": 1470, "liberal": 1471, "worldwide": 1472, "oldest": 1473, "relative": 1474, "awards": 1475, "recently": 1476, "depending": 1477, "formula": 1478, "bay": 1479, "joseph": 1480, "identity": 1481, "planned": 1482, "cut": 1483, "brown": 1484, "tax": 1485, "determined": 1486, "plays": 1487, "branch": 1488, "describes": 1489, "authors": 1490, "von": 1491, "windows": 1492, "generation": 1493, "sets": 1494, "crisis": 1495, "mathematics": 1496, "chicago": 1497, "medicine": 1498, "moving": 1499, "hard": 1500, "situation": 1501, "differences": 1502, "cycle": 1503, "processes": 1504, "queen": 1505, "goal": 1506, "belief": 1507, "arab": 1508, "travel": 1509, "volume": 1510, "studied": 1511, "perhaps": 1512, "ultimately": 1513, "tried": 1514, "follows": 1515, "reduce": 1516, "require": 1517, "plans": 1518, "scotland": 1519, "policies": 1520, "kept": 1521, "difference": 1522, "importance": 1523, "stations": 1524, "scientists": 1525, "destroyed": 1526, "devices": 1527, "cover": 1528, "phase": 1529, "texts": 1530, "greece": 1531, "heat": 1532, "context": 1533, "census": 1534, "closed": 1535, "labor": 1536, "granted": 1537, "purpose": 1538, "shared": 1539, "mountains": 1540, "connected": 1541, "indigenous": 1542, "aid": 1543, "equivalent": 1544, "programming": 1545, "arms": 1546, "fell": 1547, "temple": 1548, "intelligence": 1549, "dance": 1550, "bce": 1551, "martin": 1552, "existing": 1553, "meant": 1554, "settlement": 1555, "gives": 1556, "something": 1557, "conservative": 1558, "christ": 1559, "say": 1560, "shot": 1561, "controlled": 1562, "avoid": 1563, "ruled": 1564, "mind": 1565, "architecture": 1566, "regarding": 1567, "deep": 1568, "instruments": 1569, "attempts": 1570, "causes": 1571, "represent": 1572, "electronic": 1573, "communication": 1574, "reach": 1575, "presidential": 1576, "review": 1577, "core": 1578, "etc": 1579, "tv": 1580, "projects": 1581, "proved": 1582, "behavior": 1583, "prize": 1584, "officers": 1585, "price": 1586, "comes": 1587, "actors": 1588, "care": 1589, "says": 1590, "closely": 1591, "1986": 1592, "achieved": 1593, "week": 1594, "flow": 1595, "shortly": 1596, "describe": 1597, "learning": 1598, "universe": 1599, "solution": 1600, "bodies": 1601, "bridge": 1602, "widespread": 1603, "1984": 1604, "conducted": 1605, "views": 1606, "universal": 1607, "toward": 1608, "parents": 1609, "1945": 1610, "reform": 1611, "felt": 1612, "opening": 1613, "kind": 1614, "1980": 1615, "reasons": 1616, "influential": 1617, "environmental": 1618, "fighting": 1619, "christians": 1620, "going": 1621, "captured": 1622, "supply": 1623, "fuel": 1624, "suggests": 1625, "1979": 1626, "31": 1627, "daily": 1628, "29": 1629, "winning": 1630, "1975": 1631, "academic": 1632, "portuguese": 1633, "1985": 1634, "match": 1635, "200": 1636, "crew": 1637, "offer": 1638, "reports": 1639, "nor": 1640, "gdp": 1641, "angeles": 1642, "principles": 1643, "developing": 1644, "capacity": 1645, "providing": 1646, "visited": 1647, "sciences": 1648, "authorities": 1649, "historically": 1650, "constant": 1651, "serious": 1652, "computers": 1653, "episode": 1654, "unknown": 1655, "pass": 1656, "combination": 1657, "van": 1658, "1950s": 1659, "mountain": 1660, "heavily": 1661, "championship": 1662, "weight": 1663, "articles": 1664, "traditionally": 1665, "mathematical": 1666, "pay": 1667, "alongside": 1668, "failure": 1669, 
"contact": 1670, "smith": 1671, "thousands": 1672, "towns": 1673, "round": 1674, "agricultural": 1675, "leave": 1676, "brothers": 1677, "scottish": 1678, "naval": 1679, "defeat": 1680, "physics": 1681, "1970": 1682, "listed": 1683, "effort": 1684, "discovery": 1685, "know": 1686, "citys": 1687, "technical": 1688, "scene": 1689, "colonial": 1690, "solar": 1691, "eu": 1692, "classes": 1693, "dark": 1694, "introduction": 1695, "suffered": 1696, "secondary": 1697, "fifth": 1698, "births": 1699, "requires": 1700, "alliance": 1701, "similarly": 1702, "finished": 1703, "external": 1704, "practices": 1705, "novels": 1706, "particles": 1707, "organized": 1708, "shape": 1709, "room": 1710, "attended": 1711, "hebrew": 1712, "genetic": 1713, "ages": 1714, "edward": 1715, "residents": 1716, "le": 1717, "route": 1718, "t": 1719, "alone": 1720, "build": 1721, "paid": 1722, "statement": 1723, "artist": 1724, "affected": 1725, "secret": 1726, "1983": 1727, "tree": 1728, "online": 1729, "60": 1730, "owned": 1731, "mixed": 1732, "1988": 1733, "courts": 1734, "ranked": 1735, "1968": 1736, "emerged": 1737, "receive": 1738, "positions": 1739, "arabic": 1740, "logic": 1741, "oxygen": 1742, "mobile": 1743, "professor": 1744, "persons": 1745, "contained": 1746, "maintain": 1747, "components": 1748, "acquired": 1749, "maintained": 1750, "host": 1751, "defence": 1752, "moral": 1753, "traditions": 1754, "guitar": 1755, "compounds": 1756, "consider": 1757, "officials": 1758, "becomes": 1759, "minutes": 1760, "target": 1761, "combat": 1762, "village": 1763, "entirely": 1764, "maximum": 1765, "lands": 1766, "fight": 1767, "rivers": 1768, "rare": 1769, "damage": 1770, "agriculture": 1771, "popularity": 1772, "contributed": 1773, "spirit": 1774, "goods": 1775, "roughly": 1776, "symbol": 1777, "voice": 1778, "choice": 1779, "1987": 1780, "aspects": 1781, "typical": 1782, "meet": 1783, "sequence": 1784, "bring": 1785, "carry": 1786, "dedicated": 1787, "easily": 1788, "perform": 1789, "violence": 1790, "constructed": 1791, "publication": 1792, "1969": 1793, "1982": 1794, "ottoman": 1795, "houses": 1796, "jerusalem": 1797, "atlantic": 1798, "christmas": 1799, "evolution": 1800, "banks": 1801, "cast": 1802, "display": 1803, "operated": 1804, "wall": 1805, "18th": 1806, "broadcast": 1807, "cancer": 1808, "slightly": 1809, "1000": 1810, "investment": 1811, "condition": 1812, "senate": 1813, "am": 1814, "trial": 1815, "zone": 1816, "wave": 1817, "republican": 1818, "1972": 1819, "settled": 1820, "k": 1821, "1971": 1822, "descent": 1823, "concluded": 1824, "bible": 1825, "sales": 1826, "comedy": 1827, "permanent": 1828, "hot": 1829, "employed": 1830, "younger": 1831, "hospital": 1832, "atoms": 1833, "orders": 1834, "track": 1835, "frequency": 1836, "confirmed": 1837, "clubs": 1838, "contract": 1839, "ball": 1840, "persian": 1841, "magnetic": 1842, "output": 1843, "device": 1844, "technique": 1845, "causing": 1846, "stable": 1847, "apple": 1848, "forest": 1849, "1974": 1850, "factor": 1851, "bbc": 1852, "electron": 1853, "note": 1854, "signal": 1855, "netherlands": 1856, "asian": 1857, "runs": 1858, "drug": 1859, "measure": 1860, "surrounding": 1861, "sons": 1862, "actual": 1863, "w": 1864, "purposes": 1865, "occupied": 1866, "audience": 1867, "marine": 1868, "otherwise": 1869, "duke": 1870, "spring": 1871, "demand": 1872, "reading": 1873, "post": 1874, "1981": 1875, "sister": 1876, "obtained": 1877, "revealed": 1878, "translation": 1879, "unable": 1880, "improved": 1881, "ibn": 1882, "philosophers": 1883, "rail": 1884, "crime": 
1885, "measures": 1886, "recording": 1887, "fleet": 1888, "molecules": 1889, "joint": 1890, "columbia": 1891, "sign": 1892, "affairs": 1893, "1967": 1894, "follow": 1895, "1976": 1896, "wood": 1897, "brain": 1898, "additionally": 1899, "producing": 1900, "decline": 1901, "1973": 1902, "approved": 1903, "jersey": 1904, "safety": 1905, "fundamental": 1906, "movements": 1907, "nazi": 1908, "split": 1909, "crown": 1910, "populations": 1911, "mental": 1912, "coming": 1913, "silver": 1914, "greatly": 1915, "sides": 1916, "lives": 1917, "expression": 1918, "temperatures": 1919, "vehicles": 1920, "radiation": 1921, "strength": 1922, "setting": 1923, "supporting": 1924, "movie": 1925, "debate": 1926, "al": 1927, "covered": 1928, "accounts": 1929, "seats": 1930, "managed": 1931, "painting": 1932, "protect": 1933, "transfer": 1934, "steel": 1935, "succeeded": 1936, "concepts": 1937, "rapid": 1938, "1978": 1939, "writings": 1940, "calendar": 1941, "womens": 1942, "ran": 1943, "composition": 1944, "images": 1945, "connection": 1946, "el": 1947, "ago": 1948, "visit": 1949, "finland": 1950, "hands": 1951, "forward": 1952, "search": 1953, "hill": 1954, "personnel": 1955, "ministry": 1956, "instrument": 1957, "titled": 1958, "quantum": 1959, "advantage": 1960, "dominant": 1961, "tribes": 1962, "establishment": 1963, "establish": 1964, "0": 1965, "teaching": 1966, "your": 1967, "extent": 1968, "broke": 1969, "networks": 1970, "useful": 1971, "peninsula": 1972, "attributed": 1973, "file": 1974, "argues": 1975, "islam": 1976, "why": 1977, "roles": 1978, "constitutional": 1979, "pieces": 1980, "producer": 1981, "experienced": 1982, "cars": 1983, "musicians": 1984, "script": 1985, "chosen": 1986, "electrons": 1987, "drive": 1988, "southeast": 1989, "quite": 1990, "master": 1991, "dates": 1992, "afghanistan": 1993, "principal": 1994, "severe": 1995, "determine": 1996, "sir": 1997, "rose": 1998, "focused": 1999, "UNKNOWN": 2000} \ No newline at end of file diff --git a/nlp_class2/logistic.py b/nlp_class2/logistic.py index 352c2f57..eda2357e 100644 --- a/nlp_class2/logistic.py +++ b/nlp_class2/logistic.py @@ -47,7 +47,7 @@ # train a logistic model - W = np.random.randn(V, V) / np.sqrt(V) + W = np.random.randn(V, V)/np.sqrt(V) losses = [] epochs = 1 @@ -56,7 +56,7 @@ def softmax(a): a = a - a.max() exp_a = np.exp(a) - return exp_a / exp_a.sum(axis=1, keepdims=True) + return exp_a/exp_a.sum(axis=1, keepdims=True) # what is the loss if we set W = log(bigram_probs)? 
W_bigram = np.log(bigram_probs) @@ -85,19 +85,19 @@ def softmax(a): W = W - lr * inputs.T.dot(predictions - targets) # keep track of the loss - loss = -np.sum(targets * np.log(predictions)) / (n - 1) + loss = -np.sum(targets*np.log(predictions))/(n - 1) losses.append(loss) # keep track of the bigram loss # only do it for the first epoch to avoid redundancy if epoch == 0: bigram_predictions = softmax(inputs.dot(W_bigram)) - bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1) + bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n - 1) bigram_losses.append(bigram_loss) - if j % 10 == 0: - print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss) + if j%10 == 0: + print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}") j += 1 print("Elapsed time training:", datetime.now() - t0) @@ -114,8 +114,8 @@ def smoothed_loss(x, decay=0.99): y = np.zeros(len(x)) last = 0 for t in range(len(x)): - z = decay * last + (1 - decay) * x[t] - y[t] = z / (1 - decay ** (t + 1)) + z = decay*last + (1 - decay)*x[t] + y[t] = z/(1 - decay**(t + 1)) last = z return y diff --git a/nlp_class2/ner_tf.py b/nlp_class2/ner_tf.py index 7f8fa2c1..6c2e7a53 100644 --- a/nlp_class2/ner_tf.py +++ b/nlp_class2/ner_tf.py @@ -13,16 +13,15 @@ import os import sys sys.path.append(os.path.abspath('..')) -from pos_baseline import get_data +#from pos_baseline import get_data from sklearn.utils import shuffle from util import init_weight from datetime import datetime -from sklearn.metrics import f1_score - -from tensorflow.contrib.rnn import static_rnn as get_rnn_output -from tensorflow.contrib.rnn import BasicRNNCell, GRUCell - +#from sklearn.metrics import f1_score +from tensorflow.keras.layers import GRUCell, RNN #type: ignore +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() def get_data(split_sequences=False): word2idx = {} @@ -33,7 +32,7 @@ def get_data(split_sequences=False): Ytrain = [] currentX = [] currentY = [] - for line in open('ner.txt'): + for line in open('ner.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -95,16 +94,16 @@ def flatten(l): # pad sequences Xtrain = tf.keras.preprocessing.sequence.pad_sequences(Xtrain, maxlen=sequence_length) Ytrain = tf.keras.preprocessing.sequence.pad_sequences(Ytrain, maxlen=sequence_length) -Xtest = tf.keras.preprocessing.sequence.pad_sequences(Xtest, maxlen=sequence_length) -Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen=sequence_length) +Xtest = tf.keras.preprocessing.sequence.pad_sequences(Xtest, maxlen=sequence_length) +Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen=sequence_length) print("Xtrain.shape:", Xtrain.shape) print("Ytrain.shape:", Ytrain.shape) # inputs -inputs = tf.placeholder(tf.int32, shape=(None, sequence_length)) -targets = tf.placeholder(tf.int32, shape=(None, sequence_length)) +inputs = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) +targets = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) num_samples = tf.shape(inputs)[0] # useful for later # embedding @@ -119,19 +118,18 @@ def flatten(l): tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) -# make the rnn unit -rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) - +rnn_unit = RNN(GRUCell( + units=hidden_layer_size, activation=tf.nn.relu), return_sequences=True, return_state=True) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # converts x from a tensor of shape N x T x D # into a list of length T, 
where each element is a tensor of shape N x D -x = tf.unstack(x, sequence_length, 1) +#x = tf.unstack(x, sequence_length, 1) # get the rnn output -outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) +outputs, states = rnn_unit(x) # outputs are now of size (T, N, M) @@ -151,14 +149,14 @@ def flatten(l): labels=labels_flat ) ) -train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op) +train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(cost_op) # init stuff -sess = tf.InteractiveSession() -init = tf.global_variables_initializer() +sess = tf.compat.v1.InteractiveSession() +init = tf.compat.v1.global_variables_initializer() sess.run(init) diff --git a/nlp_class2/neural_network.py b/nlp_class2/neural_network.py index d44c6f52..4f5148a3 100644 --- a/nlp_class2/neural_network.py +++ b/nlp_class2/neural_network.py @@ -48,8 +48,8 @@ # train a shallow neural network model D = 100 - W1 = np.random.randn(V, D) / np.sqrt(V) - W2 = np.random.randn(D, V) / np.sqrt(D) + W1 = np.random.randn(V, D)/np.sqrt(V) + W2 = np.random.randn(D, V)/np.sqrt(D) losses = [] epochs = 1 @@ -58,7 +58,7 @@ def softmax(a): a = a - a.max() exp_a = np.exp(a) - return exp_a / exp_a.sum(axis=1, keepdims=True) + return exp_a/exp_a.sum(axis=1, keepdims=True) # what is the loss if we set W = log(bigram_probs)? W_bigram = np.log(bigram_probs) @@ -84,24 +84,24 @@ def softmax(a): predictions = softmax(hidden.dot(W2)) # do a gradient descent step - W2 = W2 - lr * hidden.T.dot(predictions - targets) - dhidden = (predictions - targets).dot(W2.T) * (1 - hidden * hidden) - W1 = W1 - lr * inputs.T.dot(dhidden) + W2 = W2 - lr*hidden.T.dot(predictions - targets) + dhidden = (predictions - targets).dot(W2.T)*(1 - hidden*hidden) + W1 = W1 - lr*inputs.T.dot(dhidden) # keep track of the loss - loss = -np.sum(targets * np.log(predictions)) / (n - 1) + loss = -np.sum(targets*np.log(predictions))/(n - 1) losses.append(loss) # keep track of the bigram loss # only do it for the first epoch to avoid redundancy if epoch == 0: bigram_predictions = softmax(inputs.dot(W_bigram)) - bigram_loss = -np.sum(targets * np.log(bigram_predictions)) / (n - 1) + bigram_loss = -np.sum(targets*np.log(bigram_predictions))/(n - 1) bigram_losses.append(bigram_loss) if j % 10 == 0: - print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss) + print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}") j += 1 print("Elapsed time training:", datetime.now() - t0) @@ -118,8 +118,8 @@ def smoothed_loss(x, decay=0.99): y = np.zeros(len(x)) last = 0 for t in range(len(x)): - z = decay * last + (1 - decay) * x[t] - y[t] = z / (1 - decay ** (t + 1)) + z = decay*last + (1 - decay)*x[t] + y[t] = z/(1 - decay**(t + 1)) last = z return y diff --git a/nlp_class2/neural_network2.py b/nlp_class2/neural_network2.py index 159dc571..c9df7f11 100644 --- a/nlp_class2/neural_network2.py +++ b/nlp_class2/neural_network2.py @@ -48,8 +48,8 @@ # train a shallow neural network model D = 100 - W1 = np.random.randn(V, D) / np.sqrt(V) - W2 = np.random.randn(D, V) / np.sqrt(D) + W1 = np.random.randn(V, D)/np.sqrt(V) + W2 = np.random.randn(D, V)/np.sqrt(D) losses = [] epochs = 1 @@ -58,7 +58,7 @@ def softmax(a): a = a - a.max() exp_a = np.exp(a) - return exp_a / exp_a.sum(axis=1, keepdims=True) + return exp_a/exp_a.sum(axis=1, keepdims=True) # what is the loss if we set W = log(bigram_probs)? 
W_bigram = np.log(bigram_probs) @@ -82,7 +82,7 @@ def softmax(a): predictions = softmax(hidden.dot(W2)) # keep track of the loss - loss = -np.sum(np.log(predictions[np.arange(n - 1), targets])) / (n - 1) + loss = -np.sum(np.log(predictions[np.arange(n - 1), targets]))/(n - 1) losses.append(loss) # do a gradient descent step @@ -90,14 +90,14 @@ def softmax(a): # we don't want to make a copy because it would be slow doutput = predictions # N x V doutput[np.arange(n - 1), targets] -= 1 - W2 = W2 - lr * hidden.T.dot(doutput) # (D x N) (N x V) - dhidden = doutput.dot(W2.T) * (1 - hidden * hidden) # (N x V) (V x D) * (N x D) + W2 = W2 - lr*hidden.T.dot(doutput) # (D x N) (N x V) + dhidden = doutput.dot(W2.T)*(1 - hidden*hidden) # (N x V) (V x D) * (N x D) # # for reference: # # original: W1 = W1 - lr * inputs.T.dot(dhidden) # VxN NxD --> VxD # fastest way W1_copy = W1.copy() - np.subtract.at(W1, inputs, lr * dhidden) + np.subtract.at(W1, inputs, lr*dhidden) # vs this # W1_test = W1_copy.copy() @@ -118,12 +118,12 @@ def softmax(a): # only do it for the first epoch to avoid redundancy if epoch == 0: bigram_predictions = softmax(W_bigram[inputs]) - bigram_loss = -np.sum(np.log(bigram_predictions[np.arange(n - 1), targets])) / (n - 1) + bigram_loss = -np.sum(np.log(bigram_predictions[np.arange(n - 1), targets]))/(n - 1) bigram_losses.append(bigram_loss) if j % 100 == 0: - print("epoch:", epoch, "sentence: %s/%s" % (j, len(sentences)), "loss:", loss) + print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}") j += 1 @@ -141,8 +141,8 @@ def smoothed_loss(x, decay=0.99): y = np.zeros(len(x)) last = 0 for t in range(len(x)): - z = decay * last + (1 - decay) * x[t] - y[t] = z / (1 - decay ** (t + 1)) + z = decay*last + (1 - decay)*x[t] + y[t] = z / (1 - decay**(t + 1)) last = z return y diff --git a/nlp_class2/pmi.py b/nlp_class2/pmi.py index b321e91f..941517cb 100644 --- a/nlp_class2/pmi.py +++ b/nlp_class2/pmi.py @@ -49,7 +49,7 @@ def remove_punctuation_3(s): num_lines = 0 num_tokens = 0 for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): # don't count headers, structured data, lists, etc... if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'): num_lines += 1 @@ -112,7 +112,7 @@ def remove_punctuation_3(s): k = 0 # for line in open('../large_files/text8'): for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): # don't count headers, structured data, lists, etc... 
if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'): line_as_idx = [] @@ -153,7 +153,7 @@ def remove_punctuation_3(s): # PMI(w, c) = #(w, c) / #(w) / p(c) # pmi = wc_counts / wc_counts.sum(axis=1) / c_probs # works only if numpy arrays -pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr() +pmi = wc_counts.multiply(1.0/wc_counts.sum(axis=1)/c_probs).tocsr() # this operation changes it to a coo_matrix # which doesn't have functions we need, e.g log1p() # so convert it back to a csr @@ -172,9 +172,9 @@ def remove_punctuation_3(s): # initialize weights -W = np.random.randn(V, D) / np.sqrt(V + D) +W = np.random.randn(V, D)/np.sqrt(V + D) b = np.zeros(V) -U = np.random.randn(V, D) / np.sqrt(V + D) +U = np.random.randn(V, D)/np.sqrt(V + D) c = np.zeros(V) mu = logX.mean() @@ -220,7 +220,7 @@ def remove_punctuation_3(s): W = np.linalg.solve(matrix, vector).T # vectorized update b - b = (logX - W.dot(U.T) - c.reshape(1, V) - mu).sum(axis=1) / V + b = (logX - W.dot(U.T) - c.reshape(1, V) - mu).sum(axis=1)/V # vectorized update U matrix = reg*np.eye(D) + W.T.dot(W) @@ -228,7 +228,7 @@ def remove_punctuation_3(s): U = np.linalg.solve(matrix, vector).T # vectorized update c - c = (logX - W.dot(U.T) - b.reshape(V, 1) - mu).sum(axis=0) / V + c = (logX - W.dot(U.T) - b.reshape(V, 1) - mu).sum(axis=0)/V print("train duration:", datetime.now() - t0) @@ -259,6 +259,9 @@ def remove_punctuation_3(s): # set word embedding matrix # W = (W + U) / 2 +vec = np.asarray(vec) +W = np.asarray(W) + distances = pairwise_distances(vec.reshape(1, D), W, metric='cosine').reshape(V) idx = distances.argsort()[:10] @@ -266,7 +269,9 @@ def remove_punctuation_3(s): for i in idx: print(top_words[i], distances[i]) -print("dist to queen:", cos_dist(W[word2idx['queen']], vec)) +queen_vector = np.squeeze(W[word2idx['queen']]) +vec = np.squeeze(vec) +print("dist to queen:", cos_dist(queen_vector, vec)) diff --git a/nlp_class2/pmi_counts_2000.npz b/nlp_class2/pmi_counts_2000.npz new file mode 100644 index 00000000..d626d8d6 Binary files /dev/null and b/nlp_class2/pmi_counts_2000.npz differ diff --git a/nlp_class2/pos_hmm.py b/nlp_class2/pos_hmm.py index e3065cd2..0e3345ab 100644 --- a/nlp_class2/pos_hmm.py +++ b/nlp_class2/pos_hmm.py @@ -15,7 +15,7 @@ sys.path.append(os.path.abspath('..')) from hmm_class.hmmd_scaled import HMM -from pos_baseline import get_data +#from pos_baseline import get_data from sklearn.utils import shuffle from datetime import datetime from sklearn.metrics import f1_score @@ -28,7 +28,7 @@ def accuracy(T, Y): for t, y in zip(T, Y): n_correct += np.sum(t == y) n_total += len(y) - return float(n_correct) / n_total + return float(n_correct)/n_total def total_f1_score(T, Y): @@ -41,6 +41,78 @@ def total_f1_score(T, Y): # def flatten(l): # return [item for sublist in l for item in sublist] +def get_data(split_sequences=False): + if not os.path.exists('chunking'): + print("Please create a folder in your local directory called 'chunking'") + print("train.txt and test.txt should be stored in there.") + print("Please check the comments to get the download link.") + exit() + elif not os.path.exists('chunking/train.txt'): + print("train.txt is not in chunking/train.txt") + print("Please check the comments to get the download link.") + exit() + elif not os.path.exists('chunking/test.txt'): + print("test.txt is not in chunking/test.txt") + print("Please check the comments to get the download link.") + exit() + + word2idx = {} + tag2idx = {} + word_idx = 0 + tag_idx = 0 + Xtrain = [] + Ytrain = 
[] + currentX = [] + currentY = [] + for line in open('chunking/train.txt', encoding='utf-8'): + line = line.rstrip() + if line: + r = line.split() + word, tag, _ = r + if word not in word2idx: + word2idx[word] = word_idx + word_idx += 1 + currentX.append(word2idx[word]) + + if tag not in tag2idx: + tag2idx[tag] = tag_idx + tag_idx += 1 + currentY.append(tag2idx[tag]) + elif split_sequences: + Xtrain.append(currentX) + Ytrain.append(currentY) + currentX = [] + currentY = [] + + if not split_sequences: + Xtrain = currentX + Ytrain = currentY + + # load and score test data + Xtest = [] + Ytest = [] + currentX = [] + currentY = [] + for line in open('chunking/test.txt', encoding='utf-8'): + line = line.rstrip() + if line: + r = line.split() + word, tag, _ = r + if word in word2idx: + currentX.append(word2idx[word]) + else: + currentX.append(word_idx) # use this as unknown + currentY.append(tag2idx[tag]) + elif split_sequences: + Xtest.append(currentX) + Ytest.append(currentY) + currentX = [] + currentY = [] + if not split_sequences: + Xtest = currentX + Ytest = currentY + + return Xtrain, Ytrain, Xtest, Ytest, word2idx def main(smoothing=1e-1): # X = words, Y = POS tags diff --git a/nlp_class2/pos_ner_keras.py b/nlp_class2/pos_ner_keras.py index 7a1335e1..9c64609e 100644 --- a/nlp_class2/pos_ner_keras.py +++ b/nlp_class2/pos_ner_keras.py @@ -9,20 +9,21 @@ import numpy as np import matplotlib.pyplot as plt +import tensorflow as tf import os import sys sys.path.append(os.path.abspath('..')) -from pos_baseline import get_data +#from pos_baseline import get_data from sklearn.utils import shuffle -from util import init_weight +#from util import init_weight from datetime import datetime -from sklearn.metrics import f1_score +#from sklearn.metrics import f1_score -from keras.models import Model -from keras.layers import Input, Dense, Embedding, LSTM, GRU -from keras.preprocessing.sequence import pad_sequences -from keras.preprocessing.text import Tokenizer -from keras.optimizers import Adam +from tensorflow.keras.models import Model #type: ignore +from tensorflow.keras.layers import Input, Dense, Embedding, GRU, LSTM, SimpleRNN #type: ignore +from tensorflow.keras.preprocessing.sequence import pad_sequences #type: ignore +from tensorflow.keras.preprocessing.text import Tokenizer #type: ignore +from tensorflow.keras.optimizers import Adam #type: ignore MAX_VOCAB_SIZE = 20000 @@ -30,73 +31,12 @@ -def get_data_pos(split_sequences=False): - if not os.path.exists('chunking'): - print("Please create a folder in your local directory called 'chunking'") - print("train.txt and test.txt should be stored in there.") - print("Please check the comments to get the download link.") - exit() - elif not os.path.exists('chunking/train.txt'): - print("train.txt is not in chunking/train.txt") - print("Please check the comments to get the download link.") - exit() - elif not os.path.exists('chunking/test.txt'): - print("test.txt is not in chunking/test.txt") - print("Please check the comments to get the download link.") - exit() - - Xtrain = [] - Ytrain = [] - currentX = [] - currentY = [] - for line in open('chunking/train.txt'): - line = line.rstrip() - if line: - r = line.split() - word, tag, _ = r - currentX.append(word) - - currentY.append(tag) - elif split_sequences: - Xtrain.append(currentX) - Ytrain.append(currentY) - currentX = [] - currentY = [] - - if not split_sequences: - Xtrain = currentX - Ytrain = currentY - - # load and score test data - Xtest = [] - Ytest = [] - currentX = [] - currentY = [] - for line 
in open('chunking/test.txt'): - line = line.rstrip() - if line: - r = line.split() - word, tag, _ = r - currentX.append(word) - currentY.append(tag) - elif split_sequences: - Xtest.append(currentX) - Ytest.append(currentY) - currentX = [] - currentY = [] - if not split_sequences: - Xtest = currentX - Ytest = currentY - - return Xtrain, Ytrain, Xtest, Ytest - - def get_data_ner(split_sequences=False): Xtrain = [] Ytrain = [] currentX = [] currentY = [] - for line in open('ner.txt'): + for line in open('ner.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -138,7 +78,7 @@ def get_data_ner(split_sequences=False): # get word -> integer mapping word2idx = tokenizer.word_index -print('Found %s unique tokens.' % len(word2idx)) +print(f'Found {len(word2idx)} unique tokens.') vocab_size = min(MAX_VOCAB_SIZE, len(word2idx) + 1) @@ -150,7 +90,7 @@ def get_data_ner(split_sequences=False): # get tag -> integer mapping tag2idx = tokenizer2.word_index -print('Found %s unique tags.' % len(tag2idx)) +print(f'Found {len(tag2idx)} unique tags.') num_tags = min(MAX_TAGS, len(tag2idx) + 1) @@ -189,34 +129,32 @@ def get_data_ner(split_sequences=False): # build the model input_ = Input(shape=(sequence_length,)) x = Embedding(vocab_size, embedding_dim)(input_) -x = GRU(hidden_layer_size, return_sequences=True)(x) +x = SimpleRNN(hidden_layer_size, return_sequences=True)(x) output = Dense(num_tags, activation='softmax')(x) model = Model(input_, output) model.compile( loss='categorical_crossentropy', - optimizer=Adam(lr=1e-2), + optimizer=Adam(learning_rate=1e-2), metrics=['accuracy'] ) print('Training model...') -r = model.fit( - Xtrain, - Ytrain_onehot, - batch_size=batch_size, - epochs=epochs, - validation_data=(Xtest, Ytest_onehot) -) +r = model.fit(Xtrain, + Ytrain_onehot, + batch_size=batch_size, + epochs=epochs, + validation_data=(Xtest, Ytest_onehot)) -# plot some data +# plot loss plt.plot(r.history['loss'], label='loss') plt.plot(r.history['val_loss'], label='val_loss') plt.legend() plt.show() -# accuracies +# plot accuracy plt.plot(r.history['accuracy'], label='acc') plt.plot(r.history['val_accuracy'], label='val_acc') plt.legend() diff --git a/nlp_class2/pos_tf.py b/nlp_class2/pos_tf.py index 974453b6..c4d1724c 100644 --- a/nlp_class2/pos_tf.py +++ b/nlp_class2/pos_tf.py @@ -13,15 +13,15 @@ import os import sys sys.path.append(os.path.abspath('..')) -from pos_baseline import get_data +#from pos_baseline import get_data from sklearn.utils import shuffle from util import init_weight from datetime import datetime -from sklearn.metrics import f1_score - -from tensorflow.contrib.rnn import static_rnn as get_rnn_output -from tensorflow.contrib.rnn import BasicRNNCell, GRUCell +#from sklearn.metrics import f1_score +from tensorflow.keras.layers import GRUCell, RNN #type: ignore +if tf.__version__.startswith('2'): + tf.compat.v1.disable_eager_execution() def get_data(split_sequences=False): @@ -47,7 +47,7 @@ def get_data(split_sequences=False): Ytrain = [] currentX = [] currentY = [] - for line in open('chunking/train.txt'): + for line in open('chunking/train.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -76,7 +76,7 @@ def get_data(split_sequences=False): Ytest = [] currentX = [] currentY = [] - for line in open('chunking/test.txt'): + for line in open('chunking/test.txt', encoding='utf-8'): line = line.rstrip() if line: r = line.split() @@ -110,7 +110,7 @@ def flatten(l): # training config -epochs = 20 +epochs = 200 learning_rate = 1e-2 mu = 0.99 batch_size 
= 32 @@ -131,8 +131,8 @@ def flatten(l): # inputs -inputs = tf.placeholder(tf.int32, shape=(None, sequence_length)) -targets = tf.placeholder(tf.int32, shape=(None, sequence_length)) +inputs = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) +targets = tf.compat.v1.placeholder(tf.int32, shape=(None, sequence_length)) num_samples = tf.shape(inputs)[0] # useful for later # embedding @@ -148,7 +148,8 @@ def flatten(l): tfbo = tf.Variable(bo) # make the rnn unit -rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) +rnn_unit = RNN(GRUCell( + units=hidden_layer_size, activation=tf.nn.relu), return_sequences=True, return_state=True) # get the output @@ -156,10 +157,10 @@ def flatten(l): # converts x from a tensor of shape N x T x M # into a list of length T, where each element is a tensor of shape N x M -x = tf.unstack(x, sequence_length, 1) +#x = tf.unstack(x, sequence_length, 1) # get the rnn output -outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) +outputs, states = rnn_unit(x) # outputs are now of size (T, N, M) @@ -179,14 +180,14 @@ def flatten(l): labels=labels_flat ) ) -train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op) +train_op = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(cost_op) # init stuff -sess = tf.InteractiveSession() -init = tf.global_variables_initializer() +sess = tf.compat.v1.InteractiveSession() +init = tf.compat.v1.global_variables_initializer() sess.run(init) @@ -222,8 +223,7 @@ def flatten(l): # print stuff out periodically if j % 10 == 0: sys.stdout.write( - "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % - (j, n_batches, float(n_correct)/n_total, cost) + f"j/N: {j}/{n_batches} correct rate so far: {float(n_correct)/n_total}, cost so far: {cost}\r" ) sys.stdout.flush() @@ -236,13 +236,13 @@ def flatten(l): pii = pi[yi > 0] n_test_correct += np.sum(yii == pii) n_test_total += len(yii) - test_acc = float(n_test_correct) / n_test_total + test_acc = float(n_test_correct)/n_test_total print( - "i:", i, "cost:", "%.4f" % cost, - "train acc:", "%.4f" % (float(n_correct)/n_total), - "test acc:", "%.4f" % test_acc, - "time for epoch:", (datetime.now() - t0) + f'''i: {i}, cost: {cost:.4f}, + train acc: {float(n_correct)/n_total:.4f}, + test acc: {test_acc:.4f}, + time for epoch: {(datetime.now() - t0)}''' ) costs.append(cost) diff --git a/nlp_class2/rntn_tensorflow.py b/nlp_class2/rntn_tensorflow.py index 77b563f2..f3022da1 100644 --- a/nlp_class2/rntn_tensorflow.py +++ b/nlp_class2/rntn_tensorflow.py @@ -47,9 +47,9 @@ def __init__(self, V, D, K, activation): We = init_weight(V, D) # quadratic terms - W11 = np.random.randn(D, D, D) / np.sqrt(3*D) - W22 = np.random.randn(D, D, D) / np.sqrt(3*D) - W12 = np.random.randn(D, D, D) / np.sqrt(3*D) + W11 = np.random.randn(D, D, D)/np.sqrt(3*D) + W22 = np.random.randn(D, D, D)/np.sqrt(3*D) + W12 = np.random.randn(D, D, D)/np.sqrt(3*D) # linear terms W1 = init_weight(D, D) diff --git a/nlp_class2/rntn_tensorflow_rnn.py b/nlp_class2/rntn_tensorflow_rnn.py index 816ff4a2..a47d40aa 100644 --- a/nlp_class2/rntn_tensorflow_rnn.py +++ b/nlp_class2/rntn_tensorflow_rnn.py @@ -13,7 +13,7 @@ import tensorflow as tf from sklearn.utils import shuffle -from util import init_weight, get_ptb_data, display_tree +from util import init_weight, get_ptb_data from datetime import datetime from sklearn.metrics import f1_score @@ -191,7 +191,7 @@ def condition(hiddens, n): it += 1 if it % 10 == 0: sys.stdout.write( - "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % 
+ "j/N: %d/%d correct rate so far: %.4f, cost so far: %.4f\r" % (it, N, float(n_correct)/n_total, cost) ) sys.stdout.flush() @@ -212,10 +212,10 @@ def condition(hiddens, n): print( - "i:", i, "cost:", cost, - "train acc:", float(n_correct)/n_total, - "test acc:", float(n_test_correct)/n_test_total, - "time for epoch:", (datetime.now() - t0) + "i: ",i, "cost: %.4f", cost, + "train acc: %.4f", float(n_correct)/n_total, + "test acc: %.4f", float(n_test_correct)/n_test_total, + "time for epoch: ",(datetime.now() - t0) ) costs.append(cost) diff --git a/nlp_class2/tfidf_tsne.py b/nlp_class2/tfidf_tsne.py index 55bd4ce5..329ef46a 100644 --- a/nlp_class2/tfidf_tsne.py +++ b/nlp_class2/tfidf_tsne.py @@ -20,7 +20,7 @@ import sys sys.path.append(os.path.abspath('..')) from rnn_class.util import get_wikipedia_data -from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx +#from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx from util import find_analogies from sklearn.feature_extraction.text import TfidfTransformer diff --git a/nlp_class2/util.py b/nlp_class2/util.py index f2a79888..620192ba 100644 --- a/nlp_class2/util.py +++ b/nlp_class2/util.py @@ -182,7 +182,7 @@ def get_ptb_data(): test = [] # train set first - for line in open('../large_files/trees/train.txt'): + for line in open('../large_files/trees/train.txt', encoding='utf-8'): line = line.rstrip() if line: t = str2tree(line, word2idx) @@ -194,7 +194,7 @@ def get_ptb_data(): # break # test set - for line in open('../large_files/trees/test.txt'): + for line in open('../large_files/trees/test.txt', encoding='utf-8'): line = line.rstrip() if line: t = str2tree(line, word2idx) diff --git a/nlp_class2/visualize_countries.py b/nlp_class2/visualize_countries.py index 9d0a44e8..456ff0e5 100644 --- a/nlp_class2/visualize_countries.py +++ b/nlp_class2/visualize_countries.py @@ -31,7 +31,7 @@ def main(we_file='glove_model_50.npz', w2i_file='glove_word2idx_50.json'): Z = Z[idx] plt.scatter(Z[:,0], Z[:,1]) for i in range(len(words)): - plt.annotate(s=words[i], xy=(Z[i,0], Z[i,1])) + plt.annotate(text=words[i], xy=(Z[i,0], Z[i,1])) plt.show() diff --git a/nlp_class2/word2vec.py b/nlp_class2/word2vec.py index ba92e68c..e8989fb6 100644 --- a/nlp_class2/word2vec.py +++ b/nlp_class2/word2vec.py @@ -49,7 +49,7 @@ def get_wiki(): files = glob('../large_files/enwiki*.txt') all_word_counts = {} for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -68,7 +68,7 @@ def get_wiki(): sents = [] for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -100,7 +100,7 @@ def train_model(savedir): # learning rate decay - learning_rate_delta = (learning_rate - final_learning_rate) / epochs + learning_rate_delta = (learning_rate - final_learning_rate)/epochs # params @@ -122,7 +122,7 @@ def train_model(savedir): # for subsampling each sentence threshold = 1e-5 - p_drop = 1 - np.sqrt(threshold / p_neg) + p_drop = 1 - np.sqrt(threshold/p_neg) # train the model @@ -137,9 +137,7 @@ def train_model(savedir): t0 = datetime.now() for sentence in sentences: # keep only certain words based on p_neg - sentence = [w for w in sentence \ - if np.random.random() < (1 - p_drop[w]) - ] + sentence = [w for w in sentence if np.random.random()<(1 - p_drop[w])] 
if len(sentence) < 2: continue @@ -170,14 +168,14 @@ counter += 1 if counter % 100 == 0: - sys.stdout.write("processed %s / %s\r" % (counter, len(sentences))) + sys.stdout.write(f"processed {counter}/{len(sentences)}\r") sys.stdout.flush() # break # print stuff so we don't stare at a blank screen dt = datetime.now() - t0 - print("epoch complete:", epoch, "cost:", cost, "dt:", dt) + print(f"epoch complete: {epoch}, cost: {cost}, dt: {dt}") # save the cost costs.append(cost) @@ -195,10 +193,10 @@ if not os.path.exists(savedir): os.mkdir(savedir) - with open('%s/word2idx.json' % savedir, 'w') as f: + with open(f'{savedir}/word2idx.json', 'w') as f: json.dump(word2idx, f) - np.savez('%s/weights.npz' % savedir, W, V) + np.savez(f'{savedir}/weights.npz', W, V) # return the model return word2idx, W, V @@ -220,7 +218,7 @@ def get_negative_sampling_distribution(sentences, vocab_size): p_neg = word_freq**0.75 # normalize it - p_neg = p_neg / p_neg.sum() + p_neg = p_neg/p_neg.sum() assert(np.all(p_neg > 0)) return p_neg @@ -259,12 +257,12 @@ def sgd(input_, targets, label, learning_rate, W, V): W[input_] -= learning_rate*gW # D # return cost (binary cross entropy) - cost = label * np.log(prob + 1e-10) + (1 - label) * np.log(1 - prob + 1e-10) + cost = label*np.log(prob + 1e-10) + (1 - label)*np.log(1 - prob + 1e-10) return cost.sum() def load_model(savedir): - with open('%s/word2idx.json' % savedir) as f: + with open(f'{savedir}/word2idx.json') as f: word2idx = json.load(f) npz = np.load('%s/weights.npz' % savedir) W = npz['arr_0'] @@ -277,7 +275,7 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): V, D = W.shape # don't actually use pos2 in calculation, just print what's expected - print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2)) + print(f"testing: {pos1} - {neg1} = {pos2} - {neg2}") for w in (pos1, neg1, pos2, neg2): if w not in word2idx: print("Sorry, %s not in word2idx" % w) @@ -303,12 +301,12 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): break # print("best_idx:", best_idx) - print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[best_idx], neg2)) + print(f"got: {pos1} - {neg1} = {idx2word[best_idx]} - {neg2}") print("closest 10:") for i in idx: print(idx2word[i], distances[i]) - print("dist to %s:" % pos2, cos_dist(p2, vec)) + print(f"dist to {pos2}: {cos_dist(p2, vec)}") def test_model(word2idx, W, V): diff --git a/nlp_class2/word2vec_tf.py b/nlp_class2/word2vec_tf.py index d272b003..2d53ab22 100644 --- a/nlp_class2/word2vec_tf.py +++ b/nlp_class2/word2vec_tf.py @@ -47,23 +47,23 @@ def download_text8(dst): pass -def get_text8(): - # download the data if it is not yet in the right place - path = '../large_files/text8' - if not os.path.exists(path): - download_text8(path) - - words = open(path).read() - word2idx = {} - sents = [[]] - count = 0 - for word in words.split(): - if word not in word2idx: - word2idx[word] = count - count += 1 - sents[0].append(word2idx[word]) - print("count:", count) - return sents, word2idx +# def get_text8(): +# # download the data if it is not yet in the right place +# path = '../large_files/text8' +# if not os.path.exists(path): +# download_text8(path) + +# words = open(path).read() +# word2idx = {} +# sents = [[]] +# count = 0 +# for word in words.split(): +# if word not in word2idx: +# word2idx[word] = count +# count += 1 +# sents[0].append(word2idx[word]) +# print("count:", count) +# return sents, word2idx def get_wiki(): @@ -71,7 +71,7 @@ def get_wiki(): files = 
glob('../large_files/enwiki*.txt') all_word_counts = {} for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -90,7 +90,7 @@ def get_wiki(): sents = [] for f in files: - for line in open(f): + for line in open(f, encoding='utf-8'): if line and line[0] not in '[*-|=\{\}': s = remove_punctuation(line).lower().split() if len(s) > 1: @@ -122,7 +122,7 @@ def train_model(savedir): D = 50 # word embedding size # learning rate decay - learning_rate_delta = (learning_rate - final_learning_rate) / epochs + learning_rate_delta = (learning_rate - final_learning_rate)/epochs # distribution for drawing negative samples p_neg = get_negative_sampling_distribution(sentences) @@ -202,7 +202,7 @@ def dot(A, B): # for subsampling each sentence threshold = 1e-5 - p_drop = 1 - np.sqrt(threshold / p_neg) + p_drop = 1 - np.sqrt(threshold/p_neg) # train the model @@ -221,9 +221,7 @@ def dot(A, B): for sentence in sentences: # keep only certain words based on p_neg - sentence = [w for w in sentence \ - if np.random.random() < (1 - p_drop[w]) - ] + sentence = [w for w in sentence if np.random.random() < (1 - p_drop[w])] if len(sentence) < 2: continue @@ -282,14 +280,14 @@ def dot(A, B): counter += 1 if counter % 100 == 0: - sys.stdout.write("processed %s / %s\r" % (counter, len(sentences))) + sys.stdout.write(f"processed {counter}/{len(sentences)}\r") sys.stdout.flush() # break # print stuff so we don't stare at a blank screen dt = datetime.now() - t0 - print("epoch complete:", epoch, "cost:", cost, "dt:", dt) + print(f"epoch complete: {epoch}, cost: {cost}, dt: {dt}") # save the cost costs.append(cost) @@ -310,10 +308,10 @@ def dot(A, B): if not os.path.exists(savedir): os.mkdir(savedir) - with open('%s/word2idx.json' % savedir, 'w') as f: + with open(f'{savedir}/word2idx.json', 'w') as f: json.dump(word2idx, f) - np.savez('%s/weights.npz' % savedir, W, V) + np.savez(f'{savedir}/weights.npz', W, V) # return the model return word2idx, W, V @@ -341,7 +339,7 @@ def get_negative_sampling_distribution(sentences): p_neg[j] = word_freq[j]**0.75 # normalize it - p_neg = p_neg / p_neg.sum() + p_neg = p_neg/p_neg.sum() assert(np.all(p_neg > 0)) return p_neg @@ -366,9 +364,9 @@ def get_context(pos, sentence, window_size): def load_model(savedir): - with open('%s/word2idx.json' % savedir) as f: + with open(f'{savedir}/word2idx.json') as f: word2idx = json.load(f) - npz = np.load('%s/weights.npz' % savedir) + npz = np.load(f'{savedir}/weights.npz') W = npz['arr_0'] V = npz['arr_1'] return word2idx, W, V @@ -379,10 +377,10 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): V, D = W.shape # don't actually use pos2 in calculation, just print what's expected - print("testing: %s - %s = %s - %s" % (pos1, neg1, pos2, neg2)) + print(f"testing: {pos1} - {neg1} = {pos2} - {neg2}") for w in (pos1, neg1, pos2, neg2): if w not in word2idx: - print("Sorry, %s not in word2idx" % w) + print(f"Sorry, {w} not in word2idx") return p1 = W[word2idx[pos1]] @@ -403,12 +401,12 @@ def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W): best_idx = i break - print("got: %s - %s = %s - %s" % (pos1, neg1, idx2word[idx[0]], neg2)) + print(f"got: {pos1} - {neg1} = {idx2word[idx[0]]} - {neg2}" ) print("closest 10:") for i in idx: print(idx2word[i], distances[i]) - print("dist to %s:" % pos2, cos_dist(p2, vec)) + print(f"dist to {pos2}: {cos_dist(p2, vec)}") def test_model(word2idx, W, V): diff --git 
a/recommenders/autorec.py b/recommenders/autorec.py index fa0bd415..9d044099 100644 --- a/recommenders/autorec.py +++ b/recommenders/autorec.py @@ -5,17 +5,17 @@ # Note: you may need to update your version of future # sudo pip install -U future -import numpy as np -import pandas as pd +#import numpy as np +#import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from scipy.sparse import save_npz, load_npz +from scipy.sparse import load_npz -import keras.backend as K -from keras.models import Model -from keras.layers import Input, Dropout, Dense -from keras.regularizers import l2 -from keras.optimizers import SGD +import tensorflow.keras.backend as K #type:ignore +from tensorflow.keras.models import Model #type:ignore +from tensorflow.keras.layers import Input, Dropout, Dense #type:ignore +from tensorflow.keras.regularizers import l2 #type:ignore +from tensorflow.keras.optimizers import SGD #type:ignore # config batch_size = 128 @@ -23,8 +23,8 @@ reg = 0.0001 # reg = 0 -A = load_npz("Atrain.npz") -A_test = load_npz("Atest.npz") +A = load_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz") +A_test = load_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz") mask = (A > 0) * 1.0 mask_test = (A_test > 0) * 1.0 @@ -56,6 +56,8 @@ def custom_loss(y_true, y_pred): mask = K.cast(K.not_equal(y_true, 0), dtype='float32') + y_true = K.cast(y_true, dtype='float32') + y_pred = K.cast(y_pred, dtype='float32') diff = y_pred - y_true sqdiff = diff * diff * mask sse = K.sum(K.sum(sqdiff)) @@ -96,7 +98,7 @@ def test_generator(A, M, A_test, M_test): model = Model(i, x) model.compile( loss=custom_loss, - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), # optimizer='adam', metrics=[custom_loss], ) diff --git a/recommenders/itembased.py b/recommenders/itembased.py index f87f9481..ff5619a8 100644 --- a/recommenders/itembased.py +++ b/recommenders/itembased.py @@ -1,37 +1,37 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd -import matplotlib.pyplot as plt +#import pandas as pd +#import matplotlib.pyplot as plt from sklearn.utils import shuffle -from datetime import datetime +#from datetime import datetime from sortedcontainers import SortedList # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with 
open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) diff --git a/recommenders/mf2.py b/recommenders/mf2.py index 62b599c6..bcf54b54 100644 --- a/recommenders/mf2.py +++ b/recommenders/mf2.py @@ -1,13 +1,13 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd +#import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle from datetime import datetime @@ -15,23 +15,23 @@ # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) diff --git a/recommenders/mf_keras.py b/recommenders/mf_keras.py index efc3315b..5f8ea4ad 100644 --- a/recommenders/mf_keras.py +++ b/recommenders/mf_keras.py @@ -1,23 +1,23 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from keras.models import Model -from keras.layers import Input, Embedding, Dot, Add, Flatten -from keras.regularizers import l2 -from keras.optimizers import SGD, Adam +from tensorflow.keras.models import Model #type:ignore +from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten #type:ignore +from tensorflow.keras.regularizers import l2 #type:ignore +from tensorflow.keras.optimizers import SGD #type:ignore # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -71,7 +71,7 @@ loss='mse', # 
optimizer='adam', # optimizer=Adam(lr=0.01), - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), metrics=['mse'], ) diff --git a/recommenders/mf_keras_deep.py b/recommenders/mf_keras_deep.py index f3888a7a..b22c4abb 100644 --- a/recommenders/mf_keras_deep.py +++ b/recommenders/mf_keras_deep.py @@ -1,24 +1,24 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.utils import shuffle -from keras.models import Model -from keras.layers import Input, Embedding, Flatten, Dense, Concatenate -from keras.layers import Dropout, BatchNormalization, Activation -from keras.regularizers import l2 -from keras.optimizers import SGD, Adam +from tensorflow.keras.models import Model # type:ignore +from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate # type:ignore +from tensorflow.keras.layers import Dropout, BatchNormalization, Activation # type:ignore +#from tensorflow.keras.regularizers import l2 +from tensorflow.keras.optimizers import SGD#, Adam # type:ignore # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -47,12 +47,12 @@ # the neural network x = Dense(400)(x) -# x = BatchNormalization()(x) +x = BatchNormalization()(x) +x = Activation('relu')(x) +x = Dropout(0.5)(x) +x = Dense(100)(x) +x = BatchNormalization()(x) x = Activation('relu')(x) -# x = Dropout(0.5)(x) -# x = Dense(100)(x) -# x = BatchNormalization()(x) -# x = Activation('relu')(x) x = Dense(1)(x) model = Model(inputs=[u, m], outputs=x) @@ -60,7 +60,7 @@ loss='mse', # optimizer='adam', # optimizer=Adam(lr=0.01), - optimizer=SGD(lr=0.08, momentum=0.9), + optimizer=SGD(learning_rate=0.08, momentum=0.9), metrics=['mse'], ) diff --git a/recommenders/preprocess.py b/recommenders/preprocess.py index 72585460..9e8d19ef 100644 --- a/recommenders/preprocess.py +++ b/recommenders/preprocess.py @@ -1,14 +1,14 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future import pandas as pd # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\rating.csv') @@ -34,8 +34,9 @@ # add them to the data frame # takes awhile -df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1) +#df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1) +df['movie_idx'] = df.movieId.map(movie2idx) df = df.drop(columns=['timestamp']) -df.to_csv('../large_files/movielens-20m-dataset/edited_rating.csv', index=False) \ No newline at end of file +df.to_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv', index=False) \ No newline at end of file diff --git a/recommenders/preprocess2dict.py b/recommenders/preprocess2dict.py index 
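The preprocess.py hunk above swaps a row-wise DataFrame.apply for Series.map, which performs the same movieId-to-index lookup vectorized. A tiny self-contained sketch with made-up ids:

import pandas as pd

df = pd.DataFrame({'movieId': [10, 20, 20, 99], 'rating': [4.0, 3.5, 5.0, 2.0]})
movie2idx = {mid: i for i, mid in enumerate(df.movieId.unique())}  # 10->0, 20->1, 99->2

# old (slow): df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1)
df['movie_idx'] = df.movieId.map(movie2idx)   # vectorized dictionary lookup
print(df)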
2ed5d8b7..e019cde4 100644 --- a/recommenders/preprocess2dict.py +++ b/recommenders/preprocess2dict.py @@ -1,19 +1,19 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future import pickle -import numpy as np +#import numpy as np import pandas as pd -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt from sklearn.utils import shuffle # load in the data # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/very_small_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\small_rating.csv') N = df.userId.max() + 1 # number of users M = df.movie_idx.max() + 1 # number of movies @@ -25,58 +25,62 @@ df_test = df.iloc[cutoff:] # a dictionary to tell us which users have rated which movies -user2movie = {} +user2movie = df_train.groupby('userId').movie_idx.agg(list).to_dict() # a dicationary to tell us which movies have been rated by which users -movie2user = {} +movie2user = df_train.groupby('movie_idx').userId.agg(list).to_dict() # a dictionary to look up ratings -usermovie2rating = {} -print("Calling: update_user2movie_and_movie2user") -count = 0 -def update_user2movie_and_movie2user(row): - global count - count += 1 - if count % 100000 == 0: - print("processed: %.3f" % (float(count)/cutoff)) - - i = int(row.userId) - j = int(row.movie_idx) - if i not in user2movie: - user2movie[i] = [j] - else: - user2movie[i].append(j) - - if j not in movie2user: - movie2user[j] = [i] - else: - movie2user[j].append(i) - - usermovie2rating[(i,j)] = row.rating -df_train.apply(update_user2movie_and_movie2user, axis=1) +user_movie_keys = zip(df_train.userId, df_train.movie_idx) +usermovie2rating = pd.Series(df_train.rating.values, index=user_movie_keys).to_dict() + +# print("Calling: update_user2movie_and_movie2user") +# count = 0 +# def update_user2movie_and_movie2user(row): +# global count +# count += 1 +# if count % 100000 == 0: +# print("processed: %.3f" % (float(count)/cutoff)) + +# i = int(row.userId) +# j = int(row.movie_idx) +# if i not in user2movie: +# user2movie[i] = [j] +# else: +# user2movie[i].append(j) + +# if j not in movie2user: +# movie2user[j] = [i] +# else: +# movie2user[j].append(i) + +# usermovie2rating[(i,j)] = row.rating +#df_train.apply(update_user2movie_and_movie2user, axis=1) # test ratings dictionary -usermovie2rating_test = {} -print("Calling: update_usermovie2rating_test") -count = 0 -def update_usermovie2rating_test(row): - global count - count += 1 - if count % 100000 == 0: - print("processed: %.3f" % (float(count)/len(df_test))) - - i = int(row.userId) - j = int(row.movie_idx) - usermovie2rating_test[(i,j)] = row.rating -df_test.apply(update_usermovie2rating_test, axis=1) +user_movie_keys_test = zip(df_test.userId, df_test.movie_idx) +usermovie2rating_test = pd.Series(df_test.rating.values, index=user_movie_keys_test).to_dict() + +# print("Calling: update_usermovie2rating_test") +# count = 0 +# def update_usermovie2rating_test(row): +# global count +# count += 1 +# if count % 100000 == 0: +# print("processed: %.3f" % (float(count)/len(df_test))) + +# i = int(row.userId) +# j = int(row.movie_idx) +# usermovie2rating_test[(i,j)] = row.rating +# df_test.apply(update_usermovie2rating_test, axis=1) # note: these are not really JSONs -with 
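The preprocess2dict.py hunk above replaces the row-by-row dictionary builders with groupby/zip one-liners. A sketch of the same construction on a made-up five-row frame (the values in the comments follow from that toy data, not from MovieLens):

import pandas as pd

df_train = pd.DataFrame({
    'userId':    [0, 0, 1, 1, 2],
    'movie_idx': [0, 1, 0, 2, 1],
    'rating':    [5.0, 3.0, 4.0, 2.0, 4.5],
})

# which movies each user rated, and which users rated each movie
user2movie = df_train.groupby('userId').movie_idx.agg(list).to_dict()
movie2user = df_train.groupby('movie_idx').userId.agg(list).to_dict()

# (user, movie) -> rating lookup
keys = list(zip(df_train.userId, df_train.movie_idx))   # (user, movie) pairs
usermovie2rating = pd.Series(df_train.rating.values, index=keys).to_dict()

print(user2movie)                 # user 0 -> [0, 1], user 1 -> [0, 2], user 2 -> [1]
print(usermovie2rating[(0, 1)])   # 3.0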
open('user2movie.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'wb') as f: pickle.dump(user2movie, f) -with open('movie2user.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'wb') as f: pickle.dump(movie2user, f) -with open('usermovie2rating.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'wb') as f: pickle.dump(usermovie2rating, f) -with open('usermovie2rating_test.json', 'wb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'wb') as f: pickle.dump(usermovie2rating_test, f) diff --git a/recommenders/preprocess2sparse.py b/recommenders/preprocess2sparse.py index 864de56d..fdc525a7 100644 --- a/recommenders/preprocess2sparse.py +++ b/recommenders/preprocess2sparse.py @@ -5,14 +5,14 @@ # Note: you may need to update your version of future # sudo pip install -U future -import numpy as np +#import numpy as np import pandas as pd -import matplotlib.pyplot as plt +#import matplotlib.pyplot as plt from sklearn.utils import shuffle -from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz +from scipy.sparse import lil_matrix, save_npz # load in the data -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') # df = pd.read_csv('../large_files/movielens-20m-dataset/small_rating.csv') N = df.userId.max() + 1 # number of users @@ -41,7 +41,7 @@ def update_train(row): # mask, to tell us which entries exist and which do not A = A.tocsr() mask = (A > 0) -save_npz("Atrain.npz", A) +save_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz", A) # test ratings dictionary A_test = lil_matrix((N, M)) @@ -59,4 +59,4 @@ def update_test(row): df_test.apply(update_test, axis=1) A_test = A_test.tocsr() mask_test = (A_test > 0) -save_npz("Atest.npz", A_test) +save_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz", A_test) diff --git a/recommenders/preprocess_shrink.py b/recommenders/preprocess_shrink.py index 665a80e6..e7aa5b87 100644 --- a/recommenders/preprocess_shrink.py +++ b/recommenders/preprocess_shrink.py @@ -1,18 +1,18 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +#from builtins import range, input # Note: you may need to update your version of future # sudo pip install -U future -import pickle -import numpy as np +#import pickle +#import numpy as np import pandas as pd from collections import Counter # load in the data # https://www.kaggle.com/grouplens/movielens-20m-dataset -df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv') +df = pd.read_csv('.\\large_files\\movielens-20m-dataset\\edited_rating.csv') print("original dataframe size:", len(df)) N = df.userId.max() + 1 # number of users @@ -25,8 +25,8 @@ n = 10000 m = 2000 -user_ids = [u for u, c in user_ids_count.most_common(n)] -movie_ids = [m for m, c in movie_ids_count.most_common(m)] +user_ids = [u for u, _ in user_ids_count.most_common(n)] +movie_ids = [m for m, _ in movie_ids_count.most_common(m)] # make a copy, otherwise ids won't be overwritten df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy() @@ -55,4 +55,4 @@ print("max movie id:", df_small.movie_idx.max()) print("small dataframe size:", len(df_small)) -df_small.to_csv('../large_files/movielens-20m-dataset/small_rating.csv', 
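The preprocess2sparse.py hunk above builds the rating matrices as lil_matrix, converts them to CSR, and writes them with save_npz. A minimal sketch of that flow with toy sizes and a hypothetical filename:

from scipy.sparse import lil_matrix, save_npz, load_npz

N, M = 3, 4                                          # users x movies (toy sizes)
ratings = [(0, 1, 5.0), (1, 0, 3.0), (2, 3, 4.5)]    # (user, movie, rating)

A = lil_matrix((N, M))
for i, j, r in ratings:
    A[i, j] = r                  # lil_matrix supports cheap incremental writes

A = A.tocsr()                    # CSR is better for arithmetic and slicing
save_npz("Atrain_toy.npz", A)    # hypothetical filename, not the repo's path

B = load_npz("Atrain_toy.npz")
mask = (B > 0) * 1.0             # 1.0 where a rating exists, 0 elsewhere
print(mask.toarray())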
index=False) +df_small.to_csv('.\\large_files\\movielens-20m-dataset\\small_rating.csv', index=False) diff --git a/recommenders/rbm_tf_k_faster.py b/recommenders/rbm_tf_k_faster.py index 9a1a242a..71b9666b 100644 --- a/recommenders/rbm_tf_k_faster.py +++ b/recommenders/rbm_tf_k_faster.py @@ -10,8 +10,8 @@ import matplotlib.pyplot as plt from sklearn.utils import shuffle -import pandas as pd -from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz +#import pandas as pd +from scipy.sparse import load_npz from datetime import datetime if tf.__version__.startswith('2'): @@ -33,6 +33,7 @@ def dot2(H, W): class RBM(object): def __init__(self, D, M, K): + super().__init__() self.D = D # input feature size self.M = M # hidden size self.K = K # number of ratings @@ -115,7 +116,7 @@ def build(self, D, M, K): self.session.run(initop) def fit(self, X, X_test, epochs=10, batch_sz=256, show_fig=True): - N, D = X.shape + N, _ = X.shape n_batches = N // batch_sz @@ -134,7 +135,7 @@ def fit(self, X, X_test, epochs=10, batch_sz=256, show_fig=True): ) if j % 100 == 0: - print("j / n_batches:", j, "/", n_batches, "cost:", c) + print(f"j / n_batches: {j}/{n_batches}", "cost: ",c) print("duration:", datetime.now() - t0) # calculate the true train and test cost @@ -209,10 +210,10 @@ def get_sse(self, X, Xt): def main(): - A = load_npz("Atrain.npz") - A_test = load_npz("Atest.npz") + A = load_npz(".\\large_files\\movielens-20m-dataset\\Atrain.npz") + A_test = load_npz(".\\large_files\\movielens-20m-dataset\\Atest.npz") - N, M = A.shape + _, M = A.shape rbm = RBM(M, 50, 10) rbm.fit(A, A_test) diff --git a/recommenders/spark.py b/recommenders/spark.py index 23ea365a..33154899 100644 --- a/recommenders/spark.py +++ b/recommenders/spark.py @@ -9,11 +9,14 @@ # tmp = p.take(5) # print(tmp) -from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating -import os +from pyspark.mllib.recommendation import ALS, Rating +from pyspark import SparkContext +#import os # load in the data -data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv") +sc = SparkContext('local', 'random') +data = sc.textFile(".\\large_files\\movielens-20m-dataset\\small_rating.csv") +#'/mnt/c/Users/Saif/Downloads/personal/Udemy_labs/nlp/machine_learning_examples/large_files/movielens-20m-dataset//small_ratings.csv' # filter out header header = data.first() #extract header diff --git a/recommenders/spark2.py b/recommenders/spark2.py index 5879269d..8310c69b 100644 --- a/recommenders/spark2.py +++ b/recommenders/spark2.py @@ -7,7 +7,7 @@ # tmp = p.take(5) # print(tmp) -from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating +from pyspark.mllib.recommendation import ALS, Rating from pyspark import SparkContext # increase memory @@ -18,8 +18,8 @@ # load in the data -# data = sc.textFile("../large_files/movielens-20m-dataset/small_rating.csv") -data = sc.textFile("../large_files/movielens-20m-dataset/rating.csv.gz") +data = sc.textFile("/mnt/c/Users/Saif/Downloads/personal/Udemy_labs/nlp/machine_learning_examples/large_files/movielens-20m-dataset/rating.csv") +#data = sc.textFile(".\\large_files\\movielens-20m-dataset\\rating.csv.gz") # filter out header header = data.first() #extract header diff --git a/recommenders/tfidf.py b/recommenders/tfidf.py index a6078ec3..7c380205 100644 --- a/recommenders/tfidf.py +++ b/recommenders/tfidf.py @@ -2,12 +2,12 @@ import json from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity, 
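The spark.py hunk above now creates its own SparkContext and reads the ratings CSV directly; the training step is not shown in the hunk, so the sketch below is an assumption about how the parsed Rating RDD would typically be fed to MLlib's ALS. It needs a working Spark/Java installation to run, and the rank and iteration counts are arbitrary.

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating

sc = SparkContext('local', 'als-sketch')

# in the script these lines come from sc.textFile(<ratings csv>) with the header removed
lines = sc.parallelize(["0,1,5.0", "0,2,3.0", "1,1,4.0", "2,3,2.5"])
ratings = lines.map(lambda l: l.split(',')) \
               .map(lambda t: Rating(int(t[0]), int(t[1]), float(t[2])))

model = ALS.train(ratings, rank=10, iterations=5)
print(model.predict(0, 2))       # predicted rating for (user 0, movie 2)

sc.stop()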
euclidean_distances +from sklearn.metrics.pairwise import cosine_similarity # get the data from: https://www.kaggle.com/tmdb/tmdb-movie-metadata # load in the data -df = pd.read_csv('../large_files/tmdb_5000_movies.csv') +df = pd.read_csv('.\\large_files\\tmdb_5000_movies.csv') # convert the relevant data for each movie into a single string @@ -59,7 +59,7 @@ def recommend(title): recommended_idx = (-scores).argsort()[1:6] # return the titles of the recommendations - return df['title'].iloc[recommended_idx] + return df['title'].iloc[recommended_idx].values print("\nRecommendations for 'Scream 3':") diff --git a/recommenders/userbased.py b/recommenders/userbased.py index b512a722..07e84489 100644 --- a/recommenders/userbased.py +++ b/recommenders/userbased.py @@ -1,44 +1,44 @@ # https://udemy.com/recommender-systems # https://deeplearningcourses.com/recommender-systems from __future__ import print_function, division -from builtins import range, input +from builtins import range#, input # Note: you may need to update your version of future # sudo pip install -U future import pickle import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from sklearn.utils import shuffle -from datetime import datetime +#import pandas as pd +#import matplotlib.pyplot as plt +#from sklearn.utils import shuffle +#from datetime import datetime from sortedcontainers import SortedList # load in the data import os -if not os.path.exists('user2movie.json') or \ - not os.path.exists('movie2user.json') or \ - not os.path.exists('usermovie2rating.json') or \ - not os.path.exists('usermovie2rating_test.json'): +if not os.path.exists('.\\large_files\\movielens-20m-dataset\\user2movie.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\movie2user.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json') or \ + not os.path.exists('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json'): import preprocess2dict -with open('user2movie.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\user2movie.json', 'rb') as f: user2movie = pickle.load(f) -with open('movie2user.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\movie2user.json', 'rb') as f: movie2user = pickle.load(f) -with open('usermovie2rating.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating.json', 'rb') as f: usermovie2rating = pickle.load(f) -with open('usermovie2rating_test.json', 'rb') as f: +with open('.\\large_files\\movielens-20m-dataset\\usermovie2rating_test.json', 'rb') as f: usermovie2rating_test = pickle.load(f) N = np.max(list(user2movie.keys())) + 1 # the test set may contain movies the train set doesn't have data on m1 = np.max(list(movie2user.keys())) -m2 = np.max([m for (u, m), r in usermovie2rating_test.items()]) +m2 = np.max([m for (_, m), _ in usermovie2rating_test.items()]) M = max(m1, m2) + 1 print("N:", N, "M:", M) diff --git a/rnn_class/util.py b/rnn_class/util.py index 54801efa..5aa98a66 100644 --- a/rnn_class/util.py +++ b/rnn_class/util.py @@ -118,7 +118,7 @@ def get_wikipedia_data(n_files, n_vocab, by_paragraph=False): for f in input_files: print("reading:", f) - for line in open(prefix + f): + for line in open(prefix + f, encoding='utf-8'): line = line.strip() # don't count headers, structured data, lists, etc... if line and line[0] not in ('[', '*', '-', '|', '=', '{', '}'):
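The tfidf.py hunk above drops the unused euclidean_distances import and returns .values from recommend() so the titles print as a plain array. A self-contained sketch of the same TF-IDF plus cosine-similarity lookup, using three made-up "movies" instead of the TMDB file:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame({
    'title':  ['Scream', 'Scream 2', 'Toy Story'],
    'string': ['horror slasher ghostface',
               'horror slasher sequel ghostface',
               'animation toys pixar'],
})

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['string'])            # (n_movies, n_terms) sparse matrix

query = X[0]                                     # row for 'Scream'
scores = cosine_similarity(query, X).flatten()   # similarity to every movie
recommended_idx = (-scores).argsort()[1:3]       # skip the movie itself, take the next best
print(df['title'].iloc[recommended_idx].values)  # ['Scream 2' 'Toy Story']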