```python
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializations

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
```

```python
def clean_str(string):
    """
    Tokenization/string cleaning for the dataset; everything is lower cased.
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()
```
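As a quick sanity check (a hypothetical example, not part of the original script), the cleaner only strips backslashes and quote characters before lower-casing:

```python
# Hypothetical example, not in the original script.
print clean_str('He said: "Don\'t stop!"')   # -> he said: dont stop!
```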

```python
data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
print data_train.shape

texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx])
    texts.append(clean_str(text.get_text().encode('ascii', 'ignore')))
    labels.append(data_train.sentiment[idx])
```

```python
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
```
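For reference, `pad_sequences` zero-pads shorter reviews and truncates longer ones so every row of `data` has exactly `MAX_SEQUENCE_LENGTH` entries. A hypothetical toy example, not in the original:

```python
# Hypothetical toy example: shorter sequences are zero-padded on the left by default.
print pad_sequences([[3, 7, 1]], maxlen=5)   # [[0 0 3 7 1]]
```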

```python
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Training and validation set number of positive and negative reviews')
print y_train.sum(axis=0)
print y_val.sum(axis=0)
```

```python
GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index keep their random initialization
        embedding_matrix[i] = embedding_vector
```

```python
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(2, activation='softmax')(l_lstm)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Bidirectional LSTM")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50)
```

Attention GRU network. Here the bidirectional GRU returns the full sequence of hidden states, and a custom attention layer learns to weight and pool them into a single document vector before the softmax classifier.

```python
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        weights = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')

        weighted_input = x * weights.dimshuffle(0, 1, 'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])
```
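In terms of the GRU hidden states $h_t$ that `call` receives, the layer computes

$$e_t = \tanh(h_t \cdot W), \qquad \alpha_t = \frac{\exp(e_t)}{\sum_k \exp(e_k)}, \qquad s = \sum_t \alpha_t h_t,$$

so its output $s$ is an attention-weighted average of the hidden states, one vector per document.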

```python
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_gru = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_att = AttLayer()(l_gru)
preds = Dense(2, activation='softmax')(l_att)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
```

print("model fitting - attention GRU network") model.summary() model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=50)