utils.py
import os
import re
import shutil
import string
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing import text_dataset_from_directory
## Utilities related to NLP training
# Number of vocabulary tokens considered
vocabularies = 10000
# Cap on the number of words kept from each review
lengthCapOfWords = 250
# Embedding vector / feature representation size
embeddingDimension = 500
# Path the model weights are saved to and loaded from
checkpoint_path = "models/group53_NLP_model/group53_nlp_model.ckpt"
# Returns a TextVectorization layer that tokenizes words after applying
# the preprocessing predicates
def getVectorizationPredicates():
    wordVectorizePredicates = TextVectorization(
        standardize=getWordPreprocessingPredicates,
        max_tokens=vocabularies,
        output_mode='int',
        output_sequence_length=lengthCapOfWords)
    return wordVectorizePredicates
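# A TextVectorization layer must build its vocabulary before use. The helper
# below is an illustrative sketch (not part of the original module) showing
# one way to adapt the layer on a labelled tf.data dataset.
def adaptVectorizationLayer(vectorizeLayer, trainDataset):
    # Strip the labels so adapt() sees only the raw review strings
    textOnlyDataset = trainDataset.map(lambda text, label: text)
    vectorizeLayer.adapt(textOnlyDataset)
    return vectorizeLayer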
# Returns the generic model chosen for this task
def getModel():
    SentimentModel = Sequential([
        getVectorizationPredicates(),
        Embedding(vocabularies, embeddingDimension, name="embedding"),
        Conv1D(filters=40, kernel_size=3, padding='same', activation='relu'),
        Dropout(0.2),
        Flatten(),
        Dense(20, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    SentimentModel.compile(optimizer='adam',
                           loss=BinaryCrossentropy(from_logits=False),
                           metrics=['accuracy'])
    return SentimentModel
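# Minimal training sketch, assuming datasets built with
# text_dataset_from_directory (see loadDatasetsSketch below). The
# ModelCheckpoint callback saves weights to checkpoint_path; the epoch
# count is an illustrative assumption, not the original setting.
def trainModelSketch(model, trainDataset, validationDataset, epochs=10):
    checkpointCallback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        save_best_only=True)
    history = model.fit(trainDataset,
                        validation_data=validationDataset,
                        epochs=epochs,
                        callbacks=[checkpointCallback])
    return history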
# Downloads and extracts the IMDB dataset, returning the extracted path
def getIMDBData():
    url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz",
                                      url,
                                      untar=True,
                                      cache_dir='.',
                                      cache_subdir='')
    return os.path.join(os.path.dirname(dataset), 'aclImdb')
# Removes the unlabelled 'unsup' directory, which is not needed for
# supervised training
def removeRedundantDirectory():
    trainDir = os.path.join('aclImdb', 'train')
    unsupDir = os.path.join(trainDir, 'unsup')
    if os.path.isdir(unsupDir):
        shutil.rmtree(unsupDir)
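# Hedged sketch of turning the extracted IMDB folders into labelled tf.data
# datasets with text_dataset_from_directory (imported above). The batch size,
# validation split, and seed are illustrative assumptions.
def loadDatasetsSketch(batchSize=32, seed=42):
    trainDataset = text_dataset_from_directory(
        'aclImdb/train',
        batch_size=batchSize,
        validation_split=0.2,
        subset='training',
        seed=seed)
    validationDataset = text_dataset_from_directory(
        'aclImdb/train',
        batch_size=batchSize,
        validation_split=0.2,
        subset='validation',
        seed=seed)
    testDataset = text_dataset_from_directory(
        'aclImdb/test',
        batch_size=batchSize)
    return trainDataset, validationDataset, testDataset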
# Returns the lowercased string
def makeTextsLowerCase(inputData):
    return tf.strings.lower(inputData)
# Unnecessary <br /> tags are observed in the dataset
# Returns strings after removing HTML <br /> tags
def removeHTMLTags(inputData):
    return tf.strings.regex_replace(inputData, '<br />', ' ')
# Returns strings after removing punctuation
def removePunctuations(inputData):
    return tf.strings.regex_replace(inputData,
                                    '[%s]' % re.escape(string.punctuation),
                                    '')
# Returns the rules/predicates for word preprocessing. HTML tags must be
# stripped before punctuation: removing punctuation first turns '<br />'
# into 'br ', so the tag regex would never match.
def getWordPreprocessingPredicates(inputData):
    processedData = makeTextsLowerCase(inputData)
    processedData = removeHTMLTags(processedData)
    processedData = removePunctuations(processedData)
    return processedData
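# Quick illustrative check of the pipeline (the sample review is made up):
# getWordPreprocessingPredicates(tf.constant("Great movie!<br />Loved it."))
# -> tf.Tensor(b'great movie loved it', shape=(), dtype=string)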
# Plots model accuracy vs. epochs
def plotAccuracyVsEpochs(history):
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# Plots model loss vs. epochs
def plotLossVsEpochs(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
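# End-to-end usage sketch tying the utilities together. The ordering is an
# assumed workflow, not the original training script.
def runTrainingSketch():
    getIMDBData()
    removeRedundantDirectory()
    trainDataset, validationDataset, _ = loadDatasetsSketch()
    model = getModel()
    # The model's first layer is the TextVectorization layer; adapt it on
    # the raw training text before fitting
    adaptVectorizationLayer(model.layers[0], trainDataset)
    history = trainModelSketch(model, trainDataset, validationDataset)
    plotAccuracyVsEpochs(history)
    plotLossVsEpochs(history)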