#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 12 10:13:46 2023
@author: tarunvannelli
"""
import re

import nltk
import pandas as pd
import streamlit as st
from afinn import Afinn
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Download the NLTK POS tagger used by get_wordnet_pos below
nltk.download('averaged_perceptron_tagger')
# Load the Amazon product reviews dataset, drop reviews missing a title or
# body, and combine the two into a single text field
df = pd.read_csv("/Users/tarunvannelli/Amazon_Product_Reviews.csv")
df = df.dropna(subset=['title', 'body'])
df['text'] = df['title'] + ' ' + df['body']
# Find the first review that still contains HTML tags
i = 0
for sent in df['text'].values:
    if len(re.findall('<.*?>', sent)):
        print(i)
        print(sent)
        break
    i += 1
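# Note: this scan only reports the first review containing a tag; the angle
# brackets themselves are stripped by remove_special_chars below, though tag
# names (e.g. 'br') survive as plain tokens.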
# Convert text to lowercase
data = df['text'].apply(lambda x: x.lower())
# Remove special characters and punctuation
def remove_special_chars(text):
    # Keep only letters, digits, and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove any remaining punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text


data = data.apply(remove_special_chars)
# 'data' is a Series; convert it to a DataFrame so rows can be edited in place
df = data.to_frame()
for index, row in df.iterrows():
    # Filter out all words that contain a link
    words_without_links = [
        word for word in row.text.split() if 'http' not in word]
    df.at[index, 'text'] = ' '.join(words_without_links)
data = pd.Series(df['text'].values, index=df.index)
# Function for POS tagging
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
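# Example: pos_tag(['running']) tags the word VBG, so get_wordnet_pos returns
# wordnet.VERB and lemmatize('running', wordnet.VERB) yields 'run';
# unrecognized tags fall back to NOUN.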
# Tokenization
nltk.download('punkt')
nltk.download('stopwords')
# Build a set of English stopwords for fast membership tests
stop_words = set(stopwords.words('english'))
# Tokenize each review into individual words
data = data.apply(lambda x: word_tokenize(x))
# Remove stopwords
tokenized_data = data.apply(
    lambda x: [word for word in x if word not in stop_words])
# Lemmatization
nltk.download('omw-1.4')
nltk.download('wordnet')
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Lemmatize each word using its POS tag
lemmatized_data = tokenized_data.apply(
    lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])
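# Note: get_wordnet_pos calls pos_tag on one word at a time, which is slow on
# large corpora and ignores sentence context; tagging each token list in one
# pos_tag call would be faster and more accurate.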
# Separating positive and negative reviews
# Initialize AFINN
af = Afinn()
# Lists to store sentiment scores and sentiment categories
sentiment_scores = []
sentiment_categories = []
# Iterate over each review, score it, and classify it as positive, negative,
# or neutral based on its sentiment score
for review in lemmatized_data:
    if isinstance(review, list):  # join a list of tokens into a single string
        review = ' '.join(review)
    sentiment_score = af.score(review)
    sentiment_scores.append(sentiment_score)
    if sentiment_score > 0:
        sentiment_categories.append('positive')
    elif sentiment_score < 0:
        sentiment_categories.append('negative')
    else:
        sentiment_categories.append('neutral')
# count number of reviews in each sentiment category
positive_count = sentiment_categories.count('positive')
negative_count = sentiment_categories.count('negative')
neutral_count = sentiment_categories.count('neutral')
# print results
print('Positive reviews:', positive_count)
print('Negative reviews:', negative_count)
print('Neutral reviews:', neutral_count)
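# AFINN assigns each known word an integer valence from -5 to +5; af.score()
# sums those valences over the text, so a score of 0 means either no lexicon
# words matched or the positives and negatives cancelled out.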
# TF-IDF
# Join the lemmatized tokens of each document back into a single string
lm_data = lemmatized_data.apply(lambda x: ' '.join(x))
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(max_features=1500, min_df=8, stop_words='english')
# Fit the vectorizer to the preprocessed data
vectorizer.fit(lm_data)
# Create TF-IDF vectors for the preprocessed data
tfidf_vectors = vectorizer.transform(lm_data)
# Print the shape of the TF-IDF matrix
print(tfidf_vectors.shape)
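# max_features=1500 keeps the 1,500 highest-frequency terms and min_df=8
# drops terms appearing in fewer than 8 documents, so the matrix has at most
# 1,500 columns; fit() followed by transform() on the same data could also be
# collapsed into a single fit_transform() call.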
# Splitting data
# Build the feature matrix from the TF-IDF vectors
X = pd.DataFrame(tfidf_vectors.toarray(),
                 columns=vectorizer.get_feature_names_out())
# Encode the sentiment labels as integers
le = LabelEncoder()
y = le.fit_transform(sentiment_categories)
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# print the shapes of the train and test sets
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
# Balancing reviews with SMOTE
# Resample the training data
smote = SMOTE()
x_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Resample the test data
x_test_resampled, y_test_resampled = smote.fit_resample(X_test, y_test)
# Convert the resampled target vectors to pandas Series and check the class
# distribution of each
y_train_resampled_series = pd.Series(y_train_resampled)
print(y_train_resampled_series.value_counts())
y_test_resampled_series = pd.Series(y_test_resampled)
print(y_test_resampled_series.value_counts())
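# Caveat: SMOTE is normally applied only to the training split; oversampling
# the test split changes the distribution the model is evaluated on, so the
# metrics below reflect balanced rather than real-world class frequencies.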
# Model building
# Logistic regression with the chosen hyperparameters: C=10 applies weak
# regularization, penalty='l1' encourages sparse coefficients, and the
# liblinear solver supports l1 with one-vs-rest multiclass
lr_model = LogisticRegression(
    C=10, multi_class='ovr', penalty='l1', solver='liblinear')
# Fit the model and evaluate it on the (resampled) test set
lr = lr_model.fit(x_train_resampled, y_train_resampled)
y_pred_lr = lr.predict(x_test_resampled)
print(classification_report(y_test_resampled, y_pred_lr))
st.title(":red[Sentiment Analysis App] :smile: :neutral_face: :disappointed:")
st.write("Text to Analyze")
txt = st.text_area(
    'Enter Text', '''Hi I am really excited to present how good this sentiment analyzer works''')
# Map a sentiment label to an emoji
def emoji_pattern(senti):
    if senti == 'positive':
        custom_emoji = "😄"
    elif senti == 'negative':
        custom_emoji = '😞'
    else:  # neutral
        custom_emoji = '😐'
    return custom_emoji
# Preprocess the input text the same way as the training data, predict a
# sentiment for every sentence, and return the majority sentiment
def run_sentiment_analysis(txt):
    sentences = nltk.sent_tokenize(txt)
    corpus = []
    for i in range(len(sentences)):
        review = re.sub('[^a-zA-Z]', ' ', sentences[i])
        review = review.lower()
        review = [lemmatizer.lemmatize(word, get_wordnet_pos(word))
                  for word in nltk.word_tokenize(review)
                  if word not in stop_words]
        review = ' '.join(review)
        corpus.append(review)
    # Vectorize each sentence and predict its sentiment label
    X = vectorizer.transform(corpus)
    predicted_sent = lr.predict(X)
    transform_sentiment = le.inverse_transform(predicted_sent)
    df_sent = pd.DataFrame()
    df_sent['text'] = sentences
    df_sent['sentiment'] = transform_sentiment
    df_sent['emoji_sentiment'] = df_sent['sentiment'].apply(emoji_pattern)
    agree = st.checkbox('View Sentiment for all Sentences')
    if agree:
        st.write(df_sent)
    # Show a GIF matching the majority sentiment
    overall = df_sent['sentiment'].value_counts().idxmax()
    if overall == 'positive':
        st.markdown("![POSITIVE](https://media3.giphy.com/media/v1.Y2lkPTc5MGI3NjExNWVlM2UzNjA2NmJjNmNmOTg2ZGE0ODNjZGZiMWQ2YTI0NzRhNTlhYyZjdD1n/xTiN0E03sgnvms9Uli/giphy.gif)")
    elif overall == 'negative':
        st.markdown("![NEGATIVE](https://media4.giphy.com/media/v1.Y2lkPTc5MGI3NjExZGExZjU2MzQ2ZmM0N2MwZmY0MmRmYjhkNmI5Zjg2NzI4MDgyNTg4YSZjdD1n/StAnQV9TUCuys/giphy.gif)")
    else:
        st.markdown("![NEUTRAL](https://media4.giphy.com/media/v1.Y2lkPTc5MGI3NjExN2RkODJhNWQ2NGQ0NzI0ZjQ1MTNjNGE3MjhjYmU5MzUzYzc1ODY3YyZjdD1n/PQMJiBHKkGgbQiORyx/giphy.gif)")
    return overall
if txt:
    st.write('Overall Sentiment:', run_sentiment_analysis(txt))
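# Assuming Streamlit is installed, the app can be launched with:
#   streamlit run Sentiment_Analyzer.py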