# Trends MarketPlace
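# HR attrition pipeline:
#   1. survey_analysis()  - models attrition from the HR survey CSV (logistic regression, random forest, SVM)
#   2. audio_txt()        - transcribes exit-interview audio files to text
#   3. read_file()        - merges the individual transcripts into Output.txt
#   4. text_analytics()   - term-frequency analysis and word cloud on the merged text
# Note: several bare expressions below (e.g. data.head(), final_df) are notebook-style and print
# nothing when the file is run as a plain script.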
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import speech_recognition as sr
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import warnings
import timeit
def survey_analysis(survey_file):
warnings.filterwarnings('ignore')
data = pd.read_csv(survey_file)
data.describe()
data.head()
#Missing values check
data.isnull().sum()
# View the distribution of data
num_bins = 10
data.hist(bins=num_bins, figsize=(20,15))
plt.savefig("Histogram plots")
plt.show()
data['Attrition'].value_counts()
data.groupby('Attrition').mean(numeric_only=True)
# Plot the distribution of churn employee
num_bins = 10
data_churn = data[data['Attrition'] == 'Yes'].hist(bins=num_bins, figsize=(20,15))
plt.savefig("Histogram plots_Churn")
plt.show()
# Transfer attrition: Yes = 1, No = 0
attrition_val = {'Yes': 1, 'No': 0}
data['Attrition'] = data['Attrition'].apply(attrition_val.get)
# Select categorical columns and turn them into dummy variables; "Over18" is dropped since it has the same value in every row
data_cat = data[['Attrition', 'BusinessTravel','Department', 'EducationField',
'Gender', 'JobRole', 'MaritalStatus', 'OverTime']].copy()
data_cat = pd.get_dummies(data_cat)
data_cat.shape
# Pick numerical variables
data_num = data[['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeNumber', 'EnvironmentSatisfaction',
'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate',
'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']]
# Check correlation of numerical variables
sns.set(style="white")
# Compute the correlation matrix
corr = data_num.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
# Remove columns with the same value in all rows (EmployeeCount, StandardHours) and the unrelated EmployeeNumber
# Remove highly correlated variables: MonthlyIncome, TotalWorkingYears, PercentSalaryHike, YearsWithCurrManager
data_num = data[["Age", "DailyRate", "DistanceFromHome", "Education", "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement",
"JobLevel", "JobSatisfaction", "MonthlyRate", "NumCompaniesWorked", "PerformanceRating",
"RelationshipSatisfaction", "StockOptionLevel", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
"YearsSinceLastPromotion"]].copy()
data_num.shape
# Merge final data
final_data = pd.concat([data_num, data_cat], axis=1)
final_data.shape
# Select independent variables and dependent variable
data_vars=final_data.columns.values.tolist()
y_var=['Attrition']
x_var=[var for var in data_vars if var not in y_var]
x=final_data[x_var]
y=final_data['Attrition']
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
train, test, target_train, target_test = train_test_split(x, y, test_size=0.3, random_state=42)
# Resolve imbalanced data issue
from imblearn.over_sampling import SMOTE
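# SMOTE oversamples the minority class (Attrition = 1) by synthesizing new examples,
# so both classes are balanced in the training set before the models are fit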
oversampler=SMOTE(random_state=42)
smote_train, smote_target = oversampler.fit_resample(train, target_train)
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
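# Note: standardized copies of the features are prepared below, but the models in this script
# are fit on the unscaled SMOTE output and evaluated on the unscaled test set, so
# train_std/test_std are effectively unused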
sc = StandardScaler()
sc.fit(smote_train)
train_std = sc.transform(smote_train)
test_std = sc.transform(test)
logreg = LogisticRegression(max_iter=1000)  # higher max_iter helps convergence on unscaled features
logreg.fit(smote_train, smote_target)
from sklearn.metrics import accuracy_score
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(target_test, logreg.predict(test))))
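# Rank the predictors by the absolute size of their logistic-regression coefficients
# to see which variables weigh most heavily in the attrition prediction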
coefs = logreg.coef_.transpose()
df = pd.DataFrame(coefs, columns=['Coef'])
df['Variables'] = x_var
df['abs_Coef'] = df['Coef'].abs()
final_df = df.sort_values(by=['abs_Coef'], ascending=False)
final_df
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(smote_train, smote_target)
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(target_test, rf.predict(test))))
feat_importances = pd.DataFrame(rf.feature_importances_, index=x_var, columns=['Importance']).sort_values(by=['Importance'], ascending=False)
top_10_factors = feat_importances[0:10]
top_10_factors.plot.barh().invert_yaxis()
plt.title('Random Forest Top 10 Important Factors')
plt.savefig('Top 10 Important Factors')
plt.show()
# SVM
from sklearn.svm import SVC
svc = SVC(random_state=42)
svc.fit(train,target_train)
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(target_test, svc.predict(test))))
# Cross Validation
# Cross Validation for Logistic Regression
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
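# 10-fold CV: the balanced (SMOTE) training data is split into 10 folds; each fold is held out
# once for validation and the mean accuracy across folds is reported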
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, smote_train, smote_target, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy for logistic regression: %.3f" % (results.mean()))
# Cross Validation for Random Forest
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
model = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, smote_train, smote_target, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy for Random Forest: %.3f" % (results.mean()))
# Classification report for logistic regression
from sklearn.metrics import classification_report
print(classification_report(target_test, logreg.predict(test)))
# Classification report for Random Forest
from sklearn.metrics import classification_report
print(classification_report(target_test, rf.predict(test)))
# Confusion matrix for logistic regression
logreg_y_pred = logreg.predict(test)
logreg_cm = metrics.confusion_matrix(target_test, logreg_y_pred, labels=[1, 0])
sns.heatmap(logreg_cm, cmap="YlGnBu", annot=True, fmt='.2f',xticklabels = ["Left", "Stayed"] , yticklabels = ["Left", "Stayed"] )
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Logistic Regression')
plt.savefig('logistic_regression')
plt.show()
# Confusion matrix for random forest
rf_y_pred = rf.predict(test)
rf_cm = metrics.confusion_matrix(target_test, rf_y_pred, labels=[1, 0])
sns.heatmap(rf_cm, cmap="YlGnBu", annot=True, fmt='.2f',xticklabels = ["Left", "Stayed"] , yticklabels = ["Left", "Stayed"] )
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest')
plt.savefig('random_forest')
plt.show()
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
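# Compare the two classifiers with ROC curves: each curve traces true-positive rate vs.
# false-positive rate across decision thresholds, and AUC summarizes the curve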
logit_roc_auc = roc_auc_score(target_test, logreg.predict_proba(test)[:,1])
fpr, tpr, thresholds = roc_curve(target_test, logreg.predict_proba(test)[:,1])
rf_roc_auc = roc_auc_score(target_test, rf.predict_proba(test)[:,1])
rf_fpr, rf_tpr, rf_thresholds = roc_curve(target_test, rf.predict_proba(test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC')
plt.show()
# Results of different models
print('\n')
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(target_test, logreg.predict(test))))
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(target_test, rf.predict(test))))
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(target_test, svc.predict(test))))
feat_importances = pd.DataFrame(rf.feature_importances_, index=x_var, columns=['Importance']).sort_values(by=['Importance'], ascending=False)
top_10_factors = feat_importances[0:10]
top_10_factors.plot.barh().invert_yaxis()
plt.title('Random Forest Top 10 Important Factors')
plt.savefig('Top 10 Important Factors')
plt.show()
# Audio to text
def audio_txt(audio_file):
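    # audio_file is a text file listing one exit-interview audio path per line; each recording is
    # transcribed with the Google Web Speech API (recognize_google) and the transcript is written
    # to exit_interview<N>.txt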
r = sr.Recognizer()
counter = 0
with open(audio_file, 'r') as audio_name:
for line3 in audio_name:
data2 = line3.strip()
counter +=1
data3 = "exit_interview" + str(counter) + ".txt"
with sr.AudioFile(data2) as source:
audio = r.record(source)
try:
s = r.recognize_google(audio)
#print("Text: "+s)
with open(data3, 'w' ) as myfile:
myfile.writelines(s)
except Exception as e:
print("Exception: "+str(e))
# combine all the text files
def read_file(filename):
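    # filename lists the individual transcript files, one per line; their contents are appended
    # into a single Output.txt, separated by blank lines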
with open(filename, 'r') as fin1:
with open("Output.txt", 'a' ) as f_out:
for line in fin1:
data = line.rstrip('\n')
with open(data, 'r') as fin2:
for line2 in fin2:
f_out.writelines(line2)
f_out.writelines('\n \n')
# text analytics
def text_analytics(filename1):
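    # Tokenize the combined transcripts, lowercase, drop stopwords (plus some custom ones),
    # lemmatize, count term frequencies, and draw a word cloud of the top 30 terms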
with open(filename1, 'r' ) as fin:
sentence = ""
for line in fin:
sentence += line.strip()
# token
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
# lower capitalization
tokens = [word.lower() for word in tokens]
# Adding stopwords
my_stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['could','would', 'company','change','always','really','great', 'like','new']
my_stopwords.extend(newStopWords)
# Removing stopwords
filtered_words = [word for word in tokens if word not in my_stopwords]
#Lemmatization
lmtzr = WordNetLemmatizer()
lem_list = [lmtzr.lemmatize(word) for word in filtered_words]
# # stem
# snowball_stemmer = SnowballStemmer('english')
# stemmed_list = [snowball_stemmer.stem(word) for word in filtered_words]
# frequency of each word
fdist1 = FreqDist(lem_list)
#fdist1.plot(15,cumulative=False)
#plt.show()
# fdist2 = FreqDist(stemmed_list)
# fdist2.plot(15,cumulative=False)
# plt.show()
# frequency of each word
fdist1 = FreqDist(lem_list)
df_fdist1 = pd.DataFrame.from_dict(fdist1, orient='index')
df_fdist1.columns = ['Frequency']
df_fdist1.index.name = 'Term'
df_fdist1['word'] = df_fdist1.index
df_fdist1.sort_values(by = ['Frequency'], axis = 0 , ascending = False , inplace = True)
# fdist2 = FreqDist(stemmed_list)
# df_fdist2 = pd.DataFrame.from_dict(fdist2, orient='index')
# df_fdist2.columns = ['Frequency']
# df_fdist2.index.name = 'Term'
# df_fdist2['word'] = df_fdist2.index
# df_fdist2.sort_values(by = ['Frequency'], axis = 0 , ascending = False , inplace = True)
# Word Cloud
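# Repeat each of the 30 most frequent terms in proportion to its count so the word cloud
# sizes terms by frequency (collocations=False keeps them as single words)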
word_cld = df_fdist1.head(30)
new_t = (word_cld['word'] + ' ') * word_cld['Frequency']
my_string = ''
for word in new_t:
my_string += word
# Create and generate a word cloud image:
import wordcloud as wc
wordcloud = wc.WordCloud(background_color='black', max_words=300, collocations=False).generate(my_string)
plt.figure(figsize=(15,10))
plt.clf()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
def main():
survey_analysis('HR data.csv')
audio_txt("audio_files.txt")
#print("Audio to text completed")
read_file("exit_interview_list.txt")
#print("All exit interviews clubbed together")
#print("Results")
text_analytics("Output.txt")
if __name__ == '__main__':
    main()
# run file
# Exit interview audio file
# Demo
# Presentation