from team_comm_tools.utils.summarize_features import *
from team_comm_tools.utils.gini_coefficient import *
from team_comm_tools.utils.preprocess import *
+ from fuzzywuzzy import process
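+ # fuzzywuzzy provides fuzzy string matching; process.extract is used below to suggest close matches for column names not found in the data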
class ConversationLevelFeaturesCalculator:
    """
@@ -29,13 +30,13 @@ class ConversationLevelFeaturesCalculator:
    :type input_columns: list
    :param convo_aggregation: If true, will aggregate features at the conversational level
    :type convo_aggregation: bool
-    :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, std ...)
+    :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev ...)
    :type convo_methods: list
    :param convo_columns: Specifies which columns (at the chat level) users want aggregated
    :type convo_columns: list
    :param user_aggregation: If true, will aggregate features at the user level
    :type user_aggregation: bool
-    :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std ...) at the user level
+    :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev ...) at the user level
    :type user_methods: list
    :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level
    :type user_columns: list
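+
+    Example (an illustrative sketch only; chat_data and the other required constructor arguments are elided)::
+
+        calc = ConversationLevelFeaturesCalculator(
+            ...,  # chat_data and the other required arguments go here
+            convo_aggregation=True,
+            convo_methods=["mean", "stdev", "median"],
+            convo_columns=["num_words"],
+            user_aggregation=True,
+            user_methods=["mean"],
+            user_columns=["num_words"],
+        )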
@@ -78,27 +79,89 @@ def __init__(self, chat_data: pd.DataFrame,
        if 'conversation_num' not in self.input_columns:
            self.input_columns.append('conversation_num')

+        # check if user inputted convo_columns is None
        if convo_columns is None:
            self.columns_to_summarize = [column for column in self.chat_data.columns \
                                         if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
        else:
            if convo_aggregation == True and len(convo_columns) == 0:
-                warnings.warn(
+                print(
                    "Warning: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False."
                )
                self.convo_aggregation = False
            else:
                convo_columns_in_data = list(set(convo_columns).intersection(set(self.chat_data.columns)))

                if (len(convo_columns_in_data) != len(convo_columns)):
-                    warnings.warn(
+                    print(
                        "Warning: One or more requested conversation columns are not present in the data. Ignoring them."
                    )

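+                # fuzzy-match each requested column against the chat-level columns; exact matches pass, near matches get a "did you mean" suggestion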
+                for i in convo_columns:
+                    matches = process.extract(i, self.chat_data.columns, limit=3)
+                    best_match, similarity = matches[0]
+
+                    if similarity == 100:
+                        continue
+                    elif similarity >= 80:
+                        print("Did you mean", best_match, "instead of", i, "?")
+                    else:
+                        print(i, "not found in data and no close match.")
+
                self.columns_to_summarize = convo_columns_in_data
+
+        # check if user inputted user_columns is None
+        if user_columns is None:
+            self.user_columns = [column for column in self.chat_data.columns \
+                                 if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
+        else:
+            if user_aggregation == True and len(user_columns) == 0:
+                print("Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.")
+                self.user_aggregation = False
+            else:
+                user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns)))
+                if (len(user_columns_in_data) != len(user_columns)):
+                    print(
+                        "Warning: One or more requested user columns are not present in the data. Ignoring them."
+                    )
+
+
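+                # same fuzzy-match check for the requested user-level columns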
+                for i in user_columns:
+                    matches = process.extract(i, self.chat_data.columns, limit=3)
+                    best_match, similarity = matches[0]
+
+                    if similarity == 100:
+                        continue
+                    elif similarity >= 80:
+                        print("Did you mean", best_match, "instead of", i, "?")
+                    else:
+                        print(i, "not found in data and no close match.")
+
+                self.user_columns = user_columns_in_data
        self.summable_columns = ["num_words", "num_chars", "num_messages"]

+        # ensure all lowercase
+        self.convo_methods = [col.lower() for col in self.convo_methods]
+        self.user_methods = [col.lower() for col in self.user_methods]
+        self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize]
+        self.user_columns = [col.lower() for col in self.user_columns]
+
+        # replace interchangeable method names in convo_methods with their canonical forms
+        for i in range(len(self.convo_methods)):
+            if self.convo_methods[i] == "average":
+                self.convo_methods[i] = "mean"
+            elif self.convo_methods[i] == "maximum":
+                self.convo_methods[i] = "max"
+            elif self.convo_methods[i] == "minimum":
+                self.convo_methods[i] = "min"
+            elif self.convo_methods[i] == "standard deviation":
+                self.convo_methods[i] = "stdev"
+            elif self.convo_methods[i] == "sd":
+                self.convo_methods[i] = "stdev"
+
    def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame:
        """
        Main driver function for creating conversation-level features.
@@ -185,7 +248,7 @@ def get_conversation_level_aggregates(self) -> None:
                )

            # Standard Deviation of feature across the Conversation
-            if 'std' in self.convo_methods:
+            if 'stdev' in self.convo_methods:
                self.conv_data = pd.merge(
                    left=self.conv_data,
                    right=get_stdev(self.chat_data.copy(), column, 'stdev_' + column, self.conversation_id_col),
@@ -210,6 +273,15 @@ def get_conversation_level_aggregates(self) -> None:
                    on=[self.conversation_id_col],
                    how="inner"
                )
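+
+            # get_median is assumed to be provided by team_comm_tools.utils.summarize_features, alongside get_mean, get_stdev, get_min, and get_max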
+            # Median for the feature across the Conversation
+            if 'median' in self.convo_methods:
+                self.conv_data = pd.merge(
+                    left=self.conv_data,
+                    right=get_median(self.chat_data.copy(), column, 'median_' + column, self.conversation_id_col),
+                    on=[self.conversation_id_col],
+                    how="inner"
+                )


        # Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini.
        for column in self.summable_columns:
@@ -242,7 +314,7 @@ def get_user_level_aggregates(self) -> None:
        if self.convo_aggregation == True and self.user_aggregation == True:

-            # this may be right??
+            # aggregate features from the user level using the conversation-level methods
            if 'mean' in self.convo_methods:
                for user_column in self.user_columns:
                    for user_method in self.user_methods:
@@ -254,7 +326,7 @@ def get_user_level_aggregates(self) -> None:
                            how="inner"
                        )

-            if 'std' in self.convo_methods:
+            if 'stdev' in self.convo_methods:
                for user_column in self.user_columns:
                    for user_method in self.user_methods:
                        # Standard Deviation of User-Level Feature
@@ -286,95 +358,17 @@ def get_user_level_aggregates(self) -> None:
                            on=[self.conversation_id_col],
                            how="inner"
                        )
-
-
-            # Sum Columns were created using self.get_user_level_summed_features()
-            # for column in self.columns_to_summarize:
-            #     # change to self.user_columns
-            #     # should be summable_columns
-
-            #     # for method in self.user_methods:
-            #     #     self.conv_data = pd.merge(
-            #     #         left=self.conv_data,
-            #     #         right=get_average(self.user_data.copy(), method+"_"+column, 'average_user_' + method + "_" +column, self.conversation_id_col),
-            #     #         on=[self.conversation_id_col],
-            #     #         how="inner"
-            #     #     )
-
-            #     if 'mean' in self.convo_methods:
-            #         # Average/Mean of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'std' in self.convo_methods:
-            #         # Standard Deviation of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'min' in self.convo_methods:
-            #         # Minima of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'max' in self.convo_methods:
-            #         # Maxima of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            # Average Columns were created using self.get_user_level_mean_features()
-            for column in self.columns_to_summarize:
-
-                if 'mean' in self.convo_methods:
-                    # Average/Mean of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_mean(self.user_data.copy(), "mean_" + column, 'mean_user_avg_' + column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'std' in self.convo_methods:
-                    # Standard Deviation of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_stdev(self.user_data.copy(), "mean_" + column, 'stdev_user_avg_' + column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'min' in self.convo_methods:
-                    # Minima of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_min(self.user_data.copy(), "mean_" + column, 'min_user_avg_' + column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'max' in self.convo_methods:
-                    # Maxima of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_max(self.user_data.copy(), "mean_" + column, 'max_user_avg_' + column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
+
+            if 'median' in self.convo_methods:
+                for user_column in self.user_columns:
+                    for user_method in self.user_methods:
+                        # Median of User-Level Feature
+                        self.conv_data = pd.merge(
+                            left=self.conv_data,
+                            right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col),
+                            on=[self.conversation_id_col],
+                            how="inner"
+                        )

    def get_discursive_diversity_features(self) -> None: