
Commit 576a376

updates to package aggregation
1 parent d35aeb1 commit 576a376

File tree: 6 files changed, +327 -184 lines changed

examples/featurize.py
Lines changed: 32 additions & 14 deletions
@@ -18,6 +18,9 @@
 juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8')
 csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8')
 csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8')
+test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8')
+
+# C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv
 
 """
 TINY / TEST DATASETS -------------------------------
@@ -68,7 +71,7 @@
 # )
 # tiny_juries_feature_builder_custom.featurize(col="message")
 
-# # Tiny Juries with NO aggregations
+# Tiny Juries with NO aggregations
 # print("Tiny Juries with No Aggregation...")
 # tiny_juries_feature_builder_no_aggregation = FeatureBuilder(
 #     input_df = tiny_juries_df,
@@ -84,23 +87,38 @@
 # tiny_juries_feature_builder_no_aggregation.featurize(col="message")
 
 # Tiny Juries with custom Aggregations
-print("Tiny Juries with Custom Aggregation...")
-tiny_juries_feature_builder_custom_aggregation = FeatureBuilder(
-    input_df = tiny_juries_df,
+# print("Tiny Juries with Custom Aggregation...")
+# tiny_juries_feature_builder_custom_aggregation = FeatureBuilder(
+#     input_df = tiny_juries_df,
+#     grouping_keys = ["batch_num", "round_num"],
+#     vector_directory = "./vector_data/",
+#     output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv",
+#     output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv",
+#     output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
+#     convo_methods = ['max', 'median'], # This aggregates ONLY "positive_bert" at the conversation level, using max and median.
+#     convo_columns = ['positive_bert'],
+#     user_methods = ['max', 'mean', 'min', 'median'],
+#     user_columns = ['positive_bert', 'negative_bert'],
+#     turns = False,
+# )
+# tiny_juries_feature_builder_custom_aggregation.featurize(col="message")
+
+# Testing package aggregation
+print("Testing package aggregation...")
+testing_feature_builder_custom_aggregation = FeatureBuilder(
+    input_df = test_df,
     grouping_keys = ["batch_num", "round_num"],
     vector_directory = "./vector_data/",
-    output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv",
-    output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv",
-    output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
-    convo_methods = ['mean'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max.
-    convo_columns = ['positive_bert'],
-    user_methods = ['mean', 'max'],
-    user_columns = ['positive_bert', 'negative_bert'],
-    # user_methods = ['max'],
-    # user_columns = ['negative_bert'],
+    output_file_path_chat_level = "./test_package_TINY_chat_level_custom_agg.csv",
+    output_file_path_user_level = "./test_package_TINY_user_level_custom_agg.csv",
+    output_file_path_conv_level = "./test_package_TINY_conversation_level_custom_agg.csv",
+    convo_methods = ['max'], # This aggregates ONLY "positive_bert" and "negativity_bert" at the conversation level, using max; "negative_bert" is aggregated at the speaker/user level, using max.
+    convo_columns = ['positive_bert', 'negativity_bert'],
+    user_methods = ['MAX'],
+    user_columns = ['negative_bert'],
     turns = False,
 )
-tiny_juries_feature_builder_custom_aggregation.featurize(col="message")
+testing_feature_builder_custom_aggregation.featurize(col="message")
 
 
 # # Tiny multi-task
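For reference, a minimal pandas sketch (not part of the commit) of what the conversation-level 'max' aggregation requested above is expected to produce. The output column name max_positive_bert follows the method_column pattern used in the calculator below; the toy values are assumptions. Note that the 'negativity_bert' entry and the uppercase 'MAX' above appear to exercise the fuzzy column matching and the lowercase normalization introduced later in this commit.

# Rough sketch of conversation-level "max" aggregation (assumed toy data).
import pandas as pd

chat_df = pd.DataFrame({
    "batch_num": [0, 0, 0, 1],
    "round_num": [1, 1, 1, 1],
    "positive_bert": [0.10, 0.85, 0.40, 0.55],
})

# One row per conversation, with a max_<column> aggregate (name assumed to
# follow the method_column convention used elsewhere in the package).
conv_agg = (
    chat_df.groupby(["batch_num", "round_num"])["positive_bert"]
    .max()
    .reset_index()
    .rename(columns={"positive_bert": "max_positive_bert"})
)
print(conv_agg)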

src/team_comm_tools/feature_builder.py
Lines changed: 4 additions & 4 deletions
@@ -92,7 +92,7 @@ class FeatureBuilder:
     :param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True.
     :type convo_aggregation: bool, optional
 
-    :param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the conversational level. Defaults to ['mean', 'max', 'min', 'std'].
+    :param convo_methods: Specifies which functions you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev'].
     :type convo_methods: list, optional
 
     :param convo_columns: Specifies which columns (at the utterance/chat level) you want aggregated for the conversational level. Defaults to all numeric columns.
@@ -101,7 +101,7 @@
     :param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True.
     :type user_aggregation: bool, optional
 
-    :param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'std'].
+    :param user_methods: Specifies which functions you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev'].
     :type user_methods: list, optional
 
     :param user_columns: Specifies which columns (at the utterance/chat level) you want aggregated for the speaker/user level. Defaults to all numeric columns.
@@ -133,10 +133,10 @@ def __init__(
         regenerate_vectors: bool = False,
         custom_vect_path: str = None,
         convo_aggregation = True,
-        convo_methods: list = ['mean', 'max', 'min', 'std'],
+        convo_methods: list = ['mean', 'max', 'min', 'stdev'],
         convo_columns: list = None,
         user_aggregation = True,
-        user_methods: list = ['mean', 'max', 'min', 'std'],
+        user_methods: list = ['mean', 'max', 'min', 'stdev'],
         user_columns: list = None
     ) -> None:
 
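The defaults now read 'stdev' rather than 'std'. Because the conversation-level calculator (next file) lowercases the method list and then performs exact membership checks such as if 'stdev' in self.convo_methods, passing the old 'std' keyword would no longer trigger the standard-deviation aggregates; only 'standard deviation' and 'sd' are mapped onto 'stdev'. A quick illustration with hypothetical values, not part of the commit:

# Hypothetical illustration of why the 'std' -> 'stdev' rename matters downstream.
convo_methods = [m.lower() for m in ['Mean', 'STD']]
print('mean' in convo_methods)   # True  -> mean aggregates are computed
print('stdev' in convo_methods)  # False -> the standard-deviation block is skipped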
src/team_comm_tools/utils/calculate_conversation_level_features.py
Lines changed: 90 additions & 96 deletions
@@ -7,6 +7,7 @@
 from team_comm_tools.utils.summarize_features import *
 from team_comm_tools.utils.gini_coefficient import *
 from team_comm_tools.utils.preprocess import *
+from fuzzywuzzy import process
 
 class ConversationLevelFeaturesCalculator:
     """
@@ -29,13 +30,13 @@ class ConversationLevelFeaturesCalculator:
     :type input_columns: list
     :param convo_aggregation: If true, will aggregate features at the conversational level
     :type convo_aggregation: bool
-    :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, std...)
+    :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...)
     :type convo_methods: list
     :param convo_columns: Specifies which columns (at the chat level) users want aggregated
     :type convo_columns: list
     :param user_aggregation: If true, will aggregate features at the user level
     :type user_aggregation: bool
-    :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level
+    :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level
     :type user_methods: list
     :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level
     :type user_columns: list
@@ -78,27 +79,89 @@ def __init__(self, chat_data: pd.DataFrame,
         if 'conversation_num' not in self.input_columns:
             self.input_columns.append('conversation_num')
 
+        # check if user inputted convo_columns is None
         if convo_columns is None:
             self.columns_to_summarize = [column for column in self.chat_data.columns \
                                          if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
         else:
             if convo_aggregation == True and len(convo_columns) == 0:
-                warnings.warn(
+                print(
                     "Warning: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False."
                 )
                 self.convo_aggregation = False
             else:
                 convo_columns_in_data = list(set(convo_columns).intersection(set(self.chat_data.columns)))
 
                 if(len(convo_columns_in_data) != len(convo_columns)):
-                    warnings.warn(
+                    print(
                         "Warning: One or more requested conversation columns are not present in the data. Ignoring them."
                     )
 
+                for i in convo_columns:
+                    matches = process.extract(i, self.chat_data.columns, limit=3)
+                    best_match, similarity = matches[0]
+
+                    if similarity == 100:
+                        continue
+                    elif similarity >= 80:
+                        print("Did you mean", best_match, "instead of", i, "?")
+                    else:
+                        print(i, "not found in data and no close match.")
+
+
                 self.columns_to_summarize = convo_columns_in_data
+
+        # check if user inputted user_columns is None
+        if user_columns is None:
+            self.user_columns = [column for column in self.chat_data.columns \
+                                 if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
+        else:
+            if user_aggregation == True and len(user_columns) == 0:
+                print("Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.")
+                self.user_aggregation = False
+            else:
+                user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns)))
+                if(len(user_columns_in_data) != len(user_columns)):
+                    print(
+                        "Warning: One or more requested user columns are not present in the data. Ignoring them."
+                    )
+
+                print(user_columns_in_data, user_columns)
+
+                for i in user_columns:
+                    matches = process.extract(i, self.chat_data.columns, limit=3)
+                    best_match, similarity = matches[0]
+
+                    if similarity == 100:
+                        continue
+                    elif similarity >= 80:
+                        print("Did you mean", best_match, "instead of", i, "?")
+                    else:
+                        print(i, "not found in data and no close match.")
+
+                self.user_columns = user_columns_in_data
 
         self.summable_columns = ["num_words", "num_chars", "num_messages"]
 
+        # ensure all lowercase
+        self.convo_methods = [col.lower() for col in self.convo_methods]
+        self.user_methods = [col.lower() for col in self.user_methods]
+        self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize]
+        self.user_columns = [col.lower() for col in self.user_columns]
+
+        # normalize interchangeable method names in convo_methods
+        for i in range(len(self.convo_methods)):
+            if self.convo_methods[i] == "average":
+                self.convo_methods[i] = "mean"
+            elif self.convo_methods[i] == "maximum":
+                self.convo_methods[i] = "max"
+            elif self.convo_methods[i] == "minimum":
+                self.convo_methods[i] = "min"
+            elif self.convo_methods[i] == "standard deviation":
+                self.convo_methods[i] = "stdev"
+            elif self.convo_methods[i] == "sd":
+                self.convo_methods[i] = "stdev"
+
     def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame:
         """
         Main driver function for creating conversation-level features.
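The near-miss suggestions above rely on fuzzywuzzy's process.extract, which returns (candidate, score) pairs ranked by string similarity. A minimal standalone sketch of the same check, with hypothetical column names; the 80/100 thresholds mirror the code above:

# Standalone sketch of the fuzzy column-name suggestion logic used in __init__.
from fuzzywuzzy import process

available_columns = ["positive_bert", "negative_bert", "num_words"]

for requested in ["negativity_bert", "positive_bert", "sentiment"]:
    best_match, similarity = process.extract(requested, available_columns, limit=3)[0]
    if similarity == 100:
        continue  # exact match; nothing to report
    elif similarity >= 80:
        print(f"Did you mean {best_match} instead of {requested}?")
    else:
        print(f"{requested} not found in data and no close match.")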
@@ -185,7 +248,7 @@ def get_conversation_level_aggregates(self) -> None:
                 )
 
             # Standard Deviation of feature across the Conversation
-            if 'std' in self.convo_methods:
+            if 'stdev' in self.convo_methods:
                 self.conv_data = pd.merge(
                     left=self.conv_data,
                     right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col),
@@ -210,6 +273,15 @@
                     on=[self.conversation_id_col],
                     how="inner"
                 )
+
+            # Median for the feature across the Conversation
+            if 'median' in self.convo_methods:
+                self.conv_data = pd.merge(
+                    left=self.conv_data,
+                    right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col),
+                    on=[self.conversation_id_col],
+                    how="inner"
+                )
 
         # Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini.
         for column in self.summable_columns:
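get_median comes in through the wildcard import from team_comm_tools.utils.summarize_features, and its implementation is not shown in this diff. Assuming it mirrors the other summarize helpers (get_mean, get_stdev, get_max), a hypothetical sketch of what it presumably computes:

# Hypothetical sketch of a get_median helper, mirroring the pattern of the other
# summarize_features helpers used above (actual implementation not shown here).
import pandas as pd

def get_median(chat_data: pd.DataFrame, on_column: str, new_column: str, conversation_id_col: str) -> pd.DataFrame:
    # One row per conversation: the median of `on_column`, renamed to `new_column`
    # (e.g., 'median_positive_bert') so it can be merged onto conv_data.
    return (
        chat_data.groupby(conversation_id_col, as_index=False)[on_column]
        .median()
        .rename(columns={on_column: new_column})
    )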
@@ -242,7 +314,7 @@ def get_user_level_aggregates(self) -> None:
 
         if self.convo_aggregation == True and self.user_aggregation == True:
 
-            # this may be right??
+            # aggregates from the user level based on conversation methods
             if 'mean' in self.convo_methods:
                 for user_column in self.user_columns:
                     for user_method in self.user_methods:
@@ -254,7 +326,7 @@
                             how="inner"
                         )
 
-            if 'std' in self.convo_methods:
+            if 'stdev' in self.convo_methods:
                 for user_column in self.user_columns:
                     for user_method in self.user_methods:
                         # Standard Deviation of User-Level Feature
@@ -286,95 +358,17 @@
                             on=[self.conversation_id_col],
                             how="inner"
                         )
-
-
-            # Sum Columns were created using self.get_user_level_summed_features()
-            # for column in self.columns_to_summarize:
-            #     # change to self.user_columns
-            #     # should be summable_columns
-
-            #     # for method in self.user_methods:
-            #     #     self.conv_data = pd.merge(
-            #     #         left=self.conv_data,
-            #     #         right=get_average(self.user_data.copy(), method+"_"+column, 'average_user_' + method + "_" +column, self.conversation_id_col),
-            #     #         on=[self.conversation_id_col],
-            #     #         how="inner"
-            #     #     )
-
-            #     if 'mean' in self.convo_methods:
-            #         # Average/Mean of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'std' in self.convo_methods:
-            #         # Standard Deviation of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'min' in self.convo_methods:
-            #         # Minima of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            #     if 'max' in self.convo_methods:
-            #         # Maxima of User-Level Feature
-            #         self.conv_data = pd.merge(
-            #             left=self.conv_data,
-            #             right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col),
-            #             on=[self.conversation_id_col],
-            #             how="inner"
-            #         )
-
-            # Average Columns were created using self.get_user_level_mean_features()
-            for column in self.columns_to_summarize:
-
-                if 'mean' in self.convo_methods:
-                    # Average/Mean of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_mean(self.user_data.copy(), "mean_"+column, 'mean_user_avg_'+column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'std' in self.convo_methods:
-                    # Standard Deviation of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_stdev(self.user_data.copy(), "mean_"+column, 'stdev_user_avg_'+column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'min' in self.convo_methods:
-                    # Minima of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_min(self.user_data.copy(), "mean_"+column, 'min_user_avg_'+column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
-
-                if 'max' in self.convo_methods:
-                    # Maxima of User-Level Feature
-                    self.conv_data = pd.merge(
-                        left=self.conv_data,
-                        right=get_max(self.user_data.copy(), "mean_"+column, 'max_user_avg_'+column, self.conversation_id_col),
-                        on=[self.conversation_id_col],
-                        how="inner"
-                    )
+
+            if 'median' in self.convo_methods:
+                for user_column in self.user_columns:
+                    for user_method in self.user_methods:
+                        # Median of User-Level Feature
+                        self.conv_data = pd.merge(
+                            left=self.conv_data,
+                            right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col),
+                            on=[self.conversation_id_col],
+                            how="inner"
+                        )
 
 
     def get_discursive_diversity_features(self) -> None:
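Note the two-stage pattern in this hunk: chat-level features are first aggregated per speaker (e.g., mean_positive_bert), and those user-level values are then re-aggregated across speakers, yielding columns such as median_user_mean_positive_bert. A rough pandas sketch of the idea, with assumed identifier columns (speaker_nickname is an assumption) and toy values:

# Rough two-stage sketch (assumed data/column names) of user-level aggregation
# followed by conversation-level re-aggregation, mirroring the loops above.
import pandas as pd

chat_df = pd.DataFrame({
    "conversation_num": [0, 0, 0, 0],
    "speaker_nickname": ["a", "a", "b", "b"],
    "positive_bert": [0.2, 0.4, 0.9, 0.7],
})

# Stage 1 (user level): mean of positive_bert per speaker within each conversation.
user_df = (
    chat_df.groupby(["conversation_num", "speaker_nickname"], as_index=False)["positive_bert"]
    .mean()
    .rename(columns={"positive_bert": "mean_positive_bert"})
)

# Stage 2 (conversation level): median across speakers of the user-level means,
# analogous to the 'median_user_mean_positive_bert' column produced above.
conv_df = (
    user_df.groupby("conversation_num", as_index=False)["mean_positive_bert"]
    .median()
    .rename(columns={"mean_positive_bert": "median_user_mean_positive_bert"})
)
print(conv_df)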
