Skip to content

Commit d35aeb1

Browse files
committed
updated user aggregation methods (max)
1 parent b2ed12a commit d35aeb1

File tree

4 files changed

+108
-32
lines changed

4 files changed

+108
-32
lines changed

examples/featurize.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@
9494
output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
9595
convo_methods = ['mean'], # This will aggregate ONLY "positive_bert" at the conversation level, using mean; at the speaker/user level, it will aggregate "positive_bert" and "negative_bert", using mean and max.
9696
convo_columns = ['positive_bert'],
97-
# user_methods = ['mean', 'max'],
98-
# user_columns = ['positive_bert', 'negative_bert'],
99-
user_methods = ['max'],
100-
user_columns = ['negative_bert'],
97+
user_methods = ['mean', 'max'],
98+
user_columns = ['positive_bert', 'negative_bert'],
99+
# user_methods = ['max'],
100+
# user_columns = ['negative_bert'],
101101
turns = False,
102102
)
103103
tiny_juries_feature_builder_custom_aggregation.featurize(col="message")

src/team_comm_tools/utils/calculate_conversation_level_features.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ def __init__(self, chat_data: pd.DataFrame,
7171
self.convo_aggregation = convo_aggregation
7272
self.convo_methods = convo_methods
7373
self.user_aggregation = user_aggregation
74+
self.user_methods = user_methods
75+
self.user_columns = user_columns
7476
# Denotes the columns that can be summarized from the chat level, onto the conversation level.
7577
self.input_columns = list(input_columns)
7678
if 'conversation_num' not in self.input_columns:
@@ -177,7 +179,7 @@ def get_conversation_level_aggregates(self) -> None:
177179
if 'mean' in self.convo_methods:
178180
self.conv_data = pd.merge(
179181
left=self.conv_data,
180-
right=get_average(self.chat_data.copy(), column, 'average_'+column, self.conversation_id_col),
182+
right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col),
181183
on=[self.conversation_id_col],
182184
how="inner"
183185
)
@@ -247,7 +249,7 @@ def get_user_level_aggregates(self) -> None:
247249
# Average/Mean of User-Level Feature
248250
self.conv_data = pd.merge(
249251
left=self.conv_data,
250-
right=get_average(self.user_data.copy(), user_method + "_" +user_column, 'average_user_' + user_method + "_" +user_column, self.conversation_id_col),
252+
right=get_mean(self.user_data.copy(), user_method + "_" +user_column, "mean_user_" + user_method + "_" +user_column, self.conversation_id_col),
251253
on=[self.conversation_id_col],
252254
how="inner"
253255
)
@@ -269,7 +271,7 @@ def get_user_level_aggregates(self) -> None:
269271
# Minima of User-Level Feature
270272
self.conv_data = pd.merge(
271273
left=self.conv_data,
272-
right=get_min(self.user_data.copy(), user_method + "_" + user_column, 'min_user_sum_' + user_method + "_" + user_column, self.conversation_id_col),
274+
right=get_min(self.user_data.copy(), user_method + "_" + user_column, 'min_user_' + user_method + "_" + user_column, self.conversation_id_col),
273275
on=[self.conversation_id_col],
274276
how="inner"
275277
)
@@ -280,7 +282,7 @@ def get_user_level_aggregates(self) -> None:
280282
# Maxima of User-Level Feature
281283
self.conv_data = pd.merge(
282284
left=self.conv_data,
283-
right=get_max(self.user_data.copy(), user_method + "_" + user_column, 'max_user_sum_' + user_method + "_" + user_column, self.conversation_id_col),
285+
right=get_max(self.user_data.copy(), user_method + "_" + user_column, 'max_user_' + user_method + "_" + user_column, self.conversation_id_col),
284286
on=[self.conversation_id_col],
285287
how="inner"
286288
)
@@ -335,14 +337,14 @@ def get_user_level_aggregates(self) -> None:
335337
# how="inner"
336338
# )
337339

338-
# Average Columns were created using self.get_user_level_averaged_features()
340+
# Mean columns (prefixed "mean_") were created using self.get_user_level_mean_features()
339341
for column in self.columns_to_summarize:
340342

341343
if 'mean' in self.convo_methods:
342344
# Average/Mean of User-Level Feature
343345
self.conv_data = pd.merge(
344346
left=self.conv_data,
345-
right=get_average(self.user_data.copy(), "average_"+column, 'average_user_avg_'+column, self.conversation_id_col),
347+
right=get_mean(self.user_data.copy(), "mean_"+column, 'mean_user_avg_'+column, self.conversation_id_col),
346348
on=[self.conversation_id_col],
347349
how="inner"
348350
)
@@ -351,7 +353,7 @@ def get_user_level_aggregates(self) -> None:
351353
# Standard Deviation of User-Level Feature
352354
self.conv_data = pd.merge(
353355
left=self.conv_data,
354-
right=get_stdev(self.user_data.copy(), "average_"+column, 'stdev_user_avg_'+column, self.conversation_id_col),
356+
right=get_stdev(self.user_data.copy(), "mean_"+column, 'stdev_user_avg_'+column, self.conversation_id_col),
355357
on=[self.conversation_id_col],
356358
how="inner"
357359
)
@@ -360,7 +362,7 @@ def get_user_level_aggregates(self) -> None:
360362
# Minima of User-Level Feature
361363
self.conv_data = pd.merge(
362364
left=self.conv_data,
363-
right=get_min(self.user_data.copy(), "average_"+column, 'min_user_avg_'+column, self.conversation_id_col),
365+
right=get_min(self.user_data.copy(), "mean_"+column, 'min_user_avg_'+column, self.conversation_id_col),
364366
on=[self.conversation_id_col],
365367
how="inner"
366368
)
@@ -369,7 +371,7 @@ def get_user_level_aggregates(self) -> None:
369371
# Maxima of User-Level Feature
370372
self.conv_data = pd.merge(
371373
left=self.conv_data,
372-
right=get_max(self.user_data.copy(), "average_"+column, 'max_user_avg_'+column, self.conversation_id_col),
374+
right=get_max(self.user_data.copy(), "mean_"+column, 'max_user_avg_'+column, self.conversation_id_col),
373375
on=[self.conversation_id_col],
374376
how="inner"
375377
)

src/team_comm_tools/utils/calculate_user_level_features.py

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Importing modules from features
2-
from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_average_dataframe
2+
from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe
33
from team_comm_tools.features.get_user_network import *
44
from team_comm_tools.features.user_centroids import *
55
import warnings
@@ -84,12 +84,15 @@ def calculate_user_level_features(self) -> pd.DataFrame:
8484
:rtype: pd.DataFrame
8585
"""
8686

87-
# Get average features for all features
88-
self.get_user_level_averaged_features()
87+
# Get mean features for all features
88+
self.get_user_level_mean_features()
8989

9090
# Get total counts for all features
9191
self.get_user_level_summed_features()
9292

93+
# Get user summary statistics for all features
94+
self.get_user_level_summary_statistics_features()
95+
9396
# Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range)
9497
# self.get_centroids()
9598

@@ -111,7 +114,47 @@ def get_user_level_summary_statistics_features(self) -> None:
111114
112115
This is an open question, so we are putting a TODO here.
113116
"""
114-
pass
117+
118+
if self.user_aggregation == True:
119+
# For each summarizable feature
120+
for column in self.columns_to_summarize:
121+
122+
# # Average/Mean of feature across the Conversation
123+
# if 'mean' in self.user_methods:
124+
# self.conv_data = pd.merge(
125+
# left=self.conv_data,
126+
# right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col),
127+
# on=[self.conversation_id_col],
128+
# how="inner"
129+
# )
130+
131+
# # Standard Deviation of feature across the Conversation
132+
# if 'std' in self.convo_methods:
133+
# self.conv_data = pd.merge(
134+
# left=self.conv_data,
135+
# right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col),
136+
# on=[self.conversation_id_col],
137+
# how="inner"
138+
# )
139+
140+
# # Minima for the feature across the Conversation
141+
# if 'min' in self.convo_methods:
142+
# self.conv_data = pd.merge(
143+
# left=self.conv_data,
144+
# right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col),
145+
# on=[self.conversation_id_col],
146+
# how="inner"
147+
# )
148+
149+
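# Maximum of the feature for each user. NOTE(review): the merge below joins only on the conversation ID, while get_user_max_dataframe groups by conversation AND speaker -- confirm whether self.speaker_id_col should also be a merge key (as in the mean merge).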
150+
if 'max' in self.user_methods:
151+
# print('HELLO')
152+
self.user_data = pd.merge(
153+
left=self.user_data,
154+
right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
155+
on=[self.conversation_id_col],
156+
how="inner"
157+
)
115158

116159
def get_user_level_summed_features(self) -> None:
117160
"""
@@ -157,11 +200,11 @@ def get_user_level_summed_features(self) -> None:
157200
how="inner"
158201
)
159202

160-
def get_user_level_averaged_features(self) -> None:
203+
def get_user_level_mean_features(self) -> None:
161204
"""
162-
Aggregate summary statistics by calculating average user-level features from chat-level features.
205+
Aggregate summary statistics by calculating mean user-level features from chat-level features.
163206
164-
This function calculates and merges the average features into the user-level data.
207+
This function calculates and merges the mean features into the user-level data.
165208
166209
:return: None
167210
:rtype: None
@@ -175,10 +218,11 @@ def get_user_level_averaged_features(self) -> None:
175218
# Average/Mean of feature across the Conversation
176219
self.user_data = pd.merge(
177220
left=self.user_data,
178-
right=get_user_average_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
221+
right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
179222
on=[self.conversation_id_col, self.speaker_id_col],
180223
how="inner"
181224
)
225+
182226

183227
def get_centroids(self) -> None:
184228
"""

src/team_comm_tools/utils/summarize_features.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,35 +31,65 @@ def get_user_sum_dataframe(chat_level_data, on_column, conversation_id_col, spea
3131
return(grouped_conversation_data)
3232

3333

34-
def get_user_average_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
34+
def get_user_mean_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
3535
"""Generate a user-level summary DataFrame by averaging a specified column per individual.
3636
37-
This function groups chat-level data by user and conversation, calculates the average values
37+
This function groups chat-level data by user and conversation, calculates the mean values
3838
of a specified numeric column for each user, and returns the resulting DataFrame.
3939
4040
:param chat_level_data: The DataFrame in which each row represents a single chat.
4141
:type chat_level_data: pandas.DataFrame
42-
:param on_column: The name of the numeric column to average for each user.
42+
:param on_column: The name of the numeric column to average for each user.
4343
:type on_column: str
4444
:param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
4545
:type conversation_id_col: str
4646
:param speaker_id_col: The column name representing the user identifier.
4747
:type speaker_id_col: str
48-
:return: A grouped DataFrame with the average of the specified column per individual.
48+
:return: A grouped DataFrame with the mean of the specified column per individual.
4949
:rtype: pandas.DataFrame
5050
"""
5151
grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).mean().reset_index()
52-
grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "average_"+on_column}) # gets this dataframe:
53-
# Batch# Round# Speaker Average Number of Words
52+
grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "mean_"+on_column}) # gets this dataframe:
53+
# Batch# Round# Speaker Mean Number of Words
5454
# 0 1 Priya 100
5555
# 0 1 Yuluan 90
5656
return(grouped_conversation_data)
5757

58-
def get_average(input_data, column_to_summarize, new_column_name, conversation_id_col):
59-
"""Generate a summary DataFrame with the average of a specified column per conversation.
58+
def get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col):
    """Generate a user-level summary DataFrame with the maximum of a specified column per individual.

    This function groups chat-level data by conversation and speaker, takes the maximum value
    of the specified column within each group, and returns the resulting DataFrame with that
    column renamed to ``"max_" + on_column``.

    :param chat_level_data: The DataFrame in which each row represents a single chat.
    :type chat_level_data: pandas.DataFrame
    :param on_column: The name of the column whose maximum is taken for each user.
    :type on_column: str
    :param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
    :type conversation_id_col: str
    :param speaker_id_col: The column name representing the user identifier.
    :type speaker_id_col: str
    :return: A grouped DataFrame with the max of the specified column per individual.
    :rtype: pandas.DataFrame
    """
    group_keys = [conversation_id_col, speaker_id_col]

    # Select only the grouping keys and the target column so that no other
    # chat-level columns are carried into the user-level result.
    grouped_conversation_data = (
        chat_level_data[group_keys + [on_column]]
        .groupby(group_keys)
        .max()
        .reset_index()
    )

    # The result has one row per (conversation, speaker) group, e.g.:
    #   conversation  speaker  max_<on_column>
    #   1             Priya    100
    #   1             Yuluan   90
    return grouped_conversation_data.rename(columns={on_column: "max_" + on_column})
81+
82+
def get_user_min_dataframe():
    """Placeholder for a user-level minimum summary; not implemented yet.

    Takes no arguments and does no work.
    NOTE(review): presumably intended to mirror ``get_user_max_dataframe`` -- confirm.

    :return: None
    :rtype: None
    """
    return None
84+
85+
def get_user_stdev_dataframe():
    """Placeholder for a user-level standard-deviation summary; not implemented yet.

    Takes no arguments and does no work.
    NOTE(review): presumably intended to mirror ``get_user_max_dataframe`` -- confirm.

    :return: None
    :rtype: None
    """
    return None
87+
88+
def get_mean(input_data, column_to_summarize, new_column_name, conversation_id_col):
89+
"""Generate a summary DataFrame with the mean of a specified column per conversation.
90+
91+
This function calculates the mean of a specified column for each conversation in the input data,
92+
and returns a DataFrame containing the conversation number and the calculated mean.
6393
6494
:param input_data: The DataFrame containing data at the chat or user level.
6595
:type input_data: pandas.DataFrame
@@ -69,7 +99,7 @@ def get_average(input_data, column_to_summarize, new_column_name, conversation_i
6999
:type new_column_name: str
70100
:param conversation_id_col: A string representing the column name that should be selected as the conversation ID.
71101
:type conversation_id_col: str
72-
:return: A DataFrame with the conversation number and the average of the specified column.
102+
:return: A DataFrame with the conversation number and the mean of the specified column.
73103
:rtype: pandas.DataFrame
74104
"""
75105
input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.mean(x))

0 commit comments

Comments
 (0)