From 576a376899db997b6e2a5871ccba77b338b6bbcc Mon Sep 17 00:00:00 2001 From: amytangzheng Date: Wed, 16 Oct 2024 14:38:51 -0400 Subject: [PATCH] updates to package aggregation --- examples/featurize.py | 46 +++-- src/team_comm_tools/feature_builder.py | 8 +- .../calculate_conversation_level_features.py | 186 +++++++++--------- .../utils/calculate_user_level_features.py | 171 ++++++++++------ .../utils/summarize_features.py | 96 ++++++++- .../cleaned_data/test_package_aggregation.csv | 4 + 6 files changed, 327 insertions(+), 184 deletions(-) create mode 100644 tests/data/cleaned_data/test_package_aggregation.csv diff --git a/examples/featurize.py b/examples/featurize.py index 70f184f9..05f2b958 100644 --- a/examples/featurize.py +++ b/examples/featurize.py @@ -18,6 +18,9 @@ juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8') csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8') csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8') + test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8') + + # C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv """ TINY / TEST DATASETS ------------------------------- @@ -68,7 +71,7 @@ # ) # tiny_juries_feature_builder_custom.featurize(col="message") - # # Tiny Juries with NO aggregations + # Tiny Juries with NO aggregations # print("Tiny Juries with No Aggregation...") # tiny_juries_feature_builder_no_aggregation = FeatureBuilder( # input_df = tiny_juries_df, @@ -84,23 +87,38 @@ # tiny_juries_feature_builder_no_aggregation.featurize(col="message") # Tiny Juries with custom Aggregations - print("Tiny Juries with Custom Aggregation...") - tiny_juries_feature_builder_custom_aggregation = FeatureBuilder( - 
input_df = tiny_juries_df, + # print("Tiny Juries with Custom Aggregation...") + # tiny_juries_feature_builder_custom_aggregation = FeatureBuilder( + # input_df = tiny_juries_df, + # grouping_keys = ["batch_num", "round_num"], + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv", + # output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv", + # output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv", + # convo_methods = ['max', 'median'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. + # convo_columns = ['positive_bert'], + # user_methods = ['max', 'mean', 'min', 'median'], + # user_columns = ['positive_bert', 'negative_bert'], + # turns = False, + # ) + # tiny_juries_feature_builder_custom_aggregation.featurize(col="message") + + # Testing package aggregation + print("Testing package aggregation...") + testing_feature_builder_custom_aggregation = FeatureBuilder( + input_df = test_df, grouping_keys = ["batch_num", "round_num"], vector_directory = "./vector_data/", - output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv", - output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv", - output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv", - convo_methods = ['mean'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. 
- convo_columns = ['positive_bert'], - user_methods = ['mean', 'max'], - user_columns = ['positive_bert', 'negative_bert'], - # user_methods = ['max'], - # user_columns = ['negative_bert'], + output_file_path_chat_level = "./test_package_TINY_chat_level_custom_agg.csv", + output_file_path_user_level = "./test_package_TINY_user_level_custom_agg.csv", + output_file_path_conv_level = "./test_package_TINY_conversation_level_custom_agg.csv", + convo_methods = ['max'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. + convo_columns = ['positive_bert', 'negativity_bert'], + user_methods = ['MAX'], + user_columns = ['negative_bert'], turns = False, ) - tiny_juries_feature_builder_custom_aggregation.featurize(col="message") + testing_feature_builder_custom_aggregation.featurize(col="message") # # Tiny multi-task diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py index 8b28b0dc..8d523716 100644 --- a/src/team_comm_tools/feature_builder.py +++ b/src/team_comm_tools/feature_builder.py @@ -92,7 +92,7 @@ class FeatureBuilder: :param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True. :type convo_aggregation: bool, optional - :param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the conversational level. Defaults to ['mean', 'max', 'min', 'std']. + :param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev']. :type convo_methods: list, optional :param convo_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the conversational level. Defauts to all all numeric columns. 
@@ -101,7 +101,7 @@ class FeatureBuilder: :param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True. :type convo_aggregation: bool, optional - :param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'std']. + :param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev']. :type convo_methods: list, optional :param user_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the speaker/user level. Defauts to all all numeric columns. @@ -133,10 +133,10 @@ def __init__( regenerate_vectors: bool = False, custom_vect_path: str = None, convo_aggregation = True, - convo_methods: list = ['mean', 'max', 'min', 'std'], + convo_methods: list = ['mean', 'max', 'min', 'stdev'], convo_columns: list = None, user_aggregation = True, - user_methods: list = ['mean', 'max', 'min', 'std'], + user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None ) -> None: diff --git a/src/team_comm_tools/utils/calculate_conversation_level_features.py b/src/team_comm_tools/utils/calculate_conversation_level_features.py index 7a4c4032..aa148fbb 100644 --- a/src/team_comm_tools/utils/calculate_conversation_level_features.py +++ b/src/team_comm_tools/utils/calculate_conversation_level_features.py @@ -7,6 +7,7 @@ from team_comm_tools.utils.summarize_features import * from team_comm_tools.utils.gini_coefficient import * from team_comm_tools.utils.preprocess import * +from fuzzywuzzy import process class ConversationLevelFeaturesCalculator: """ @@ -29,13 +30,13 @@ class ConversationLevelFeaturesCalculator: :type input_columns: list :param convo_aggregation: If true, will aggregate features at the conversational level :type convo_aggregation: bool - :param convo_methods: Specifies which 
functions users want to aggregate with (e.g., mean, std...) + :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) :type convo_methods: list :param convo_columns: Specifies which columns (at the chat level) users want aggregated :type convo_columns: list :param user_aggregation: If true, will aggregate features at the user level :type convo_aggregation: bool - :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level + :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level :type user_methods: list :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level :type user_columns: list @@ -78,12 +79,13 @@ def __init__(self, chat_data: pd.DataFrame, if 'conversation_num' not in self.input_columns: self.input_columns.append('conversation_num') + # check if user inputted convo_columns is None if convo_columns is None: self.columns_to_summarize = [column for column in self.chat_data.columns \ if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] else: if convo_aggregation == True and len(convo_columns) == 0: - warnings.warn( + print( "Warning: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False." ) self.convo_aggregation = False @@ -91,14 +93,75 @@ def __init__(self, chat_data: pd.DataFrame, convo_columns_in_data = list(set(convo_columns).intersection(set(self.chat_data.columns))) if(len(convo_columns_in_data) != len(convo_columns)): - warnings.warn( + print( "Warning: One or more requested user columns are not present in the data. Ignoring them." 
) + for i in convo_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") + + self.columns_to_summarize = convo_columns_in_data + + # check if user inputted user_columns is None + if user_columns is None: + self.user_columns = [column for column in self.chat_data.columns \ + if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] + else: + if user_aggregation == True and len(user_columns) == 0: + print("Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.") + self.user_aggregation = False + else: + user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns))) + if(len(user_columns_in_data) != len(user_columns)): + print( + "Warning: One or more requested user columns are not present in the data. Ignoring them." 
+ ) + + print(user_columns_in_data, user_columns) + + for i in user_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") + + self.user_columns = user_columns_in_data self.summable_columns = ["num_words", "num_chars", "num_messages"] + # ensure all lowercase + self.convo_methods = [col.lower() for col in self.convo_methods] + self.user_methods = [col.lower() for col in self.user_methods] + self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize] + self.user_columns = [col.lower() for col in self.user_columns] + + # replace interchangable words in columns_to_summarize + for i in range(len(self.convo_methods)): + if self.convo_methods[i] == "average": + self.convo_methods[i] = "mean" + elif self.convo_methods[i] == "maximum": + self.convo_methods[i] = "max" + elif self.convo_methods[i] == "minimum": + self.convo_methods[i] = "min" + elif self.convo_methods[i] == "standard deviation": + self.convo_methods[i] = "stdev" + elif self.convo_methods[i] == "sd": + self.convo_methods[i] = "stdev" + def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame: """ Main driver function for creating conversation-level features.
@@ -185,7 +248,7 @@ def get_conversation_level_aggregates(self) -> None: ) # Standard Deviation of feature across the Conversation - if 'std' in self.convo_methods: + if 'stdev' in self.convo_methods: self.conv_data = pd.merge( left=self.conv_data, right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), @@ -210,6 +273,15 @@ def get_conversation_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) + + # Median for the feature across the Conversation + if 'median' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) # Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini. for column in self.summable_columns: @@ -242,7 +314,7 @@ def get_user_level_aggregates(self) -> None: if self.convo_aggregation == True and self.user_aggregation == True: - # this may be right?? 
+ # aggregates from the user level based on conversation methods if 'mean' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: @@ -254,7 +326,7 @@ def get_user_level_aggregates(self) -> None: how="inner" ) - if 'std' in self.convo_methods: + if 'stdev' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: # Standard Deviation of User-Level Feature @@ -286,95 +358,17 @@ def get_user_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) - - - # Sum Columns were created using self.get_user_level_summed_features() - # for column in self.columns_to_summarize: - # # change to self.user_columns - # # should be summable_columns - - # # for method in self.user_methods: - # # self.conv_data = pd.merge( - # # left=self.conv_data, - # # right=get_average(self.user_data.copy(), method+"_"+column, 'average_user_' + method + "_" +column, self.conversation_id_col), - # # on=[self.conversation_id_col], - # # how="inner" - # # ) - - # if 'mean' in self.convo_methods: - # # Average/Mean of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'std' in self.convo_methods: - # # Standard Deviation of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'min' in self.convo_methods: - # # Minima of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'max' in self.convo_methods: - # # 
Maxima of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # Average Columns were created using self.get_user_level_mean_features() - for column in self.columns_to_summarize: - - if 'mean' in self.convo_methods: - # Average/Mean of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_mean(self.user_data.copy(), "mean_"+column, 'mean_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'std' in self.convo_methods: - # Standard Deviation of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_stdev(self.user_data.copy(), "mean_"+column, 'stdev_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'min' in self.convo_methods: - # Minima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_min(self.user_data.copy(), "mean_"+column, 'min_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'max' in self.convo_methods: - # Maxima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_max(self.user_data.copy(), "mean_"+column, 'max_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) + + if 'median' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Median of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) def get_discursive_diversity_features(self) -> None: diff --git 
a/src/team_comm_tools/utils/calculate_user_level_features.py b/src/team_comm_tools/utils/calculate_user_level_features.py index e54a6417..26889fae 100644 --- a/src/team_comm_tools/utils/calculate_user_level_features.py +++ b/src/team_comm_tools/utils/calculate_user_level_features.py @@ -1,8 +1,8 @@ # Importing modules from features -from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe +from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe from team_comm_tools.features.get_user_network import * from team_comm_tools.features.user_centroids import * -import warnings +from fuzzywuzzy import process class UserLevelFeaturesCalculator: """ @@ -25,7 +25,7 @@ class UserLevelFeaturesCalculator: :type input_columns: list :param user_aggregation: If true, will aggregate features at the user level :type user_aggregation: bool - :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level + :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level :type user_methods: list :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level :type user_columns: list @@ -57,21 +57,48 @@ def __init__(self, chat_data: pd.DataFrame, if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] else: if user_aggregation == True and len(user_columns) == 0: - warnings.warn( - "Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False." - ) + print("Warning: user_aggregation is True but no user_columns specified. 
Defaulting user_aggregation to False.") self.user_aggregation = False else: user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns))) - if(len(user_columns_in_data) != len(user_columns)): - warnings.warn( + print( "Warning: One or more requested user columns are not present in the data. Ignoring them." ) + + # print(user_columns_in_data, user_columns) + + for i in user_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") self.columns_to_summarize = user_columns_in_data self.summable_columns = ["num_words", "num_chars", "num_messages"] + + # ensure all lowercase + self.user_methods = [col.lower() for col in self.user_methods] + self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize] + + # replace interchangable words in columns_to_summarize + for i in range(len(self.user_methods)): + if self.user_methods[i] == "average": + self.user_methods[i] = "mean" + elif self.user_methods[i] == "maximum": + self.user_methods[i] = "max" + elif self.user_methods[i] == "minimum": + self.user_methods[i] = "min" + elif self.user_methods[i] == "standard deviation": + self.user_methods[i] = "stdev" + elif self.user_methods[i] == "sd": + self.user_methods[i] = "stdev" def calculate_user_level_features(self) -> pd.DataFrame: """ @@ -85,12 +112,12 @@ def calculate_user_level_features(self) -> pd.DataFrame: """ # Get mean features for all features - self.get_user_level_mean_features() + # self.get_user_level_mean_features() # Get total counts for all features self.get_user_level_summed_features() - # Get user summary statistics for all features + # Get user summary statistics for all features (e.g.
mean, min, max, stdev) self.get_user_level_summary_statistics_features() # Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range) @@ -119,40 +146,48 @@ def get_user_level_summary_statistics_features(self) -> None: # For each summarizable feature for column in self.columns_to_summarize: - # # Average/Mean of feature across the Conversation - # if 'mean' in self.user_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # # Standard Deviation of feature across the Conversation - # if 'std' in self.convo_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # # Minima for the feature across the Conversation - # if 'min' in self.convo_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # Maxima for the feature across the Conversation + # Average/Mean of feature across the User + if 'mean' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Maxima for the feature across the User if 'max' in self.user_methods: - # print('HELLO') self.user_data = pd.merge( left=self.user_data, right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col], + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Minima for the feature across the 
User + if 'min' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_min_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Standard Deviation of feature across the User + if 'stdev' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_stdev_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Median of feature across the User + if 'median' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_median_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], how="inner" ) @@ -172,33 +207,39 @@ def get_user_level_summed_features(self) -> None: :rtype: None """ + # For each summarizable feature + for column in self.summable_columns: + + # Sum of feature across the Conversation + self.user_data = pd.merge( + left=self.user_data, + right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) - if self.user_aggregation == True: - - print("summable: ", self.summable_columns) + # if self.user_aggregation == True: - # For each summarizable feature - for column in self.summable_columns: + # # For each summarizable feature + # for column in self.summable_columns: - # Sum of feature across the Conversation - self.user_data = pd.merge( - left=self.user_data, - right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col, self.speaker_id_col], - how="inner" - ) - - print("user columns: ", self.columns_to_summarize) - - for column in self.columns_to_summarize: # TODO --- Gini depends on the summation 
happening; something is happening here where it's causing Gini to break. - if column not in self.summable_columns: - # Sum of feature across the Conversation - self.user_data = pd.merge( - left=self.user_data, - right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col, self.speaker_id_col], - how="inner" - ) + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) + + # for column in self.columns_to_summarize: # TODO --- Gini depends on the summation happening; something is happening here where it's causing Gini to break. + # if column not in self.summable_columns: + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) def get_user_level_mean_features(self) -> None: """ @@ -215,7 +256,7 @@ def get_user_level_mean_features(self) -> None: for column in self.columns_to_summarize: if 'mean' in self.user_methods: - # Average/Mean of feature across the Conversation + # Average/Mean of feature across the User self.user_data = pd.merge( left=self.user_data, right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), diff --git a/src/team_comm_tools/utils/summarize_features.py b/src/team_comm_tools/utils/summarize_features.py index 1ed4bc51..4270c55e 100644 --- a/src/team_comm_tools/utils/summarize_features.py +++ b/src/team_comm_tools/utils/summarize_features.py @@ -56,7 +56,7 @@ def get_user_mean_dataframe(chat_level_data, on_column, conversation_id_col, spe return(grouped_conversation_data) def 
get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): - """Generate a user-level summary DataFrame by maxing a specified column per individual. + """Generate a user-level summary DataFrame by maximizing a specified column per individual. This function groups chat-level data by user and conversation, calculates the max values of a specified numeric column for each user, and returns the resulting DataFrame. @@ -79,11 +79,77 @@ def get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, spea # 0 1 Yuluan 90 return(grouped_conversation_data) -def get_user_min_dataframe(): - pass +def get_user_min_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame by minimizing a specified column per individual. -def get_user_stdev_dataframe(): - pass + This function groups chat-level data by user and conversation, calculates the min values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column to min for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the min of the specified column per individual.
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).min().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "min_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Min Number of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) + +def get_user_stdev_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame with the standard deviation of a specified column per individual. + + This function groups chat-level data by user and conversation, calculates the standard deviation values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column for which to compute the standard deviation for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the standard deviation of the specified column per individual.
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).std().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "stdev_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Standard Deviation of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) + +def get_user_median_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame with the median of a specified column per individual. + + This function groups chat-level data by user and conversation, calculates the median values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column to median for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the median of the specified column per individual. 
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).median().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "median_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Median of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) def get_mean(input_data, column_to_summarize, new_column_name, conversation_id_col): """Generate a summary DataFrame with the mean of a specified column per conversation. @@ -165,6 +231,26 @@ def get_stdev(input_data, column_to_summarize, new_column_name, conversation_id_ input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.std(x)) return(input_data[[conversation_id_col, new_column_name]].drop_duplicates()) +def get_median(input_data, column_to_summarize, new_column_name, conversation_id_col): + """Generate a summary DataFrame with the median of a specified column per conversation. + + This function calculates the median of a specified column for each conversation in the input data, + and returns a DataFrame containing the conversation number and the calculated median. + + :param input_data: The DataFrame containing data at the chat or user level. + :type input_data: pandas.DataFrame + :param column_to_summarize: The name of the column to be aggregated for median. + :type column_to_summarize: str + :param new_column_name: The desired name for the new summary column. + :type new_column_name: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :return: A DataFrame with the conversation number and the median of the specified column. 
+ :rtype: pandas.DataFrame + """ + input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.median(x)) + return(input_data[[conversation_id_col, new_column_name]].drop_duplicates()) + def get_sum(input_data, column_to_summarize, new_column_name, conversation_id_col): """Generate a summary DataFrame with the sum of a specified column per conversation. diff --git a/tests/data/cleaned_data/test_package_aggregation.csv b/tests/data/cleaned_data/test_package_aggregation.csv new file mode 100644 index 00000000..af59ca61 --- /dev/null +++ b/tests/data/cleaned_data/test_package_aggregation.csv @@ -0,0 +1,4 @@ +batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes +0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,This is my message.,1,1,0.333333333,3 +0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1,1,0.333333333,3 +0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,How are you?,1,1,0.333333333,3