From 576a376899db997b6e2a5871ccba77b338b6bbcc Mon Sep 17 00:00:00 2001 From: amytangzheng Date: Wed, 16 Oct 2024 14:38:51 -0400 Subject: [PATCH] updates to package aggregation --- examples/featurize.py | 46 +++-- src/team_comm_tools/feature_builder.py | 8 +- .../calculate_conversation_level_features.py | 186 +++++++++--------- .../utils/calculate_user_level_features.py | 171 ++++++++++------ .../utils/summarize_features.py | 96 ++++++++- .../cleaned_data/test_package_aggregation.csv | 4 + 6 files changed, 327 insertions(+), 184 deletions(-) create mode 100644 tests/data/cleaned_data/test_package_aggregation.csv diff --git a/examples/featurize.py b/examples/featurize.py index 70f184f9..05f2b958 100644 --- a/examples/featurize.py +++ b/examples/featurize.py @@ -18,6 +18,9 @@ juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8') csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8') csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8') + test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8') + + # C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv """ TINY / TEST DATASETS ------------------------------- @@ -68,7 +71,7 @@ # ) # tiny_juries_feature_builder_custom.featurize(col="message") - # # Tiny Juries with NO aggregations + # Tiny Juries with NO aggregations # print("Tiny Juries with No Aggregation...") # tiny_juries_feature_builder_no_aggregation = FeatureBuilder( # input_df = tiny_juries_df, @@ -84,23 +87,38 @@ # tiny_juries_feature_builder_no_aggregation.featurize(col="message") # Tiny Juries with custom Aggregations - print("Tiny Juries with Custom Aggregation...") - tiny_juries_feature_builder_custom_aggregation = FeatureBuilder( - 
input_df = tiny_juries_df, + # print("Tiny Juries with Custom Aggregation...") + # tiny_juries_feature_builder_custom_aggregation = FeatureBuilder( + # input_df = tiny_juries_df, + # grouping_keys = ["batch_num", "round_num"], + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv", + # output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv", + # output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv", + # convo_methods = ['max', 'median'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. + # convo_columns = ['positive_bert'], + # user_methods = ['max', 'mean', 'min', 'median'], + # user_columns = ['positive_bert', 'negative_bert'], + # turns = False, + # ) + # tiny_juries_feature_builder_custom_aggregation.featurize(col="message") + + # Testing package aggregation + print("Testing package aggregation...") + testing_feature_builder_custom_aggregation = FeatureBuilder( + input_df = test_df, grouping_keys = ["batch_num", "round_num"], vector_directory = "./vector_data/", - output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv", - output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv", - output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv", - convo_methods = ['mean'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. 
- convo_columns = ['positive_bert'], - user_methods = ['mean', 'max'], - user_columns = ['positive_bert', 'negative_bert'], - # user_methods = ['max'], - # user_columns = ['negative_bert'], + output_file_path_chat_level = "./test_package_TINY_chat_level_custom_agg.csv", + output_file_path_user_level = "./test_package_TINY_user_level_custom_agg.csv", + output_file_path_conv_level = "./test_package_TINY_conversation_level_custom_agg.csv", + convo_methods = ['max'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max. + convo_columns = ['positive_bert', 'negativity_bert'], + user_methods = ['MAX'], + user_columns = ['negative_bert'], turns = False, ) - tiny_juries_feature_builder_custom_aggregation.featurize(col="message") + testing_feature_builder_custom_aggregation.featurize(col="message") # # Tiny multi-task diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py index 8b28b0dc..8d523716 100644 --- a/src/team_comm_tools/feature_builder.py +++ b/src/team_comm_tools/feature_builder.py @@ -92,7 +92,7 @@ class FeatureBuilder: :param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True. :type convo_aggregation: bool, optional - :param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the conversational level. Defaults to ['mean', 'max', 'min', 'std']. + :param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev']. :type convo_methods: list, optional :param convo_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the conversational level. Defauts to all all numeric columns. 
@@ -101,7 +101,7 @@ class FeatureBuilder: :param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True. :type convo_aggregation: bool, optional - :param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'std']. + :param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev']. :type convo_methods: list, optional :param user_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the speaker/user level. Defauts to all all numeric columns. @@ -133,10 +133,10 @@ def __init__( regenerate_vectors: bool = False, custom_vect_path: str = None, convo_aggregation = True, - convo_methods: list = ['mean', 'max', 'min', 'std'], + convo_methods: list = ['mean', 'max', 'min', 'stdev'], convo_columns: list = None, user_aggregation = True, - user_methods: list = ['mean', 'max', 'min', 'std'], + user_methods: list = ['mean', 'max', 'min', 'stdev'], user_columns: list = None ) -> None: diff --git a/src/team_comm_tools/utils/calculate_conversation_level_features.py b/src/team_comm_tools/utils/calculate_conversation_level_features.py index 7a4c4032..aa148fbb 100644 --- a/src/team_comm_tools/utils/calculate_conversation_level_features.py +++ b/src/team_comm_tools/utils/calculate_conversation_level_features.py @@ -7,6 +7,7 @@ from team_comm_tools.utils.summarize_features import * from team_comm_tools.utils.gini_coefficient import * from team_comm_tools.utils.preprocess import * +from fuzzywuzzy import process class ConversationLevelFeaturesCalculator: """ @@ -29,13 +30,13 @@ class ConversationLevelFeaturesCalculator: :type input_columns: list :param convo_aggregation: If true, will aggregate features at the conversational level :type convo_aggregation: bool - :param convo_methods: Specifies which 
functions users want to aggregate with (e.g., mean, std...) + :param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) :type convo_methods: list :param convo_columns: Specifies which columns (at the chat level) users want aggregated :type convo_columns: list :param user_aggregation: If true, will aggregate features at the user level :type convo_aggregation: bool - :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level + :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level :type user_methods: list :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level :type user_columns: list @@ -78,12 +79,13 @@ def __init__(self, chat_data: pd.DataFrame, if 'conversation_num' not in self.input_columns: self.input_columns.append('conversation_num') + # check if user inputted convo_columns is None if convo_columns is None: self.columns_to_summarize = [column for column in self.chat_data.columns \ if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] else: if convo_aggregation == True and len(convo_columns) == 0: - warnings.warn( + print( "Warning: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False." ) self.convo_aggregation = False @@ -91,14 +93,75 @@ def __init__(self, chat_data: pd.DataFrame, convo_columns_in_data = list(set(convo_columns).intersection(set(self.chat_data.columns))) if(len(convo_columns_in_data) != len(convo_columns)): - warnings.warn( + print( "Warning: One or more requested user columns are not present in the data. Ignoring them." 
) + for i in convo_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") + + self.columns_to_summarize = convo_columns_in_data + + # check if user inputted user_columns is None + if user_columns is None: + self.user_columns = [column for column in self.chat_data.columns \ + if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] + else: + if user_aggregation == True and len(user_columns) == 0: + print("Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.") + self.user_aggregation = False + else: + user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns))) + if(len(user_columns_in_data) != len(user_columns)): + print( + "Warning: One or more requested user columns are not present in the data. Ignoring them." 
+ ) + + print(user_columns_in_data, user_columns) + + for i in user_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") + + self.user_columns = user_columns_in_data self.summable_columns = ["num_words", "num_chars", "num_messages"] + # ensure all lowercase + self.convo_methods = [col.lower() for col in self.convo_methods] + self.user_methods = [col.lower() for col in self.user_methods] + self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize] + self.user_columns = [col.lower() for col in self.user_columns] + + # replace interchangable words in columns_to_summarize + for i in range(len(self.convo_methods)): + if self.convo_methods[i] == "average": + self.convo_methods[i] = "mean" + elif self.convo_methods[i] == "maximum": + self.convo_methods[i] = "max" + elif self.convo_methods[i] == "minimum": + self.convo_methods[i] = "min" + elif self.convo_methods[i] == "standard deviation": + self.convo_methods[i] = "stdev" + elif self.convo_methods[i] == "sd": + self.convo_methods[i] = "stdev" + def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame: """ Main driver function for creating conversation-level features.
@@ -185,7 +248,7 @@ def get_conversation_level_aggregates(self) -> None: ) # Standard Deviation of feature across the Conversation - if 'std' in self.convo_methods: + if 'stdev' in self.convo_methods: self.conv_data = pd.merge( left=self.conv_data, right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), @@ -210,6 +273,15 @@ def get_conversation_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) + + # Median for the feature across the Conversation + if 'median' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) # Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini. for column in self.summable_columns: @@ -242,7 +314,7 @@ def get_user_level_aggregates(self) -> None: if self.convo_aggregation == True and self.user_aggregation == True: - # this may be right?? 
+ # aggregates from the user level based on conversation methods if 'mean' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: @@ -254,7 +326,7 @@ def get_user_level_aggregates(self) -> None: how="inner" ) - if 'std' in self.convo_methods: + if 'stdev' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: # Standard Deviation of User-Level Feature @@ -286,95 +358,17 @@ def get_user_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) - - - # Sum Columns were created using self.get_user_level_summed_features() - # for column in self.columns_to_summarize: - # # change to self.user_columns - # # should be summable_columns - - # # for method in self.user_methods: - # # self.conv_data = pd.merge( - # # left=self.conv_data, - # # right=get_average(self.user_data.copy(), method+"_"+column, 'average_user_' + method + "_" +column, self.conversation_id_col), - # # on=[self.conversation_id_col], - # # how="inner" - # # ) - - # if 'mean' in self.convo_methods: - # # Average/Mean of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'std' in self.convo_methods: - # # Standard Deviation of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'min' in self.convo_methods: - # # Minima of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # if 'max' in self.convo_methods: - # # 
Maxima of User-Level Feature - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # Average Columns were created using self.get_user_level_mean_features() - for column in self.columns_to_summarize: - - if 'mean' in self.convo_methods: - # Average/Mean of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_mean(self.user_data.copy(), "mean_"+column, 'mean_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'std' in self.convo_methods: - # Standard Deviation of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_stdev(self.user_data.copy(), "mean_"+column, 'stdev_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'min' in self.convo_methods: - # Minima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_min(self.user_data.copy(), "mean_"+column, 'min_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) - - if 'max' in self.convo_methods: - # Maxima of User-Level Feature - self.conv_data = pd.merge( - left=self.conv_data, - right=get_max(self.user_data.copy(), "mean_"+column, 'max_user_avg_'+column, self.conversation_id_col), - on=[self.conversation_id_col], - how="inner" - ) + + if 'median' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Median of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) def get_discursive_diversity_features(self) -> None: diff --git 
a/src/team_comm_tools/utils/calculate_user_level_features.py b/src/team_comm_tools/utils/calculate_user_level_features.py index e54a6417..26889fae 100644 --- a/src/team_comm_tools/utils/calculate_user_level_features.py +++ b/src/team_comm_tools/utils/calculate_user_level_features.py @@ -1,8 +1,8 @@ # Importing modules from features -from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe +from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe from team_comm_tools.features.get_user_network import * from team_comm_tools.features.user_centroids import * -import warnings +from fuzzywuzzy import process class UserLevelFeaturesCalculator: """ @@ -25,7 +25,7 @@ class UserLevelFeaturesCalculator: :type input_columns: list :param user_aggregation: If true, will aggregate features at the user level :type user_aggregation: bool - :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level + :param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level :type user_methods: list :param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level :type user_columns: list @@ -57,21 +57,48 @@ def __init__(self, chat_data: pd.DataFrame, if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])] else: if user_aggregation == True and len(user_columns) == 0: - warnings.warn( - "Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False." - ) + print("Warning: user_aggregation is True but no user_columns specified. 
Defaulting user_aggregation to False.") self.user_aggregation = False else: user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns))) - if(len(user_columns_in_data) != len(user_columns)): - warnings.warn( + print( "Warning: One or more requested user columns are not present in the data. Ignoring them." ) + + # print(user_columns_in_data, user_columns) + + for i in user_columns: + matches = process.extract(i, self.chat_data.columns, limit=3) + best_match, similarity = matches[0] + + if similarity == 100: + continue + elif similarity >= 80: + print("Did you mean", best_match, "instead of", i, "?") + else: + print(i, "not found in data and no close match.") self.columns_to_summarize = user_columns_in_data self.summable_columns = ["num_words", "num_chars", "num_messages"] + + # ensure all lowercase + self.user_methods = [col.lower() for col in self.user_methods] + self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize] + + # replace interchangable words in columns_to_summarize + for i in range(len(self.user_methods)): + if self.user_methods[i] == "average": + self.user_methods[i] = "mean" + elif self.user_methods[i] == "maximum": + self.user_methods[i] = "max" + elif self.user_methods[i] == "minimum": + self.user_methods[i] = "min" + elif self.user_methods[i] == "standard deviation": + self.user_methods[i] = "stdev" + elif self.user_methods[i] == "sd": + self.user_methods[i] = "stdev" def calculate_user_level_features(self) -> pd.DataFrame: """ @@ -85,12 +112,12 @@ def calculate_user_level_features(self) -> pd.DataFrame: """ # Get mean features for all features - self.get_user_level_mean_features() + # self.get_user_level_mean_features() # Get total counts for all features self.get_user_level_summed_features() - # Get user summary statistics for all features + # Get user summary statistics for all features (e.g.
mean, min, max, stdev) self.get_user_level_summary_statistics_features() # Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range) @@ -119,40 +146,48 @@ def get_user_level_summary_statistics_features(self) -> None: # For each summarizable feature for column in self.columns_to_summarize: - # # Average/Mean of feature across the Conversation - # if 'mean' in self.user_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # # Standard Deviation of feature across the Conversation - # if 'std' in self.convo_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # # Minima for the feature across the Conversation - # if 'min' in self.convo_methods: - # self.conv_data = pd.merge( - # left=self.conv_data, - # right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col), - # on=[self.conversation_id_col], - # how="inner" - # ) - - # Maxima for the feature across the Conversation + # Average/Mean of feature across the User + if 'mean' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Maxima for the feature across the User if 'max' in self.user_methods: - # print('HELLO') self.user_data = pd.merge( left=self.user_data, right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col], + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Minima for the feature across the 
User + if 'min' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_min_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Standard Deviation of feature across the User + if 'stdev' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_stdev_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Median of feature across the User + if 'median' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_median_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], how="inner" ) @@ -172,33 +207,39 @@ def get_user_level_summed_features(self) -> None: :rtype: None """ + # For each summarizable feature + for column in self.summable_columns: + + # Sum of feature across the Conversation + self.user_data = pd.merge( + left=self.user_data, + right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) - if self.user_aggregation == True: - - print("summable: ", self.summable_columns) + # if self.user_aggregation == True: - # For each summarizable feature - for column in self.summable_columns: + # # For each summarizable feature + # for column in self.summable_columns: - # Sum of feature across the Conversation - self.user_data = pd.merge( - left=self.user_data, - right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col, self.speaker_id_col], - how="inner" - ) - - print("user columns: ", self.columns_to_summarize) - - for column in self.columns_to_summarize: # TODO --- Gini depends on the summation 
happening; something is happening here where it's causing Gini to break. - if column not in self.summable_columns: - # Sum of feature across the Conversation - self.user_data = pd.merge( - left=self.user_data, - right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), - on=[self.conversation_id_col, self.speaker_id_col], - how="inner" - ) + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) + + # for column in self.columns_to_summarize: # TODO --- Gini depends on the summation happening; something is happening here where it's causing Gini to break. + # if column not in self.summable_columns: + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) def get_user_level_mean_features(self) -> None: """ @@ -215,7 +256,7 @@ def get_user_level_mean_features(self) -> None: for column in self.columns_to_summarize: if 'mean' in self.user_methods: - # Average/Mean of feature across the Conversation + # Average/Mean of feature across the User self.user_data = pd.merge( left=self.user_data, right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), diff --git a/src/team_comm_tools/utils/summarize_features.py b/src/team_comm_tools/utils/summarize_features.py index 1ed4bc51..4270c55e 100644 --- a/src/team_comm_tools/utils/summarize_features.py +++ b/src/team_comm_tools/utils/summarize_features.py @@ -56,7 +56,7 @@ def get_user_mean_dataframe(chat_level_data, on_column, conversation_id_col, spe return(grouped_conversation_data) def 
get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): - """Generate a user-level summary DataFrame by maxing a specified column per individual. + """Generate a user-level summary DataFrame by maximizing a specified column per individual. This function groups chat-level data by user and conversation, calculates the max values of a specified numeric column for each user, and returns the resulting DataFrame. @@ -79,11 +79,77 @@ def get_user_max_dataframe(chat_level_data, on_column, conversation_id_col, spea # 0 1 Yuluan 90 return(grouped_conversation_data) -def get_user_min_dataframe(): - pass +def get_user_min_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame by minimizing a specified column per individual. -def get_user_stdev_dataframe(): - pass + This function groups chat-level data by user and conversation, calculates the min values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column to min for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the min of the specified column per individual.
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).min().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "min_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Min Number of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) + +def get_user_stdev_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame with the standard deviation of a specified column per individual. + + This function groups chat-level data by user and conversation, calculates the standard deviation values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column for which to compute the standard deviation for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the standard deviation of the specified column per individual.
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).std().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "stdev_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Standard Deviation of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) + +def get_user_median_dataframe(chat_level_data, on_column, conversation_id_col, speaker_id_col): + """Generate a user-level summary DataFrame with the median of a specified column per individual. + + This function groups chat-level data by user and conversation, calculates the median values + of a specified numeric column for each user, and returns the resulting DataFrame. + + :param chat_level_data: The DataFrame in which each row represents a single chat. + :type chat_level_data: pandas.DataFrame + :param on_column: The name of the numeric column to median for each user. + :type on_column: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :param speaker_id: The column name representing the user identifier. + :type speaker_id: str + :return: A grouped DataFrame with the median of the specified column per individual. 
+ :rtype: pandas.DataFrame + """ + grouped_conversation_data = chat_level_data[[conversation_id_col, speaker_id_col, on_column]].groupby([conversation_id_col, speaker_id_col]).median().reset_index() + grouped_conversation_data = grouped_conversation_data.rename(columns = {on_column: "median_"+on_column}) # gets this dataframe: + # Batch# Round# Speaker Median of Words + # 0 1 Priya 100 + # 0 1 Yuluan 90 + return(grouped_conversation_data) def get_mean(input_data, column_to_summarize, new_column_name, conversation_id_col): """Generate a summary DataFrame with the mean of a specified column per conversation. @@ -165,6 +231,26 @@ def get_stdev(input_data, column_to_summarize, new_column_name, conversation_id_ input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.std(x)) return(input_data[[conversation_id_col, new_column_name]].drop_duplicates()) +def get_median(input_data, column_to_summarize, new_column_name, conversation_id_col): + """Generate a summary DataFrame with the median of a specified column per conversation. + + This function calculates the median of a specified column for each conversation in the input data, + and returns a DataFrame containing the conversation number and the calculated median. + + :param input_data: The DataFrame containing data at the chat or user level. + :type input_data: pandas.DataFrame + :param column_to_summarize: The name of the column to be aggregated for median. + :type column_to_summarize: str + :param new_column_name: The desired name for the new summary column. + :type new_column_name: str + :param conversation_id_col: A string representing the column name that should be selected as the conversation ID. + :type conversation_id_col: str + :return: A DataFrame with the conversation number and the median of the specified column. 
+ :rtype: pandas.DataFrame + """ + input_data[new_column_name] = input_data.groupby([conversation_id_col], sort=False)[column_to_summarize].transform(lambda x: np.median(x)) + return(input_data[[conversation_id_col, new_column_name]].drop_duplicates()) + def get_sum(input_data, column_to_summarize, new_column_name, conversation_id_col): """Generate a summary DataFrame with the sum of a specified column per conversation. diff --git a/tests/data/cleaned_data/test_package_aggregation.csv b/tests/data/cleaned_data/test_package_aggregation.csv new file mode 100644 index 00000000..af59ca61 --- /dev/null +++ b/tests/data/cleaned_data/test_package_aggregation.csv @@ -0,0 +1,4 @@ +batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes +0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,This is my message.,1,1,0.333333333,3 +0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1,1,0.333333333,3 +0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,How are you?,1,1,0.333333333,3