Amy/package v2 #326

Merged Dec 3, 2024 · 65 commits
Changes shown below are from 51 of the 65 commits.

Commits
a47e4ea
package aggregation updates
amytangzheng Sep 11, 2024
e3ad8d1
Merge branch 'dev' of https://github.com/Watts-Lab/team_comm_tools in…
amytangzheng Sep 18, 2024
b2ed12a
updates to gini dependency
amytangzheng Sep 18, 2024
c6c64bd
Bump path-to-regexp and express in /website (#298)
dependabot[bot] Sep 23, 2024
e060a44
Bump nltk from 3.8.1 to 3.9 (#297)
dependabot[bot] Sep 23, 2024
6a05e80
Bump body-parser and express in /website (#296)
dependabot[bot] Sep 23, 2024
1c72695
Check embedding update (#295)
xehu Sep 23, 2024
607548a
Merge branch 'main' into dev
xehu Sep 23, 2024
650197e
Update README.md to remove col = "message"
xehu Sep 23, 2024
d35aeb1
updated user aggregation methods (max)
amytangzheng Sep 27, 2024
36cd76e
Closes #302.
xehu Sep 27, 2024
21987f3
Amy/website (#301)
amytangzheng Oct 7, 2024
119efe4
Update github-actions-website.yaml (#309)
xehu Oct 7, 2024
6d25efd
Update github-actions-feature_dict.yaml (#308)
xehu Oct 7, 2024
7e87679
Package updates in Amy/website (#310)
xehu Oct 7, 2024
5678567
Update package-lock.json to local version
xehu Oct 7, 2024
d75837f
Update package-lock.json
xehu Oct 7, 2024
143cb77
Update package.json
xehu Oct 7, 2024
28f85f7
Update package-lock.json
xehu Oct 7, 2024
8b8bd24
Fix "@babel/plugin-proposal-private-property-in-object" error (#311)
xehu Oct 7, 2024
89cd16b
upgrade node packages
xehu Oct 7, 2024
d04037d
update team page + try to remove some of the deprecated packages
xehu Oct 7, 2024
bdf7035
Revert "update team page + try to remove some of the deprecated packa…
xehu Oct 7, 2024
ec2ed64
revert attempts to upgrade packages
xehu Oct 7, 2024
d83f854
Denormalize liwc (#312)
xehu Oct 7, 2024
7905240
address https://github.com/Watts-Lab/team_comm_tools/issues/300 (#313)
xehu Oct 7, 2024
bf762d0
Address issues with making feature names more clear; have cleaner def…
xehu Oct 8, 2024
1dad080
small fix to ensure filtered_dict does not generate in every run
xehu Oct 8, 2024
ed17d7a
merge in main + bump dev's version up for next time
xehu Oct 8, 2024
6b94149
PATCH FIX: Defaults in 0.1.4 were incorrectly specified
xehu Oct 8, 2024
576a376
updates to package aggregation
amytangzheng Oct 16, 2024
fd50f83
Merge pull request #320 from Watts-Lab/temp-dev
xehu Oct 16, 2024
c4200c5
updates to package aggregation
amytangzheng Oct 16, 2024
10f325d
checking valid methods and columns
amytangzheng Oct 23, 2024
653e386
updates to checking numeric columns
amytangzheng Oct 23, 2024
7c9545d
package aggregation updates
amytangzheng Sep 11, 2024
7c73f8d
updates to gini dependency
amytangzheng Sep 18, 2024
b0bbb7a
updated user aggregation methods (max)
amytangzheng Sep 27, 2024
37080e8
updates to package aggregation
amytangzheng Oct 16, 2024
7d75712
updates to package aggregation
amytangzheng Oct 16, 2024
1c861a3
checking valid methods and columns
amytangzheng Oct 23, 2024
1da2ecd
updates to checking numeric columns
amytangzheng Oct 23, 2024
b10bdee
package aggregation updates
amytangzheng Nov 6, 2024
d007ae8
Merge branch 'amy/package_v2' of https://github.com/Watts-Lab/team_co…
amytangzheng Nov 6, 2024
3fca434
updates to package aggregation
amytangzheng Nov 6, 2024
a36107d
updates to requirements.txt
amytangzheng Nov 8, 2024
e050fb6
updates to featurize.py
amytangzheng Nov 8, 2024
3f31f07
updates to checking columns are in data
amytangzheng Nov 8, 2024
4f562cc
Merge branch 'dev' of https://github.com/Watts-Lab/team_comm_tools in…
amytangzheng Nov 10, 2024
2892a3c
remove local file in featurize
xehu Dec 2, 2024
b027d27
remove commented out func in featurize
xehu Dec 2, 2024
84c126e
correct issue with empty conversation aagg features
xehu Dec 2, 2024
4825972
remove excess preprocessing call
xehu Dec 2, 2024
3ba2082
add error checking and removing custom vector functionality from this PR
xehu Dec 2, 2024
1e0d3f2
remove redundant call to preprocess chat data again
xehu Dec 2, 2024
6a7cccf
restore check embeddings call
xehu Dec 3, 2024
51d833f
code refactor for conversation and user aggregation / error checking
xehu Dec 3, 2024
23b957b
correct issue in which we were looking for columns to summarize that …
xehu Dec 3, 2024
c4d5608
rebase with dev
xehu Dec 3, 2024
7e8d985
add back sum functionality (with warning) and clean up user aggs
xehu Dec 3, 2024
c34ee7f
fix issue with user centroids
xehu Dec 3, 2024
a5362bb
update featurize.py
xehu Dec 3, 2024
727b91e
add test for package custom aggregation
xehu Dec 3, 2024
0643dd3
update documentations
xehu Dec 3, 2024
7141bae
update docs
xehu Dec 3, 2024
25 changes: 22 additions & 3 deletions examples/featurize.py
@@ -18,8 +18,8 @@
juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8')
csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8')
csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8')
"""
"""
TINY / TEST DATASETS -------------------------------

These are smaller versions of (real) empirical datasets for the purpose of testing and demonstration.
@@ -51,6 +51,25 @@
)
tiny_juries_feature_builder.featurize()

# Tiny Juries with custom Aggregations
print("Tiny Juries with Custom Aggregation...")
tiny_juries_feature_builder_custom_agg = FeatureBuilder(
input_df = tiny_juries_df,
grouping_keys = ["batch_num", "round_num"],
output_file_base = "jury_TINY_output_custom_agg", # Naming output files using the output_file_base parameter (recommended)
turns = False,
custom_features = [
"(BERT) Mimicry",
"Moving Mimicry",
"Forward Flow",
"Discursive Diversity"],
convo_methods = ['max', 'median'], # This will aggregate ONLY "positive_bert" at the conversation level, using max and median; at the speaker/user level, it will aggregate the three columns below, using max, mean, min, and median.
convo_columns = ['positive_bert'],
user_methods = ['max', 'mean', 'min', 'median'],
user_columns = ['positive_bert', 'negative_bert', 'named_entity_recognition'],
)
tiny_juries_feature_builder_custom_agg.featurize()

# Tiny multi-task
tiny_multi_task_feature_builder = FeatureBuilder(
input_df = tiny_multi_task_df,
@@ -104,4 +123,4 @@
# output_file_path_conv_level = "./csopII_output_conversation_level.csv",
# turns = True
# )
# csopII_feature_builder.featurize()
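
To make the new aggregation parameters concrete, here is a minimal, self-contained sketch of what they are expected to mean. This is illustrative only, not the package's internal implementation, and the "<method>_<column>" naming of aggregated output columns is an assumption, not something confirmed by this diff.

import pandas as pd

# Toy chat-level data: two conversations, one numeric feature column.
chat_df = pd.DataFrame({
    "conversation_num": [1, 1, 2, 2],
    "positive_bert": [0.9, 0.4, 0.7, 0.2],
})

convo_methods = ["max", "median"]   # as in the example above
convo_columns = ["positive_bert"]

# Aggregate the requested columns with the requested methods, per conversation.
agg = chat_df.groupby("conversation_num")[convo_columns].agg(convo_methods)
agg.columns = [f"{method}_{col}" for col, method in agg.columns]  # assumed naming
print(agg)  # columns: max_positive_bert, median_positive_bert
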
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -38,7 +38,8 @@ dependencies = [
"transformers==4.44.0",
"tqdm>=4.66.5",
"tzdata>=2023.3",
"tzlocal==5.2"
"tzlocal==5.2",
"fuzzywuzzy==0.18.0"
]
authors = [
{name = "Xinlan Emily Hu", email = "[email protected]"},
1 change: 1 addition & 0 deletions requirements.txt
@@ -28,3 +28,4 @@ transformers==4.44.0
tqdm>=4.66.5
tzdata>=2023.3
tzlocal==5.2
fuzzywuzzy==0.18.0
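
The new fuzzywuzzy pin lands alongside the "checking valid methods and columns" commits. One plausible use, offered here as an assumption rather than a description of the package's actual code, is suggesting a close match when a requested aggregation column is misspelled:

from fuzzywuzzy import process  # fuzzywuzzy==0.18.0, as pinned above

available_columns = ["positive_bert", "negative_bert", "named_entity_recognition"]
requested = "positve_bert"  # hypothetical user typo

# extractOne returns the best-scoring candidate and its similarity score (0-100).
best_match, score = process.extractOne(requested, available_columns)
if score >= 80:  # threshold is illustrative
    print(f"Column '{requested}' not found; did you mean '{best_match}'?")
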
94 changes: 80 additions & 14 deletions src/team_comm_tools/feature_builder.py
@@ -66,7 +66,8 @@ class FeatureBuilder:
:param timestamp_col: A string representing the column name that should be selected as the timestamp. Defaults to "timestamp".
:type timestamp_col: str, optional

:param grouping_keys: A list of multiple identifiers that collectively identify a conversation. If non-empty, we will group by all of the keys in the list and use the
grouped key as the unique "conversational identifier."
Defaults to an empty list.
:type grouping_keys: list, optional

@@ -86,11 +87,31 @@
:param ner_cutoff: This is the cutoff value for the confidence of prediction for each named entity. Defaults to 0.9.
:type ner_cutoff: int

:param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False.
:type regenerate_vectors: bool, optional

:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.
:type compute_vectors_from_preprocessed: bool, optional
:param custom_vect_path: If provided, features will be generated using custom vectors rather than default SBERT. Defaults to None.
[Review comment - Collaborator] Note: need to update documentation
[Review comment - Collaborator] RESOLVED

:type custom_vect_path: str, optional

:param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True.
:type convo_aggregation: bool, optional

:param convo_methods: Specifies which functions you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev'].
:type convo_methods: list, optional

:param convo_columns: Specifies which columns (at the utterance/chat level) you want aggregated at the conversational level. Defaults to all numeric columns.
:type convo_columns: list, optional

:param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True.
:type user_aggregation: bool, optional

:param user_methods: Specifies which functions you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev'].
:type user_methods: list, optional

:param user_columns: Specifies which columns (at the utterance/chat level) you want aggregated at the speaker/user level. Defaults to all numeric columns.
:type user_columns: list, optional

:return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated.
:rtype: None
@@ -117,7 +138,14 @@ def __init__(
ner_training_df: pd.DataFrame = None,
ner_cutoff: int = 0.9,
regenerate_vectors: bool = False,
compute_vectors_from_preprocessed: bool = False,
custom_vect_path: str = None,
convo_aggregation = True,
convo_methods: list = ['mean', 'max', 'min', 'stdev'],
convo_columns: list = None,
user_aggregation = True,
user_methods: list = ['mean', 'max', 'min', 'stdev'],
user_columns: list = None
) -> None:

# Defining input and output paths.
@@ -224,6 +252,12 @@ def __init__(
self.within_task = within_task
self.ner_cutoff = ner_cutoff
self.regenerate_vectors = regenerate_vectors
self.convo_aggregation = convo_aggregation
self.convo_methods = convo_methods
self.convo_columns = convo_columns
self.user_aggregation = user_aggregation
self.user_methods = user_methods
self.user_columns = user_columns

if(compute_vectors_from_preprocessed == True):
self.vector_colname = self.message_col # because the message col will eventually get preprocessed
@@ -358,7 +392,24 @@ def __init__(
if not re.match(r"(.*\/|^)output\/", self.output_file_path_user_level):
self.output_file_path_user_level = re.sub(r'/user/', r'/output/user/', self.output_file_path_user_level)

if custom_vect_path is not None:
[Review comment - Collaborator] Note: it seems like this PR builds in some of the initial infrastructure for custom vectors (document this)
[Review comment - Collaborator] RESOLVED -- custom vector infrastructure has been removed

print("Detected that user has requested custom vectors...")
print("We will generate features using custom vectors rather than default SBERT")
self.vect_path = custom_vect_path
else:
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

if custom_vect_path is not None:
print("Detected that user has requested custom vectors...")
print("We will generate features using custom vectors rather than default SBERT")
self.vect_path = custom_vect_path
else:
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name

# Check + generate embeddings
@@ -375,7 +426,11 @@
if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]):
need_sentiment = True

# preprocess chat data again
self.preprocess_chat_data()
# preprocess chat data again
self.preprocess_chat_data()
check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.message_col)

if(need_sentence):
self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman')
@@ -487,7 +542,12 @@ def featurize(self) -> None:
Path(self.output_file_path_user_level).parent.mkdir(parents=True, exist_ok=True)
Path(self.output_file_path_chat_level).parent.mkdir(parents=True, exist_ok=True)
Path(self.output_file_path_conv_level).parent.mkdir(parents=True, exist_ok=True)


# Store column names of what we generated, so that the user can easily access them
self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"]))
self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"]))
self.conv_features_all = [col for col in self.conv_data if col not in self.orig_data and col != 'conversation_num']

[Review comment - Collaborator] Note --- check this; we likely want the last line self.conv_features_all to appear AFTER we actually generate the features, so moving this line of code up may not work.
[Review comment - Collaborator] UPDATE: fixed

# Step 3a. Create user level features.
print("Generating User Level Features ...")
self.user_level_features()
Expand All @@ -497,14 +557,9 @@ def featurize(self) -> None:
self.conv_level_features()
self.merge_conv_data_with_original()

# Step 4. Write the features into the files defined in the output paths.
print("All Done!")


self.save_features()

def preprocess_chat_data(self) -> None:
Expand Down Expand Up @@ -607,7 +662,11 @@ def user_level_features(self) -> None:
vect_data= self.vect_data,
conversation_id_col = self.conversation_id_col,
speaker_id_col = self.speaker_id_col,
input_columns = self.input_columns,
user_aggregation = self.user_aggregation,
user_methods = self.user_methods,
user_columns = self.user_columns,
chat_features = self.chat_features
)
self.user_data = user_feature_builder.calculate_user_level_features()
# Remove special characters in column names
@@ -633,7 +692,14 @@ def conv_level_features(self) -> None:
speaker_id_col = self.speaker_id_col,
message_col = self.message_col,
timestamp_col = self.timestamp_col,
input_columns = self.input_columns,
convo_aggregation = self.convo_aggregation,
convo_methods = self.convo_methods,
convo_columns = self.convo_columns,
user_aggregation = self.user_aggregation,
user_methods = self.user_methods,
user_columns = self.user_columns,
chat_features = self.chat_features,
)
# Calling the driver inside this class to create the features.
self.conv_data = conv_feature_builder.calculate_conversation_level_features(self.feature_methods_conv)
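
Putting the new surface area together, here is a minimal usage sketch. The attribute names (chat_features, conv_features_all) and the aggregation parameters come from this diff; the import path, input file, and output_file_base value are hypothetical, and the exact contents of the generated columns are not shown in this PR.

from team_comm_tools import FeatureBuilder  # import path assumed from the package name
import pandas as pd

my_chat_df = pd.read_csv("my_chats.csv")  # hypothetical input data

builder = FeatureBuilder(
    input_df = my_chat_df,
    output_file_base = "my_output",      # hypothetical output name
    convo_methods = ['max', 'median'],   # aggregate conversations with max/median...
    convo_columns = ['positive_bert'],   # ...over this chat-level column only
    user_methods = ['mean'],             # aggregate speakers with the mean
    user_columns = ['positive_bert'],
)
builder.featurize()

# After featurize(), the builder stores the names of what it generated:
print(builder.chat_features)      # chat-level feature columns
print(builder.conv_features_all)  # all conversation-level columns, incl. aggregates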