Bring your own LIWC & matplotlib dependency fix #322

Merged: 11 commits, Dec 3, 2024
1 change: 1 addition & 0 deletions pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
"emoji==1.7.0",
"flask==3.0.3",
"gensim>=4.3.3",
"matplotlib>=3.0.0",
"nltk==3.9.1",
"numpy<2.0.0",
"pandas==2.2.2",
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ convokit==3.0.0
emoji==1.7.0
flask==3.0.3
gensim>=4.3.3
matplotlib>=3.0.0
nltk==3.9.1
numpy<2.0.0
pandas==2.2.2
26 changes: 24 additions & 2 deletions src/team_comm_tools/feature_builder.py
@@ -92,6 +92,9 @@ class FeatureBuilder:
:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.
:type compute_vectors_from_preprocessed: bool, optional

:param custom_liwc_dictionary_path: This is the path of the user's own LIWC dictionary file (.dic). Defaults to an empty string.
:type custom_liwc_dictionary_path: str, optional

:return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated.
:rtype: None

@@ -117,7 +120,8 @@ def __init__(
ner_training_df: pd.DataFrame = None,
ner_cutoff: int = 0.9,
regenerate_vectors: bool = False,
-compute_vectors_from_preprocessed: bool = False
+compute_vectors_from_preprocessed: bool = False,
+custom_liwc_dictionary_path: str = ''
) -> None:

# Defining input and output paths.
@@ -128,6 +132,23 @@

print("Initializing Featurization...")

if not custom_liwc_dictionary_path:
self.custom_liwc_dictionary = {}
else:
# Read .dic file if the path is provided
custom_liwc_dictionary_path = Path(custom_liwc_dictionary_path)
if not custom_liwc_dictionary_path.suffix == '.dic':
print(f"WARNING: The custom LIWC dictionary file is not a .dic file: {custom_liwc_dictionary_path}")
self.custom_liwc_dictionary = {}
else:
with open(custom_liwc_dictionary_path, 'r', encoding='utf-8-sig') as file:
dicText = file.read()
try:
self.custom_liwc_dictionary = load_liwc_dict(dicText)
except Exception as e:
print(f"WARNING: Failed loading custom liwc dictionary: {e}")
self.custom_liwc_dictionary = {}

# Set features to generate
# TODO --- think through more carefully which ones we want to exclude and why
self.feature_dict = feature_dict
@@ -564,7 +585,8 @@ def chat_level_features(self) -> None:
ner_cutoff = self.ner_cutoff,
conversation_id_col = self.conversation_id_col,
message_col = self.message_col,
-timestamp_col = self.timestamp_col
+timestamp_col = self.timestamp_col,
+custom_liwc_dictionary = self.custom_liwc_dictionary
)
# Calling the driver inside this class to create the features.
self.chat_data = chat_feature_builder.calculate_chat_level_features(self.feature_methods_chat)
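For reference, a minimal sketch of how the new option would be used once this PR is merged. The import path, the input argument, and the featurize() call are assumptions based on typical FeatureBuilder usage and are not part of this diff; only custom_liwc_dictionary_path is introduced here.

import pandas as pd
from team_comm_tools import FeatureBuilder  # assumed import path

chat_df = pd.read_csv("./my_chats.csv")  # hypothetical input with a "message" column

fb = FeatureBuilder(
    input_df=chat_df,  # assumed argument name
    custom_liwc_dictionary_path="./my_custom_liwc.dic"  # new in this PR; must point to a .dic file
)
fb.featurize()  # assumed driver method; per the docstring, prints "All Done!" when finished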
22 changes: 15 additions & 7 deletions src/team_comm_tools/features/lexical_features_v2.py
@@ -26,14 +26,15 @@ def get_liwc_count(regex, chat):
else:
return 0

-def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
+def liwc_features(chat_df: pd.DataFrame, message_col: str, custom_liwc_dictionary: dict={}) -> pd.DataFrame:
"""
This function takes in the chat level input dataframe and computes lexical features
(the number of words from a given lexicon, such as LIWC).

Args:
chat_df (pd.DataFrame): This is a pandas dataframe of the chat level features. Should contain 'message' column.
message_col (str): This is a string with the name of the column containing the message / text.
custom_liwc_dictionary (dict): This is a dictionary parsed from the user's custom LIWC .dic file.

Returns:
pd.DataFrame: Dataframe of the lexical features stacked as columns.
@@ -47,12 +47,19 @@ def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
lexicons_dict = pickle.load(lexicons_pickle_file)

# Return the lexical features stacked as columns
-return pd.concat(
+# return pd.concat(
Collaborator: Leaving a note to double check the commented-out code here

Collaborator: Ah, this is fine; it just comments out the previous code, which directly concats the dataframe, and replaces it with the new code, which calls it a second time if the custom dictionary is present.

# Finding the # of occurrences of lexicons of each type for all the messages.
-[pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
+df_lst = [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
.rename({message_col + "_original": lexicon_type + "_lexical_wordcount"}, axis=1)\
-for lexicon_type, regex in lexicons_dict.items()],
-axis=1
-)
-except:
+for lexicon_type, regex in lexicons_dict.items()]
+# , axis=1
+# )
+if custom_liwc_dictionary:
+df_lst += [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
+.rename({message_col + "_original": lexicon_type + "_lexical_wordcount_custom"}, axis=1)\
+for lexicon_type, regex in custom_liwc_dictionary.items()]
+return pd.concat(df_lst, axis=1)
+except FileNotFoundError:
print("WARNING: Lexicons not found. Skipping feature...")
+except Exception as e:
+print(f'WARNING: Failed to generate lexicons due to unexpected error: {e}')
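To make the new column naming concrete, here is a small self-contained sketch. The body of get_liwc_count is an assumption based on its truncated definition above (it appears to count regex matches and return 0 otherwise); the dictionary and dataframe are illustrative.

import re
import pandas as pd

def get_liwc_count(regex, chat):
    # assumed behavior: number of regex matches in the message text
    if isinstance(chat, str):
        return len(re.findall(regex, chat))
    return 0

custom_liwc_dictionary = {"posemo": r"\bhappy\S*\b"}  # as produced by load_liwc_dict
chat_df = pd.DataFrame({"message_original": ["so happy and happier", "meh"]})

df_lst = [pd.DataFrame(chat_df["message_original"].apply(lambda chat: get_liwc_count(regex, chat)))
              .rename({"message_original": lexicon_type + "_lexical_wordcount_custom"}, axis=1)
          for lexicon_type, regex in custom_liwc_dictionary.items()]
print(pd.concat(df_lst, axis=1))  # a single "posemo_lexical_wordcount_custom" column: 2, 0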
41 changes: 28 additions & 13 deletions src/team_comm_tools/utils/calculate_chat_level_features.py
@@ -32,14 +32,30 @@ class ChatLevelFeaturesCalculator:

:param chat_data: Pandas dataframe of chat-level features read from the input dataset
:type chat_data: pd.DataFrame

:param vect_data: Pandas dataframe containing vector data
:type vect_data: pd.DataFrame

:param bert_sentiment_data: Pandas dataframe containing BERT sentiment data
:type bert_sentiment_data: pd.DataFrame
-:param ner_training_df: This is a pandas dataframe of training data for named entity recognition feature
-:type ner_training_df: pd.DataFrame
+:param ner_training: This is a pandas dataframe of training data for named entity recognition feature
+:type ner_training: pd.DataFrame

:param ner_cutoff: This is the cutoff value for the confidence of prediction for each named entity
:type ner_cutoff: int

:param conversation_id_col: A string representing the column name that should be selected as the conversation ID. Defaults to "conversation_num".
:type conversation_id_col: str

:param message_col: A string representing the column name that should be selected as the message. Defaults to "message".
:type message_col: str

:param timestamp_col: A string representing the column name that should be selected as the timestamp. Defaults to "timestamp".
:type timestamp_col: str

:param custom_liwc_dictionary: This is the user's own LIWC dictionary. Defaults to an empty dictionary.
:type custom_liwc_dictionary: dict
"""

def __init__(
@@ -51,8 +67,9 @@ def __init__(
ner_cutoff: int,
conversation_id_col: str,
message_col: str,
-timestamp_col: str | tuple[str, str]
-) -> None:
+timestamp_col: str | tuple[str, str],
+custom_liwc_dictionary: dict
+) -> None:

self.chat_data = chat_data
self.vect_data = vect_data
@@ -62,12 +79,11 @@ def __init__(
self.conversation_id_col = conversation_id_col
self.timestamp_col = timestamp_col
self.message_col = message_col
-# load easy Dale-Chall words exactly once.
-self.easy_dale_chall_words = get_dale_chall_easy_words()
-self.function_words = get_function_words() # load function words exactly once
-self.question_words = get_question_words() # load question words exactly once
-# load first person words exactly once
-self.first_person = get_first_person_words()
+self.custom_liwc_dictionary = custom_liwc_dictionary
+self.easy_dale_chall_words = get_dale_chall_easy_words() # load easy Dale-Chall words exactly once.
+self.function_words = get_function_words() # load function words exactly once
+self.question_words = get_question_words() # load question words exactly once
+self.first_person = get_first_person_words() # load first person words exactly once

def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
"""
@@ -182,9 +198,8 @@ def lexical_features(self) -> None:
:return: None
:rtype: None
"""
-self.chat_data = pd.concat(
-[self.chat_data, liwc_features(self.chat_data, self.message_col)], axis=1)
+self.chat_data = pd.concat([self.chat_data, liwc_features(self.chat_data, self.message_col, self.custom_liwc_dictionary)], axis = 1)

def calculate_hedge_features(self) -> None:
"""
Calculate features related to expressing hesitation (or 'hedge').
104 changes: 99 additions & 5 deletions src/team_comm_tools/utils/check_embeddings.py
@@ -24,7 +24,8 @@
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Check if embeddings exist
-def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentiment, regenerate_vectors, message_col = "message"):
+def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool,
+need_sentiment: bool, regenerate_vectors: bool, message_col: str = "message"):
"""
Check if embeddings and required lexicons exist, and generate them if they don't.

@@ -90,15 +91,19 @@ def read_in_lexicons(directory, lexicons_dict):
continue
lines = []
for lexicon in lexicons:
-# get rid of parentheses
lexicon = lexicon.strip()
-lexicon = lexicon.replace('(', '')
-lexicon = lexicon.replace(')', '')
+# get rid of parentheses; comment out to keep the emojis like :)
Collaborator: Check the commented out parentheses

+# TODO: compare the difference if we keep ()
+# lexicon = lexicon.replace('(', '')
+# lexicon = lexicon.replace(')', '')
if '*' not in lexicon:
lines.append(r"\b" + lexicon.replace("\n", "") + r"\b")
else:
# get rid of any cases of multiple repeat -- e.g., '**'
-lexicon = lexicon.replace('\**', '\*')
+# lexicon = lexicon.replace('\**', '\*'); this will throw Invalid syntax error
+pattern = re.compile(r'\*+')
+lexicon = pattern.sub('*', lexicon)
-lexicon = r"\b" + lexicon.replace("\n", "").replace("*", "") + r"\S*\b"

+# build the final lexicon
+lines.append(r"\b" + lexicon.replace("\n", "").replace("*", "") + r"\S*\b")
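# Editor's illustration (not part of the diff): an entry like 'happ**' is collapsed
# to 'happ*' by the substitution above, then built into the regex r"\bhapp\S*\b",
# which matches 'happy', 'happier', and any other 'happ'-prefixed token.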
@@ -134,6 +139,95 @@ def generate_lexicon_pkl():
except:
print("WARNING: Lexicons not found. Skipping pickle generation...")

def fix_abbreviations(dicTerm: str) -> str:
"""
Helper function to fix abbreviations containing punctuation.
src: https://github.com/ryanboyd/ContentCoder-Py/blob/main/ContentCodingDictionary.py#L714

This function goes over a list of hardcoded exceptions for the tokenizer / sentence parser
built into LIWC so that it doesn't convert them into separate strings
(e.g., we want "i.e." to not be seen as two words and two sentences [i, e]).

:param dicTerm: The lexicon term
:type dicTerm: str

:return: dicTerm
:rtype: str
"""

AbbreviationList = ['ie.', 'i.e.', 'eg.', 'e.g.', 'vs.', 'ph.d.', 'phd.', 'm.d.', 'd.d.s.', 'b.a.',
'b.s.', 'm.s.', 'u.s.a.', 'u.s.', 'u.t.', 'attn.', 'prof.', 'mr.', 'dr.', 'mrs.',
'ms.', 'a.i.', 'a.g.i.', 'tl;dr', 't.t', 't_t']
AbbreviationDict = {}
for item in AbbreviationList:
itemClean = item.replace('.', '-').replace(';', '-').replace('_', '-')

if len(itemClean) > 2 and itemClean.endswith('-'):
numTrailers = len(itemClean)
itemClean = itemClean.strip('-')
numTrailers = numTrailers - len(itemClean)
itemClean = itemClean[:-1] + ''.join(['-'] * numTrailers) + itemClean[-1:]

AbbreviationDict[item] = itemClean
AbbreviationDict[item + ','] = itemClean

if dicTerm in AbbreviationDict.keys():
return AbbreviationDict[dicTerm]
else:
return dicTerm
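
# Editor's trace of the mapping above (illustrative, not part of the diff):
#   fix_abbreviations('tl;dr') -> 'tl-dr'
#   fix_abbreviations('i.e.')  -> 'i--e'   (punctuation becomes dashes; token length is preserved)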

def load_liwc_dict(dicText: str) -> dict:
"""
Loads up a dictionary that is in the LIWC 2007/2015 format.
src: https://github.com/ryanboyd/ContentCoder-Py/blob/main/ContentCodingDictionary.py#L81

This function reads the content of a LIWC dictionary file in the official format
and converts it to a dictionary mapping each category to a regular expression.
We assume dicText has two parts separated by '%' signs: the header, which maps numbers to category names,
and the body, which maps each word in the lexicon to one or more category numbers.

:param dicText: The content of a .dic file
:type dicText: str

:return: dicCategories
:rtype: dict
"""
dicSplit = dicText.split('%', 2)
dicHeader, dicBody = dicSplit[1], dicSplit[2]
# read headers
catNameNumberMap = {}
for line in dicHeader.splitlines():
if line.strip() == '':
continue
lineSplit = line.strip().split('\t')
catNameNumberMap[lineSplit[0]] = lineSplit[1]
# read body
dicCategories = {}
for line in dicBody.splitlines():
lineSplit = line.strip().split('\t')
dicTerm, catNums = lineSplit[0], lineSplit[1:]
dicTerm = fix_abbreviations(dicTerm=' '.join(lineSplit[0].lower().strip().split()))
dicTerm = dicTerm.strip()
if dicTerm == '':
continue

if '*' in dicTerm:
# Replace consecutive asterisks with a single asterisk -- e.g., '**'->'*'
pattern = re.compile(r'\*+')
dicTerm = pattern.sub('*', dicTerm)
dicTerm = r"\b" + dicTerm.replace("\n", "").replace("*", "") + r"\S*\b"
else:
dicTerm = r"\b" + dicTerm.replace("\n", "").replace('(', r'\(').replace(')', r'\)') + r"\b"

for catNum in catNums:
cat = catNameNumberMap[catNum]
if cat not in dicCategories:
dicCategories[cat] = dicTerm
else:
cur_dicTerm = dicCategories[cat]
dicCategories[cat] = cur_dicTerm + "|" + dicTerm
return dicCategories
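
# Editor's sketch of the expected input/output (illustrative, not part of the diff):
#   dicText = "%\n1\tposemo\n2\tnegemo\n%\nhappy*\t1\nsad\t2"
#   load_liwc_dict(dicText) -> {'posemo': r"\bhappy\S*\b", 'negemo': r"\bsad\b"}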

def generate_certainty_pkl():
"""
Helper function for generating the pickle file containing the certainty lexicon.