Bring your own LIWC & matplotlib dependency fix #322

Merged: 11 commits, Dec 3, 2024
1 change: 1 addition & 0 deletions pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
"emoji==1.7.0",
"flask==3.0.3",
"gensim>=4.3.3",
"matplotlib>=3.0.0",
"nltk==3.9.1",
"numpy<2.0.0",
"pandas==2.2.2",
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ convokit==3.0.0
emoji==1.7.0
flask==3.0.3
gensim>=4.3.3
matplotlib>=3.0.0
nltk==3.9.1
numpy<2.0.0
pandas==2.2.2
26 changes: 24 additions & 2 deletions src/team_comm_tools/feature_builder.py
@@ -92,6 +92,9 @@ class FeatureBuilder:
:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False.
:type compute_vectors_from_preprocessed: bool, optional

:param custom_liwc_dictionary_path: This is the path of the user's own LIWC dictionary file (.dic). Defaults to an empty string.
:type custom_liwc_dictionary_path: str, optional

:return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated.
:rtype: None

@@ -117,7 +120,8 @@ def __init__(
ner_training_df: pd.DataFrame = None,
ner_cutoff: int = 0.9,
regenerate_vectors: bool = False,
-compute_vectors_from_preprocessed: bool = False
+compute_vectors_from_preprocessed: bool = False,
+custom_liwc_dictionary_path: str = ''
) -> None:

# Defining input and output paths.
@@ -128,6 +132,23 @@

print("Initializing Featurization...")

if not custom_liwc_dictionary_path:
self.custom_liwc_dictionary = {}
else:
# Read .dic file if the path is provided
custom_liwc_dictionary_path = Path(custom_liwc_dictionary_path)
if not custom_liwc_dictionary_path.suffix == '.dic':
print(f"WARNING: The custom LIWC dictionary file is not a .dic file: {custom_liwc_dictionary_path}")
self.custom_liwc_dictionary = {}
else:
with open(custom_liwc_dictionary_path, 'r', encoding='utf-8-sig') as file:
dicText = file.read()
try:
self.custom_liwc_dictionary = load_liwc_dict(dicText)
except Exception as e:
print(f"WARNING: Failed loading custom liwc dictionary: {e}")
self.custom_liwc_dictionary = {}

# Set features to generate
# TODO --- think through more carefully which ones we want to exclude and why
self.feature_dict = feature_dict
@@ -564,7 +585,8 @@ def chat_level_features(self) -> None:
ner_cutoff = self.ner_cutoff,
conversation_id_col = self.conversation_id_col,
message_col = self.message_col,
-timestamp_col = self.timestamp_col
+timestamp_col = self.timestamp_col,
+custom_liwc_dictionary = self.custom_liwc_dictionary
)
# Calling the driver inside this class to create the features.
self.chat_data = chat_feature_builder.calculate_chat_level_features(self.feature_methods_chat)
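For reference, a minimal sketch of how the new option would be used once this PR is merged. The import path, the input argument, and the featurize() call are assumptions based on typical FeatureBuilder usage and are not part of this diff; only custom_liwc_dictionary_path is introduced here.

import pandas as pd
from team_comm_tools import FeatureBuilder  # assumed import path

chat_df = pd.read_csv("./my_chats.csv")  # hypothetical input with a "message" column

fb = FeatureBuilder(
    input_df=chat_df,  # assumed argument name
    custom_liwc_dictionary_path="./my_custom_liwc.dic"  # new in this PR; must point to a .dic file
)
fb.featurize()  # assumed driver method; per the docstring, prints "All Done!" when finished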
22 changes: 15 additions & 7 deletions src/team_comm_tools/features/lexical_features_v2.py
@@ -26,14 +26,15 @@ def get_liwc_count(regex, chat):
else:
return 0

-def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
+def liwc_features(chat_df: pd.DataFrame, message_col: str, custom_liwc_dictionary: dict={}) -> pd.DataFrame:
"""
This function takes in the chat level input dataframe and computes lexical features
(the number of words from a given lexicon, such as LIWC).

Args:
chat_df (pd.DataFrame): This is a pandas dataframe of the chat level features. Should contain 'message' column.
message_col (str): This is a string with the name of the column containing the message / text.
custom_liwc_dictionary (dict): This is a dictionary parsed from the user's custom LIWC .dic file.

Returns:
pd.DataFrame: Dataframe of the lexical features stacked as columns.
@@ -47,12 +47,19 @@ def liwc_features(chat_df: pd.DataFrame, message_col) -> pd.DataFrame:
lexicons_dict = pickle.load(lexicons_pickle_file)

# Return the lexical features stacked as columns
-return pd.concat(
+# return pd.concat(
Collaborator: Leaving a note to double check the commented-out code here

Collaborator: Ah, this is fine; it just comments out the previous code, which directly concats the dataframe, and replaces it with the new code, which calls it a second time if the custom dictionary is present.

# Finding the # of occurrences of lexicons of each type for all the messages.
-[pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
+df_lst = [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
.rename({message_col + "_original": lexicon_type + "_lexical_wordcount"}, axis=1)\
-for lexicon_type, regex in lexicons_dict.items()],
-axis=1
-)
-except:
+for lexicon_type, regex in lexicons_dict.items()]
+# , axis=1
+# )
+if custom_liwc_dictionary:
+df_lst += [pd.DataFrame(chat_df[message_col + "_original"].apply(lambda chat: get_liwc_count(regex, chat)))\
+.rename({message_col + "_original": lexicon_type + "_lexical_wordcount_custom"}, axis=1)\
+for lexicon_type, regex in custom_liwc_dictionary.items()]
+return pd.concat(df_lst, axis=1)
+except FileNotFoundError:
print("WARNING: Lexicons not found. Skipping feature...")
+except Exception as e:
+print(f'WARNING: Failed to generate lexicons due to unexpected error: {e}')
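To make the new column naming concrete, here is a small self-contained sketch. The body of get_liwc_count is an assumption based on its truncated definition above (it appears to count regex matches and return 0 otherwise); the dictionary and dataframe are illustrative.

import re
import pandas as pd

def get_liwc_count(regex, chat):
    # assumed behavior: number of regex matches in the message text
    if isinstance(chat, str):
        return len(re.findall(regex, chat))
    return 0

custom_liwc_dictionary = {"posemo": r"\bhappy\S*\b"}  # as produced by load_liwc_dict
chat_df = pd.DataFrame({"message_original": ["so happy and happier", "meh"]})

df_lst = [pd.DataFrame(chat_df["message_original"].apply(lambda chat: get_liwc_count(regex, chat)))
              .rename({"message_original": lexicon_type + "_lexical_wordcount_custom"}, axis=1)
          for lexicon_type, regex in custom_liwc_dictionary.items()]
print(pd.concat(df_lst, axis=1))  # a single "posemo_lexical_wordcount_custom" column: 2, 0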
41 changes: 28 additions & 13 deletions src/team_comm_tools/utils/calculate_chat_level_features.py
@@ -32,14 +32,30 @@ class ChatLevelFeaturesCalculator:

:param chat_data: Pandas dataframe of chat-level features read from the input dataset
:type chat_data: pd.DataFrame

:param vect_data: Pandas dataframe containing vector data
:type vect_data: pd.DataFrame

:param bert_sentiment_data: Pandas dataframe containing BERT sentiment data
:type bert_sentiment_data: pd.DataFrame
-:param ner_training_df: This is a pandas dataframe of training data for named entity recognition feature
-:type ner_training_df: pd.DataFrame
+:param ner_training: This is a pandas dataframe of training data for named entity recognition feature
+:type ner_training: pd.DataFrame

:param ner_cutoff: This is the cutoff value for the confidence of prediction for each named entity
:type ner_cutoff: int

:param conversation_id_col: A string representing the column name that should be selected as the conversation ID. Defaults to "conversation_num".
:type conversation_id_col: str

:param message_col: A string representing the column name that should be selected as the message. Defaults to "message".
:type message_col: str

:param timestamp_col: A string representing the column name that should be selected as the timestamp. Defaults to "timestamp".
:type timestamp_col: str

:param custom_liwc_dictionary: This is the user's own LIWC dictionary. Defaults to an empty dictionary.
:type custom_liwc_dictionary: dict
"""

def __init__(
@@ -51,8 +67,9 @@ def __init__(
ner_cutoff: int,
conversation_id_col: str,
message_col: str,
-timestamp_col: str | tuple[str, str]
-) -> None:
+timestamp_col: str | tuple[str, str],
+custom_liwc_dictionary: dict
+) -> None:

self.chat_data = chat_data
self.vect_data = vect_data
@@ -62,12 +79,11 @@ def __init__(
self.conversation_id_col = conversation_id_col
self.timestamp_col = timestamp_col
self.message_col = message_col
-# load easy Dale-Chall words exactly once.
-self.easy_dale_chall_words = get_dale_chall_easy_words()
-self.function_words = get_function_words() # load function words exactly once
-self.question_words = get_question_words() # load question words exactly once
-# load first person words exactly once
-self.first_person = get_first_person_words()
+self.custom_liwc_dictionary = custom_liwc_dictionary
+self.easy_dale_chall_words = get_dale_chall_easy_words() # load easy Dale-Chall words exactly once.
+self.function_words = get_function_words() # load function words exactly once
+self.question_words = get_question_words() # load question words exactly once
+self.first_person = get_first_person_words() # load first person words exactly once

def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
"""
@@ -182,9 +198,8 @@ def lexical_features(self) -> None:
:return: None
:rtype: None
"""
-self.chat_data = pd.concat(
-[self.chat_data, liwc_features(self.chat_data, self.message_col)], axis=1)
+self.chat_data = pd.concat([self.chat_data, liwc_features(self.chat_data, self.message_col, self.custom_liwc_dictionary)], axis = 1)

def calculate_hedge_features(self) -> None:
"""
Calculate features related to expressing hesitation (or 'hedge').
104 changes: 99 additions & 5 deletions src/team_comm_tools/utils/check_embeddings.py
@@ -24,7 +24,8 @@
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Check if embeddings exist
-def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentiment, regenerate_vectors, message_col = "message"):
+def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool,
+need_sentiment: bool, regenerate_vectors: bool, message_col: str = "message"):
"""
Check if embeddings and required lexicons exist, and generate them if they don't.

@@ -90,15 +91,19 @@ def read_in_lexicons(directory, lexicons_dict):
continue
lines = []
for lexicon in lexicons:
-# get rid of parentheses
lexicon = lexicon.strip()
-lexicon = lexicon.replace('(', '')
-lexicon = lexicon.replace(')', '')
+# get rid of parentheses; comment out to keep the emojis like :)
Collaborator: Check the commented out parentheses

+# TODO: compare the difference if we keep ()
+# lexicon = lexicon.replace('(', '')
+# lexicon = lexicon.replace(')', '')
if '*' not in lexicon:
lines.append(r"\b" + lexicon.replace("\n", "") + r"\b")
else:
# get rid of any cases of multiple repeat -- e.g., '**'
-lexicon = lexicon.replace('\**', '\*')
+# lexicon = lexicon.replace('\**', '\*'); this will throw Invalid syntax error
+pattern = re.compile(r'\*+')
+lexicon = pattern.sub('*', lexicon)
-lexicon = r"\b" + lexicon.replace("\n", "").replace("*", "") + r"\S*\b"

+# build the final lexicon
+lines.append(r"\b" + lexicon.replace("\n", "").replace("*", "") + r"\S*\b")
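# Editor's illustration (not part of the diff): an entry like 'happ**' is collapsed
# to 'happ*' by the substitution above, then built into the regex r"\bhapp\S*\b",
# which matches 'happy', 'happier', and any other 'happ'-prefixed token.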
@@ -134,6 +139,95 @@ def generate_lexicon_pkl():
except:
print("WARNING: Lexicons not found. Skipping pickle generation...")

def fix_abbreviations(dicTerm: str) -> str:
"""
Helper function to fix abbreviations containing punctuation.
src: https://github.com/ryanboyd/ContentCoder-Py/blob/main/ContentCodingDictionary.py#L714

This function goes over a list of hardcoded exceptions for the tokenizer / sentence parser
built into LIWC so that it doesn't convert them into separate strings
(e.g., we want "i.e." to not be seen as two words and two sentences [i, e]).

:param dicTerm: The lexicon term
:type dicTerm: str

:return: dicTerm
:rtype: str
"""

AbbreviationList = ['ie.', 'i.e.', 'eg.', 'e.g.', 'vs.', 'ph.d.', 'phd.', 'm.d.', 'd.d.s.', 'b.a.',
'b.s.', 'm.s.', 'u.s.a.', 'u.s.', 'u.t.', 'attn.', 'prof.', 'mr.', 'dr.', 'mrs.',
'ms.', 'a.i.', 'a.g.i.', 'tl;dr', 't.t', 't_t']
AbbreviationDict = {}
for item in AbbreviationList:
itemClean = item.replace('.', '-').replace(';', '-').replace('_', '-')

if len(itemClean) > 2 and itemClean.endswith('-'):
numTrailers = len(itemClean)
itemClean = itemClean.strip('-')
numTrailers = numTrailers - len(itemClean)
itemClean = itemClean[:-1] + ''.join(['-'] * numTrailers) + itemClean[-1:]

AbbreviationDict[item] = itemClean
AbbreviationDict[item + ','] = itemClean

if dicTerm in AbbreviationDict.keys():
return AbbreviationDict[dicTerm]
else:
return dicTerm
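
# Editor's trace of the mapping above (illustrative, not part of the diff):
#   fix_abbreviations('tl;dr') -> 'tl-dr'
#   fix_abbreviations('i.e.')  -> 'i--e'   (punctuation becomes dashes; token length is preserved)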

def load_liwc_dict(dicText: str) -> dict:
"""
Loads up a dictionary that is in the LIWC 2007/2015 format.
src: https://github.com/ryanboyd/ContentCoder-Py/blob/main/ContentCodingDictionary.py#L81

This function reads the content of a LIWC dictionary file in the official format
and converts it to a dictionary mapping each category to a regular expression.
We assume dicText has two parts separated by '%' signs: the header, which maps numbers to category names,
and the body, which maps each word in the lexicon to one or more category numbers.

:param dicText: The content of a .dic file
:type dicText: str

:return: dicCategories
:rtype: dict
"""
dicSplit = dicText.split('%', 2)
dicHeader, dicBody = dicSplit[1], dicSplit[2]
# read headers
catNameNumberMap = {}
for line in dicHeader.splitlines():
if line.strip() == '':
continue
lineSplit = line.strip().split('\t')
catNameNumberMap[lineSplit[0]] = lineSplit[1]
# read body
dicCategories = {}
for line in dicBody.splitlines():
lineSplit = line.strip().split('\t')
dicTerm, catNums = lineSplit[0], lineSplit[1:]
dicTerm = fix_abbreviations(dicTerm=' '.join(lineSplit[0].lower().strip().split()))
dicTerm = dicTerm.strip()
if dicTerm == '':
continue

if '*' in dicTerm:
# Replace consecutive asterisks with a single asterisk -- e.g., '**'->'*'
pattern = re.compile(r'\*+')
dicTerm = pattern.sub('*', dicTerm)
dicTerm = r"\b" + dicTerm.replace("\n", "").replace("*", "") + r"\S*\b"
else:
dicTerm = r"\b" + dicTerm.replace("\n", "").replace('(', r'\(').replace(')', r'\)') + r"\b"

for catNum in catNums:
cat = catNameNumberMap[catNum]
if cat not in dicCategories:
dicCategories[cat] = dicTerm
else:
cur_dicTerm = dicCategories[cat]
dicCategories[cat] = cur_dicTerm + "|" + dicTerm
return dicCategories
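
# Editor's sketch of the expected input/output (illustrative, not part of the diff):
#   dicText = "%\n1\tposemo\n2\tnegemo\n%\nhappy*\t1\nsad\t2"
#   load_liwc_dict(dicText) -> {'posemo': r"\bhappy\S*\b", 'negemo': r"\bsad\b"}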

def generate_certainty_pkl():
"""
Helper function for generating the pickle file containing the certainty lexicon.