Skip to content

Commit

Permalink
updates to package aggregation
Browse files Browse the repository at this point in the history
  • Loading branch information
amytangzheng committed Oct 16, 2024
1 parent d35aeb1 commit 576a376
Show file tree
Hide file tree
Showing 6 changed files with 327 additions and 184 deletions.
46 changes: 32 additions & 14 deletions examples/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8')
csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8')
csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8')
test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8')

# C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv

"""
TINY / TEST DATASETS -------------------------------
Expand Down Expand Up @@ -68,7 +71,7 @@
# )
# tiny_juries_feature_builder_custom.featurize(col="message")

# # Tiny Juries with NO aggregations
# Tiny Juries with NO aggregations
# print("Tiny Juries with No Aggregation...")
# tiny_juries_feature_builder_no_aggregation = FeatureBuilder(
# input_df = tiny_juries_df,
Expand All @@ -84,23 +87,38 @@
# tiny_juries_feature_builder_no_aggregation.featurize(col="message")

# Tiny Juries with custom Aggregations
print("Tiny Juries with Custom Aggregation...")
tiny_juries_feature_builder_custom_aggregation = FeatureBuilder(
input_df = tiny_juries_df,
# print("Tiny Juries with Custom Aggregation...")
# tiny_juries_feature_builder_custom_aggregation = FeatureBuilder(
# input_df = tiny_juries_df,
# grouping_keys = ["batch_num", "round_num"],
# vector_directory = "./vector_data/",
# output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv",
# output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv",
# output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
# convo_methods = ['max', 'median'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max.
# convo_columns = ['positive_bert'],
# user_methods = ['max', 'mean', 'min', 'median'],
# user_columns = ['positive_bert', 'negative_bert'],
# turns = False,
# )
# tiny_juries_feature_builder_custom_aggregation.featurize(col="message")

# Testing package aggregation
print("Testing package aggregation...")
testing_feature_builder_custom_aggregation = FeatureBuilder(
input_df = test_df,
grouping_keys = ["batch_num", "round_num"],
vector_directory = "./vector_data/",
output_file_path_chat_level = "./jury_TINY_output_chat_level_custom_agg.csv",
output_file_path_user_level = "./jury_TINY_output_user_level_custom_agg.csv",
output_file_path_conv_level = "./jury_TINY_output_conversation_level_custom_agg.csv",
convo_methods = ['mean'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max.
convo_columns = ['positive_bert'],
user_methods = ['mean', 'max'],
user_columns = ['positive_bert', 'negative_bert'],
# user_methods = ['max'],
# user_columns = ['negative_bert'],
output_file_path_chat_level = "./test_package_TINY_chat_level_custom_agg.csv",
output_file_path_user_level = "./test_package_TINY_user_level_custom_agg.csv",
output_file_path_conv_level = "./test_package_TINY_conversation_level_custom_agg.csv",
convo_methods = ['max'], # This will aggregate ONLY the "positive_bert" at the conversation level, using mean; it will aggregate ONLY "negative_bert" at the speaker/user level, using max.
convo_columns = ['positive_bert', 'negativity_bert'],
user_methods = ['MAX'],
user_columns = ['negative_bert'],
turns = False,
)
tiny_juries_feature_builder_custom_aggregation.featurize(col="message")
testing_feature_builder_custom_aggregation.featurize(col="message")


# # Tiny multi-task
Expand Down
8 changes: 4 additions & 4 deletions src/team_comm_tools/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class FeatureBuilder:
:param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True.
:type convo_aggregation: bool, optional
:param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the conversational level. Defaults to ['mean', 'max', 'min', 'std'].
:param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev'].
:type convo_methods: list, optional
:param convo_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the conversational level. Defaults to all numeric columns.
Expand All @@ -101,7 +101,7 @@ class FeatureBuilder:
:param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True.
:type user_aggregation: bool, optional
:param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, std...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'std'].
:param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev'].
:type user_methods: list, optional
:param user_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the speaker/user level. Defaults to all numeric columns.
Expand Down Expand Up @@ -133,10 +133,10 @@ def __init__(
regenerate_vectors: bool = False,
custom_vect_path: str = None,
convo_aggregation = True,
convo_methods: list = ['mean', 'max', 'min', 'std'],
convo_methods: list = ['mean', 'max', 'min', 'stdev'],
convo_columns: list = None,
user_aggregation = True,
user_methods: list = ['mean', 'max', 'min', 'std'],
user_methods: list = ['mean', 'max', 'min', 'stdev'],
user_columns: list = None
) -> None:

Expand Down
186 changes: 90 additions & 96 deletions src/team_comm_tools/utils/calculate_conversation_level_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from team_comm_tools.utils.summarize_features import *
from team_comm_tools.utils.gini_coefficient import *
from team_comm_tools.utils.preprocess import *
from fuzzywuzzy import process

class ConversationLevelFeaturesCalculator:
"""
Expand All @@ -29,13 +30,13 @@ class ConversationLevelFeaturesCalculator:
:type input_columns: list
:param convo_aggregation: If true, will aggregate features at the conversational level
:type convo_aggregation: bool
:param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, std...)
:param convo_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...)
:type convo_methods: list
:param convo_columns: Specifies which columns (at the chat level) users want aggregated
:type convo_columns: list
:param user_aggregation: If true, will aggregate features at the user level
:type user_aggregation: bool
:param user_methods: Specifies which functions users want to aggregate with (e.g., mean, std...) at the user level
:param user_methods: Specifies which functions users want to aggregate with (e.g., mean, stdev...) at the user level
:type user_methods: list
:param user_columns: Specifies which columns (at the chat level) users want aggregated for the user level
:type user_columns: list
Expand Down Expand Up @@ -78,27 +79,89 @@ def __init__(self, chat_data: pd.DataFrame,
if 'conversation_num' not in self.input_columns:
self.input_columns.append('conversation_num')

# check if user inputted convo_columns is None
if convo_columns is None:
self.columns_to_summarize = [column for column in self.chat_data.columns \
if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
else:
if convo_aggregation == True and len(convo_columns) == 0:
warnings.warn(
print(
"Warning: convo_aggregation is True but no convo_columns specified. Defaulting convo_aggregation to False."
)
self.convo_aggregation = False
else:
convo_columns_in_data = list(set(convo_columns).intersection(set(self.chat_data.columns)))

if(len(convo_columns_in_data) != len(convo_columns)):
warnings.warn(
print(
"Warning: One or more requested user columns are not present in the data. Ignoring them."
)

for i in convo_columns:
matches = process.extract(i, self.chat_data.columns, limit=3)
best_match, similarity = matches[0]

if similarity == 100:
continue
elif similarity >= 80:
print("Did you mean", best_match, "instead of", i, "?")
else:
print(i, "not found in data and no close match.")


self.columns_to_summarize = convo_columns_in_data

# check if user inputted user_columns is None
if user_columns is None:
self.user_columns = [column for column in self.chat_data.columns \
if (column not in self.input_columns) and pd.api.types.is_numeric_dtype(self.chat_data[column])]
else:
if user_aggregation == True and len(user_columns) == 0:
print("Warning: user_aggregation is True but no user_columns specified. Defaulting user_aggregation to False.")
self.user_aggregation = False
else:
user_columns_in_data = list(set(user_columns).intersection(set(self.chat_data.columns)))
if(len(user_columns_in_data) != len(user_columns)):
print(
"Warning: One or more requested user columns are not present in the data. Ignoring them."
)

print(user_columns_in_data, user_columns)

for i in user_columns:
matches = process.extract(i, self.chat_data.columns, limit=3)
best_match, similarity = matches[0]

if similarity == 100:
continue
elif similarity >= 80:
print("Did you mean", best_match, "instead of", i, "?")
else:
print(i, "not found in data and no close match.")

self.user_columns = user_columns_in_data

self.summable_columns = ["num_words", "num_chars", "num_messages"]

# ensure all lowercase
self.convo_methods = [col.lower() for col in self.convo_methods]
self.user_methods = [col.lower() for col in self.user_methods]
self.columns_to_summarize = [col.lower() for col in self.columns_to_summarize]
self.user_columns = [col.lower() for col in self.user_columns]

# replace interchangable words in columns_to_summarize
for i in range(len(self.convo_methods)):
if self.convo_methods[i] == "average":
self.convo_methods[i] = "mean"
elif self.convo_methods[i] == "maximum":
self.convo_methods[i] = "max"
elif self.convo_methods[i] == "minimum":
self.convo_methods[i] = "min"
elif self.convo_methods[i] == "standard deviation":
self.convo_methods[i] = "stdev"
elif self.convo_methods[i] == "sd":
self.convo_methods = "stdev"

def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame:
"""
Main driver function for creating conversation-level features.
Expand Down Expand Up @@ -185,7 +248,7 @@ def get_conversation_level_aggregates(self) -> None:
)

# Standard Deviation of feature across the Conversation
if 'std' in self.convo_methods:
if 'stdev' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col),
Expand All @@ -210,6 +273,15 @@ def get_conversation_level_aggregates(self) -> None:
on=[self.conversation_id_col],
how="inner"
)

# Median for the feature across the Conversation
if 'median' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

# Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini.
for column in self.summable_columns:
Expand Down Expand Up @@ -242,7 +314,7 @@ def get_user_level_aggregates(self) -> None:

if self.convo_aggregation == True and self.user_aggregation == True:

# this may be right??
# aggregates from the user level based on conversation methods
if 'mean' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
Expand All @@ -254,7 +326,7 @@ def get_user_level_aggregates(self) -> None:
how="inner"
)

if 'std' in self.convo_methods:
if 'stdev' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Standard Deviation of User-Level Feature
Expand Down Expand Up @@ -286,95 +358,17 @@ def get_user_level_aggregates(self) -> None:
on=[self.conversation_id_col],
how="inner"
)


# Sum Columns were created using self.get_user_level_summed_features()
# for column in self.columns_to_summarize:
# # change to self.user_columns
# # should be summable_columns

# # for method in self.user_methods:
# # self.conv_data = pd.merge(
# # left=self.conv_data,
# # right=get_average(self.user_data.copy(), method+"_"+column, 'average_user_' + method + "_" +column, self.conversation_id_col),
# # on=[self.conversation_id_col],
# # how="inner"
# # )

# if 'mean' in self.convo_methods:
# # Average/Mean of User-Level Feature
# self.conv_data = pd.merge(
# left=self.conv_data,
# right=get_average(self.user_data.copy(), "sum_"+column, 'average_user_sum_'+column, self.conversation_id_col),
# on=[self.conversation_id_col],
# how="inner"
# )

# if 'std' in self.convo_methods:
# # Standard Deviation of User-Level Feature
# self.conv_data = pd.merge(
# left=self.conv_data,
# right=get_stdev(self.user_data.copy(), "sum_"+column, 'stdev_user_sum_'+column, self.conversation_id_col),
# on=[self.conversation_id_col],
# how="inner"
# )

# if 'min' in self.convo_methods:
# # Minima of User-Level Feature
# self.conv_data = pd.merge(
# left=self.conv_data,
# right=get_min(self.user_data.copy(), "sum_"+column, 'min_user_sum_'+column, self.conversation_id_col),
# on=[self.conversation_id_col],
# how="inner"
# )

# if 'max' in self.convo_methods:
# # Maxima of User-Level Feature
# self.conv_data = pd.merge(
# left=self.conv_data,
# right=get_max(self.user_data.copy(), "sum_"+column, 'max_user_sum_'+column, self.conversation_id_col),
# on=[self.conversation_id_col],
# how="inner"
# )

# Average Columns were created using self.get_user_level_mean_features()
for column in self.columns_to_summarize:

if 'mean' in self.convo_methods:
# Average/Mean of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_mean(self.user_data.copy(), "mean_"+column, 'mean_user_avg_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'std' in self.convo_methods:
# Standard Deviation of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_stdev(self.user_data.copy(), "mean_"+column, 'stdev_user_avg_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'min' in self.convo_methods:
# Minima of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_min(self.user_data.copy(), "mean_"+column, 'min_user_avg_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'max' in self.convo_methods:
# Maxima of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_max(self.user_data.copy(), "mean_"+column, 'max_user_avg_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'median' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Median of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)


def get_discursive_diversity_features(self) -> None:
Expand Down
Loading

0 comments on commit 576a376

Please sign in to comment.