diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py
index d56c94c9..b15b309d 100644
--- a/src/team_comm_tools/feature_builder.py
+++ b/src/team_comm_tools/feature_builder.py
@@ -465,7 +465,7 @@ def __init__(
             need_sentiment = True
 
         # check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname)
-        check_embeddings(self.chat_data, self.vect_path, self.bert_path, self.original_vect_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname)
+        check_embeddings(self.chat_data, self.vect_path, self.bert_path, self.original_vect_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname, custom_vect = custom_vect_path is not None)
 
         if(need_sentence):
             self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman')
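To make the new flag concrete, here is a minimal sketch of a direct call using the post-patch signature from the hunk below. Only the signature is taken from this patch; the DataFrame contents and all file paths are hypothetical.

    import pandas as pd
    from team_comm_tools.utils.check_embeddings import check_embeddings

    # Hypothetical inputs; only the signature is taken from this patch.
    chat_df = pd.DataFrame({"message": ["hello team", "sounds good"]})

    check_embeddings(
        chat_df,
        vect_path="./vector_data/my_custom_vectors.csv",         # user-supplied vectors (hypothetical path)
        bert_path="./vector_data/sentiment/bert.csv",            # hypothetical path
        original_vect_path="./vector_data/default_vectors.csv",  # fallback used when regenerating
        need_sentence=True,
        need_sentiment=False,
        regenerate_vectors=False,
        message_col="message",
        custom_vect=True,  # triggers the validation checks added below
    )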
Regenerating...") + # reset vector path to default/original generate_vect(chat_data, original_vect_path, message_col) - - if "message_embedding" in vector_df.columns: - # check that message_embedding is numeric list - if not vector_df["message_embedding"].apply(is_numeric_list).all(): - print("message_embedding is not a numeric list. Regenerating ...") + else: + # check that message in vector data matches chat data + preprocessed_chat = chat_data[message_col].astype(str).apply(preprocess_text).fillna("") + + # preprocess vector data, remove _original if message_col contains to preprocess the text + while '_original' in message_col: + message_col = message_col.replace('_original', '') + + # print(message_col) + preprocessed_vector = vector_df[message_col].astype(str).apply(preprocess_text).fillna("") + + mismatches = chat_data[preprocessed_chat != preprocessed_vector] + if len(mismatches) != 0: + print("Messages in the vector data do not match the chat data. Regenerating...") generate_vect(chat_data, original_vect_path, message_col) - else: - # check if length of all vectors is the same - vect_lengths = vector_df["message_embedding"].apply(lambda x: ast.literal_eval(x)).apply(lambda x : len(x)) - - if (vect_lengths == 0).any(): - print("One or more value in message_embedding are null. Regenerating ...") - generate_vect(chat_data, original_vect_path, message_col) - - if len(vect_lengths.unique()) > 1: - print("Not all vectors have the same length. Regenerating ...") - generate_vect(chat_data, original_vect_path, message_col) - # check if vectors have a 1-1 mapping with the text - embedding_message_map = {} - for _, row in vector_df.iterrows(): - embedding = row['message_embedding'] - message = row['message'] - - if embedding in embedding_message_map: - if message != embedding_message_map[embedding]: - print("Same embedding maps to multiple unique messages. Regenerating ...") + if "message_embedding" in vector_df.columns: + # check that message_embedding is numeric list + if not vector_df["message_embedding"].apply(is_numeric_list).all(): + print("message_embedding is not a numeric list. Regenerating ...") + generate_vect(chat_data, original_vect_path, message_col) + else: + # check if length of all vectors is the same + vect_lengths = vector_df["message_embedding"].apply(lambda x: ast.literal_eval(x)).apply(lambda x : len(x)) + + if (vect_lengths == 0).any(): + print("One or more value in message_embedding are null. Regenerating ...") + generate_vect(chat_data, original_vect_path, message_col) + + if len(vect_lengths.unique()) > 1: + print("Not all vectors have the same length. Regenerating ...") generate_vect(chat_data, original_vect_path, message_col) - break - else: - embedding_message_map[embedding] = message - else: - print("no message_embedding column. Regenerating ...") - generate_vect(chat_data, original_vect_path, message_col) + # check if vectors have a 1-1 mapping with the text + embedding_message_map = {} + for _, row in vector_df.iterrows(): + embedding = row['message_embedding'] + message = row['message'] + + if embedding in embedding_message_map: + if message != embedding_message_map[embedding]: + print("Same embedding maps to multiple unique messages. Regenerating ...") + generate_vect(chat_data, original_vect_path, message_col) + break + else: + embedding_message_map[embedding] = message + + else: + print("no message_embedding column. 
Regenerating ...") + generate_vect(chat_data, original_vect_path, message_col) except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary if need_sentence: diff --git a/tests/run_package_grouping_tests.py b/tests/run_package_grouping_tests.py index 35d766fc..49f0172b 100644 --- a/tests/run_package_grouping_tests.py +++ b/tests/run_package_grouping_tests.py @@ -15,131 +15,131 @@ tiny_multi_task_renamed_df = pd.read_csv("data/cleaned_data/multi_task_TINY_cols_renamed.csv", encoding='utf-8') package_agg_df = pd.read_csv("data/cleaned_data/test_package_aggregation.csv", encoding='utf-8') - """ - Testing Package Task 1 - --- - In this test, we simply test the functionaality of everything after we rename everything ("Case 1"). - Here, we use a test dataset that has a different conversation ID, speaker ID, message column, and timestamp - column compared to the defaults, and ensure that nothing breaks. - """ - print("TESTING CASE 1 + FILE PATH ROBUSTNESS ......") - testing_package_task_1 = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - conversation_id_col = "roundId", - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - vector_directory = "./vector_data/", - output_file_path_chat_level = "./tiny_multi_task_PT1_level_chat", - output_file_path_user_level = "./tiny_multi_task_PT1_level_user", - output_file_path_conv_level = "./tiny_multi_task_PT1_level_conv", - turns = False, - ) - testing_package_task_1.featurize() - - """ - Testing Package Task 1 Advanced Features - --- - In this test, we test the functionality of the advanced grouping features. + # """ + # Testing Package Task 1 + # --- + # In this test, we simply test the functionaality of everything after we rename everything ("Case 1"). + # Here, we use a test dataset that has a different conversation ID, speaker ID, message column, and timestamp + # column compared to the defaults, and ensure that nothing breaks. + # """ + # print("TESTING CASE 1 + FILE PATH ROBUSTNESS ......") + # testing_package_task_1 = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # conversation_id_col = "roundId", + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./tiny_multi_task_PT1_level_chat", + # output_file_path_user_level = "./tiny_multi_task_PT1_level_user", + # output_file_path_conv_level = "./tiny_multi_task_PT1_level_conv", + # turns = False, + # ) + # testing_package_task_1.featurize() + + # """ + # Testing Package Task 1 Advanced Features + # --- + # In this test, we test the functionality of the advanced grouping features. 
- "Case 2": .ngroup() feature - - Group by ["gameId", "roundId", "stageId"] and assert that the number of groupings matches - the stageId (which will confirm that it worked) - - "Case 3": Complex hieararchical grouping - - (3a) ID: stageID; cumulative: True, within_task: False - - (3b) ID: stageID; cumulative: True; within_task: True - - (3c) ID: roundID; cumulative: True, within_task: True - - Improper examples: - - grouping keys: ["roundID", "stageID"], ID: "gameID" - """ - print("TESTING CASE 2 ....") - testing_case_2 = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - grouping_keys = ["roundId", "stageId"], - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - vector_directory = "./vector_data/", - output_file_path_chat_level = "./output/chat/tiny_multi_task_case2_level_chat.csv", - output_file_path_user_level = "./output/user/tiny_multi_task_case2_level_user.csv", - output_file_path_conv_level = "./output/conv/tiny_multi_task_case2_level_conv.csv", - turns = False - ) - testing_case_2.featurize() - - print("TESTING CASE 3A .....") - testing_case_3_a = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - conversation_id_col = "stageId", - grouping_keys = ["gameId", "roundId", "stageId"], - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - cumulative_grouping = True, - within_task = False, - vector_directory = "./vector_data/", - output_file_path_chat_level = "./output/chat/tiny_multi_task_case3a_level_chat.csv", - output_file_path_user_level = "./output/user/tiny_multi_task_case3a_level_user.csv", - output_file_path_conv_level = "./output/conv/tiny_multi_task_case3a_level_conv.csv", - turns = False - ) - testing_case_3_a.featurize() - - print("TESTING CASE 3B .....") - testing_case_3_b = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - conversation_id_col = "stageId", - grouping_keys = ["gameId", "roundId", "stageId"], - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - cumulative_grouping = True, - within_task = True, - vector_directory = "./vector_data/", - output_file_path_chat_level = "./output/chat/tiny_multi_task_case3b_level_chat.csv", - output_file_path_user_level = "./output/user/tiny_multi_task_case3b_level_user.csv", - output_file_path_conv_level = "./output/conv/tiny_multi_task_case3b_level_conv.csv", - turns = False - ) - testing_case_3_b.featurize() - - print("TESTING CASE 3C .....") - testing_case_3_c = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - conversation_id_col = "roundId", - grouping_keys = ["gameId", "roundId", "stageId"], - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - cumulative_grouping = True, - within_task = True, - vector_directory = "./vector_data/", - output_file_path_chat_level = "./output/chat/tiny_multi_task_case3c_level_chat.csv", - output_file_path_user_level = "./output/user/tiny_multi_task_case3c_level_user.csv", - output_file_path_conv_level = "./output/conv/tiny_multi_task_case3c_level_conv.csv", - turns = False - ) - testing_case_3_c.featurize() - - print("TESTING IMPROPER CASE .....") - testing_case_improper = FeatureBuilder( - input_df = tiny_multi_task_renamed_df, - conversation_id_col = "gameId", - grouping_keys = ["roundId", "stageId"], - speaker_id_col = "speakerId", - message_col = "text", - timestamp_col = "time", - cumulative_grouping = False, - within_task = True, - vector_directory = "./vector_data/", - output_file_path_chat_level = 
"./output/chat/tiny_multi_task_improper_level_chat.csv", - output_file_path_user_level = "./output/user/tiny_multi_task_improper_level_user.csv", - output_file_path_conv_level = "./output/conv/tiny_multi_task_improper_level_conv.csv", - turns = False - ) - testing_case_improper.featurize() + # "Case 2": .ngroup() feature + # - Group by ["gameId", "roundId", "stageId"] and assert that the number of groupings matches + # the stageId (which will confirm that it worked) + + # "Case 3": Complex hieararchical grouping + # - (3a) ID: stageID; cumulative: True, within_task: False + # - (3b) ID: stageID; cumulative: True; within_task: True + # - (3c) ID: roundID; cumulative: True, within_task: True + + # Improper examples: + # - grouping keys: ["roundID", "stageID"], ID: "gameID" + # """ + # print("TESTING CASE 2 ....") + # testing_case_2 = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # grouping_keys = ["roundId", "stageId"], + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/tiny_multi_task_case2_level_chat.csv", + # output_file_path_user_level = "./output/user/tiny_multi_task_case2_level_user.csv", + # output_file_path_conv_level = "./output/conv/tiny_multi_task_case2_level_conv.csv", + # turns = False + # ) + # testing_case_2.featurize() + + # print("TESTING CASE 3A .....") + # testing_case_3_a = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # conversation_id_col = "stageId", + # grouping_keys = ["gameId", "roundId", "stageId"], + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # cumulative_grouping = True, + # within_task = False, + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/tiny_multi_task_case3a_level_chat.csv", + # output_file_path_user_level = "./output/user/tiny_multi_task_case3a_level_user.csv", + # output_file_path_conv_level = "./output/conv/tiny_multi_task_case3a_level_conv.csv", + # turns = False + # ) + # testing_case_3_a.featurize() + + # print("TESTING CASE 3B .....") + # testing_case_3_b = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # conversation_id_col = "stageId", + # grouping_keys = ["gameId", "roundId", "stageId"], + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # cumulative_grouping = True, + # within_task = True, + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/tiny_multi_task_case3b_level_chat.csv", + # output_file_path_user_level = "./output/user/tiny_multi_task_case3b_level_user.csv", + # output_file_path_conv_level = "./output/conv/tiny_multi_task_case3b_level_conv.csv", + # turns = False + # ) + # testing_case_3_b.featurize() + + # print("TESTING CASE 3C .....") + # testing_case_3_c = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # conversation_id_col = "roundId", + # grouping_keys = ["gameId", "roundId", "stageId"], + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # cumulative_grouping = True, + # within_task = True, + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/tiny_multi_task_case3c_level_chat.csv", + # output_file_path_user_level = "./output/user/tiny_multi_task_case3c_level_user.csv", + # output_file_path_conv_level = "./output/conv/tiny_multi_task_case3c_level_conv.csv", + # turns = False + # ) + # testing_case_3_c.featurize() + 
+ # print("TESTING IMPROPER CASE .....") + # testing_case_improper = FeatureBuilder( + # input_df = tiny_multi_task_renamed_df, + # conversation_id_col = "gameId", + # grouping_keys = ["roundId", "stageId"], + # speaker_id_col = "speakerId", + # message_col = "text", + # timestamp_col = "time", + # cumulative_grouping = False, + # within_task = True, + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/tiny_multi_task_improper_level_chat.csv", + # output_file_path_user_level = "./output/user/tiny_multi_task_improper_level_user.csv", + # output_file_path_conv_level = "./output/conv/tiny_multi_task_improper_level_conv.csv", + # turns = False + # ) + # testing_case_improper.featurize() """ Test robustness of the FeatureBuilder to taking in an input that contains existing feature names. @@ -165,69 +165,69 @@ ) testing_chat_existing.featurize() - """ - Test robustness of the vector pipeline to weird inputs: - - Super long input - - Input containing only symbols (e.g,. ":-)") - - Empty input - - Input with many spaces - """ - vector_testing_input = pd.read_csv("data/cleaned_data/test_vector_edge_cases.csv", encoding='latin-1') - - test_vectors = FeatureBuilder( - input_df = vector_testing_input, - vector_directory = "./vector_data/", - output_file_path_chat_level = "./output/chat/test_vectors_chat.csv", - output_file_path_user_level = "./output/user/test_vectors_user.csv", - output_file_path_conv_level = "./output/conv/test_vectors_conv.csv", - custom_features = [ - "(BERT) Mimicry", - "Moving Mimicry", - "Forward Flow", - "Discursive Diversity" - ], - turns = False, - regenerate_vectors = True - ) - test_vectors.featurize() - - """ - Test correctness of the custom aggregation pipeline: - - - Aggregate with all the functions for conversation level: [mean, max, min, stdev, median, sum] - - Specify 'mean' as 'average' instead and ensure it shows up correctly - - Aggregate with "mean" for the user level + a fake method (e.g., "foo") - - Aggregate only "second_person_lexical_wordcount" at the conversation level - - Aggregate "positive_bert" at the user level + a fake column (e.g., "bar") + a non-numeric column (e.g., "dale_chall_classification") - """ - - print("Testing custom aggregation...") - custom_agg_fb = FeatureBuilder( - input_df = package_agg_df, - grouping_keys = ["batch_num", "round_num"], - vector_directory = "./vector_data/", - output_file_base = "custom_agg_test" , - convo_methods = ['average', 'max', 'min', 'stdev', 'median', 'sum'], - convo_columns = ['second_person_lexical_wordcount'], # testing functionality in case of typo - user_methods = ['mean', 'foo'], - user_columns = ['positive_bert', 'bar', 'dale_chall_classification'], # testing functionality in case of typo - ) - custom_agg_fb.featurize() - - - """ - Test aggregation piepline when we switch aggregation to false - - (We should only get the default num words, num chars, and num messages aggregated). - """ - - print("Testing aggregation turned off...") - custom_agg_fb_no_agg = FeatureBuilder( - input_df = package_agg_df, - grouping_keys = ["batch_num", "round_num"], - vector_directory = "./vector_data/", - output_file_base = "custom_agg_test_no_agg" , - convo_aggregation = False, - user_aggregation = False, - ) - custom_agg_fb_no_agg.featurize() + # """ + # Test robustness of the vector pipeline to weird inputs: + # - Super long input + # - Input containing only symbols (e.g,. 
":-)") + # - Empty input + # - Input with many spaces + # """ + # vector_testing_input = pd.read_csv("data/cleaned_data/test_vector_edge_cases.csv", encoding='latin-1') + + # test_vectors = FeatureBuilder( + # input_df = vector_testing_input, + # vector_directory = "./vector_data/", + # output_file_path_chat_level = "./output/chat/test_vectors_chat.csv", + # output_file_path_user_level = "./output/user/test_vectors_user.csv", + # output_file_path_conv_level = "./output/conv/test_vectors_conv.csv", + # custom_features = [ + # "(BERT) Mimicry", + # "Moving Mimicry", + # "Forward Flow", + # "Discursive Diversity" + # ], + # turns = False, + # regenerate_vectors = True + # ) + # test_vectors.featurize() + + # """ + # Test correctness of the custom aggregation pipeline: + + # - Aggregate with all the functions for conversation level: [mean, max, min, stdev, median, sum] + # - Specify 'mean' as 'average' instead and ensure it shows up correctly + # - Aggregate with "mean" for the user level + a fake method (e.g., "foo") + # - Aggregate only "second_person_lexical_wordcount" at the conversation level + # - Aggregate "positive_bert" at the user level + a fake column (e.g., "bar") + a non-numeric column (e.g., "dale_chall_classification") + # """ + + # print("Testing custom aggregation...") + # custom_agg_fb = FeatureBuilder( + # input_df = package_agg_df, + # grouping_keys = ["batch_num", "round_num"], + # vector_directory = "./vector_data/", + # output_file_base = "custom_agg_test" , + # convo_methods = ['average', 'max', 'min', 'stdev', 'median', 'sum'], + # convo_columns = ['second_person_lexical_wordcount'], # testing functionality in case of typo + # user_methods = ['mean', 'foo'], + # user_columns = ['positive_bert', 'bar', 'dale_chall_classification'], # testing functionality in case of typo + # ) + # custom_agg_fb.featurize() + + + # """ + # Test aggregation piepline when we switch aggregation to false + + # (We should only get the default num words, num chars, and num messages aggregated). + # """ + + # print("Testing aggregation turned off...") + # custom_agg_fb_no_agg = FeatureBuilder( + # input_df = package_agg_df, + # grouping_keys = ["batch_num", "round_num"], + # vector_directory = "./vector_data/", + # output_file_base = "custom_agg_test_no_agg" , + # convo_aggregation = False, + # user_aggregation = False, + # ) + # custom_agg_fb_no_agg.featurize()