@@ -21,8 +21,7 @@ def assert_key_columns_present(df: pd.DataFrame, column_names: dict) -> None:
2121 # message_col = column_names['message_col']
2222
2323 # remove all special characters from df
24- # df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True) # NOTE: This is moved to preprocess_conversation_columns
25-
24+ df .columns = df .columns .str .replace ('[^A-Za-z0-9_]' , '' , regex = True )
2625 # Assert that key columns are present
2726 for role , col in column_names .items ():
2827 if role == 'timestamp_col' :
@@ -31,6 +30,7 @@ def assert_key_columns_present(df: pd.DataFrame, column_names: dict) -> None:
3130 raise KeyError (f"Missing required columns in DataFrame: '{ col } ' (expected for { role } )\n Columns available: { df .columns } " )
3231 else :
3332 print (f"Confirmed that data has { role } column: { col } !" )
33+ df [col ] = df [col ].fillna ('' )
3434
3535 # if {conversation_id_col, speaker_id_col, message_col}.issubset(df.columns):
3636 # print(f"Confirmed that data has conversation_id: {conversation_id_col}, speaker_id: {speaker_id_col} and message: {message_col} columns!")
@@ -66,14 +66,6 @@ def preprocess_conversation_columns(df: pd.DataFrame, column_names: dict, groupi
6666 :return: The preprocessed DataFrame with a conversation number column.
6767 :rtype: pd.DataFrame
6868 """
69-
70- # remove all special characters from df
71- df .columns = df .columns .str .replace ('[^A-Za-z0-9_]' , '' , regex = True )
72- # fillna
73- for role , col in column_names .items ():
74- if role == 'timestamp_col' :
75- continue
76- df [col ] = df [col ].fillna ('' )
7769
7870 if not grouping_keys : # case 1: single identifier
7971 return df
0 commit comments