@@ -21,8 +21,7 @@ def assert_key_columns_present(df: pd.DataFrame, column_names: dict) -> None:
21
21
# message_col = column_names['message_col']
22
22
23
23
# Strip every character except letters, digits and underscores from the column names (cell values are left untouched)
24
- # df.columns = df.columns.str.replace('[^A-Za-z0-9_]', '', regex=True) # NOTE: This is moved to preprocess_conversation_columns
25
-
24
+ df .columns = df .columns .str .replace ('[^A-Za-z0-9_]' , '' , regex = True )
26
25
# Assert that key columns are present
27
26
for role , col in column_names .items ():
28
27
if role == 'timestamp_col' :
@@ -31,6 +30,7 @@ def assert_key_columns_present(df: pd.DataFrame, column_names: dict) -> None:
31
30
raise KeyError (f"Missing required columns in DataFrame: '{ col } ' (expected for { role } )\n Columns available: { df .columns } " )
32
31
else :
33
32
print (f"Confirmed that data has { role } column: { col } !" )
33
+ df [col ] = df [col ].fillna ('' )
34
34
35
35
# if {conversation_id_col, speaker_id_col, message_col}.issubset(df.columns):
36
36
# print(f"Confirmed that data has conversation_id: {conversation_id_col}, speaker_id: {speaker_id_col} and message: {message_col} columns!")
@@ -66,14 +66,6 @@ def preprocess_conversation_columns(df: pd.DataFrame, column_names: dict, groupi
66
66
:return: The preprocessed DataFrame with a conversation number column.
67
67
:rtype: pd.DataFrame
68
68
"""
69
-
70
- # remove all special characters from df
71
- df .columns = df .columns .str .replace ('[^A-Za-z0-9_]' , '' , regex = True )
72
- # fillna
73
- for role , col in column_names .items ():
74
- if role == 'timestamp_col' :
75
- continue
76
- df [col ] = df [col ].fillna ('' )
77
69
78
70
if not grouping_keys : # case 1: single identifier
79
71
return df
0 commit comments