-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Amy/package v2 #326
Amy/package v2 #326
Changes from 45 commits
a47e4ea
e3ad8d1
b2ed12a
c6c64bd
e060a44
6a05e80
1c72695
607548a
650197e
d35aeb1
36cd76e
21987f3
119efe4
6d25efd
7e87679
5678567
d75837f
143cb77
28f85f7
8b8bd24
89cd16b
d04037d
bdf7035
ec2ed64
d83f854
7905240
bf762d0
1dad080
ed17d7a
6b94149
576a376
fd50f83
c4200c5
10f325d
653e386
7c9545d
7c73f8d
b0bbb7a
37080e8
7d75712
1c861a3
1da2ecd
b10bdee
d007ae8
3fca434
a36107d
e050fb6
3f31f07
4f562cc
2892a3c
b027d27
84c126e
4825972
3ba2082
1e0d3f2
6a7cccf
51d833f
23b957b
c4d5608
7e8d985
c34ee7f
a5362bb
727b91e
0643dd3
7141bae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,7 +66,8 @@ class FeatureBuilder: | |
:param timestamp_col: A string representing the column name that should be selected as the timestamp. Defaults to "timestamp". | ||
:type timestamp_col: str, optional | ||
|
||
:param grouping_keys: A list of multiple identifiers that collectively identify a conversation. If non-empty, we will group by all of the keys in the list and use the grouped key as the unique "conversational identifier." | ||
:param grouping_keys: A list of multiple identifiers that collectively identify a conversation. If non-empty, we will group by all of the keys in the list and use the | ||
grouped key as the unique "conversational identifier." | ||
Defaults to an empty list. | ||
:type grouping_keys: list, optional | ||
|
||
|
@@ -86,11 +87,31 @@ class FeatureBuilder: | |
:param ner_cutoff: This is the cutoff value for the confidence of prediction for each named entity. Defaults to 0.9. | ||
:type ner_cutoff: int | ||
|
||
:param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False. | ||
:param regenerate_vectors: If true, will regenerate vector data even if it already exists. Defaults to False. | ||
:type regenerate_vectors: bool, optional | ||
|
||
:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (that is, with capitalization and punctuation removed). This was the default behavior for v.0.1.3 and earlier, but we now default to computing metrics on the unpreprocessed text (which INCLUDES capitalization and punctuation). Defaults to False. | ||
:type compute_vectors_from_preprocessed: bool, optional | ||
:param custom_vect_path: If provided, features will be generated using custom vectors rather than default SBERT. Defaults to None. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: need to update documentation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. RESOLVED |
||
:type custom_vect_path: str, optional | ||
|
||
:param convo_aggregation: If true, will aggregate features at the conversational level. Defaults to True. | ||
:type convo_aggregation: bool, optional | ||
|
||
:param convo_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the conversational level. Defaults to ['mean', 'max', 'min', 'stdev']. | ||
:type convo_methods: list, optional | ||
|
||
:param convo_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the conversational level. Defaults to all numeric columns. | ||
:type convo_columns: list, optional | ||
|
||
:param user_aggregation: If true, will aggregate features at the speaker/user level. Defaults to True. | ||
:type user_aggregation: bool, optional | ||
|
||
:param user_methods: Specifies which functions that you want to aggregate with (e.g., mean, stdev...) at the speaker/user level. Defaults to ['mean', 'max', 'min', 'stdev']. | ||
:type user_methods: list, optional | ||
|
||
:param user_columns: Specifies which columns (at the utterance/chat level) that you want aggregated for the speaker/user level. Defaults to all numeric columns. | ||
:type user_columns: list, optional | ||
|
||
:return: The FeatureBuilder doesn't return anything; instead, it writes the generated features to files in the specified paths. It will also print out its progress, so you should see "All Done!" in the terminal, which will indicate that the features have been generated. | ||
:rtype: None | ||
|
@@ -117,7 +138,14 @@ def __init__( | |
ner_training_df: pd.DataFrame = None, | ||
ner_cutoff: int = 0.9, | ||
regenerate_vectors: bool = False, | ||
compute_vectors_from_preprocessed: bool = False | ||
compute_vectors_from_preprocessed: bool = False, | ||
custom_vect_path: str = None, | ||
convo_aggregation = True, | ||
convo_methods: list = ['mean', 'max', 'min', 'stdev'], | ||
convo_columns: list = None, | ||
user_aggregation = True, | ||
user_methods: list = ['mean', 'max', 'min', 'stdev'], | ||
user_columns: list = None | ||
) -> None: | ||
|
||
# Defining input and output paths. | ||
|
@@ -224,6 +252,12 @@ def __init__( | |
self.within_task = within_task | ||
self.ner_cutoff = ner_cutoff | ||
self.regenerate_vectors = regenerate_vectors | ||
self.convo_aggregation = convo_aggregation | ||
self.convo_methods = convo_methods | ||
self.convo_columns = convo_columns | ||
self.user_aggregation = user_aggregation | ||
self.user_methods = user_methods | ||
self.user_columns = user_columns | ||
|
||
if(compute_vectors_from_preprocessed == True): | ||
self.vector_colname = self.message_col # because the message col will eventually get preprocessed | ||
|
@@ -358,7 +392,24 @@ def __init__( | |
if not re.match(r"(.*\/|^)output\/", self.output_file_path_user_level): | ||
self.output_file_path_user_level = re.sub(r'/user/', r'/output/user/', self.output_file_path_user_level) | ||
|
||
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
if custom_vect_path is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: it seems like this PR build in some of the initial infrastructure for custom vectors (document this) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. RESOLVED -- custom vector infrastructure has been removed |
||
print("Detected that user has requested custom vectors...") | ||
print("We will generate features using custom vectors rather than default SBERT") | ||
self.vect_path = custom_vect_path | ||
else: | ||
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
|
||
self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
|
||
if custom_vect_path is not None: | ||
print("Detected that user has requested custom vectors...") | ||
print("We will generate features using custom vectors rather than default SBERT") | ||
self.vect_path = custom_vect_path | ||
else: | ||
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
|
||
self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
|
||
self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name | ||
|
||
# Check + generate embeddings | ||
|
@@ -375,7 +426,11 @@ def __init__( | |
if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]): | ||
need_sentiment = True | ||
|
||
check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname) | ||
# preprocess chat data again | ||
self.preprocess_chat_data() | ||
# preprocess chat data again | ||
self.preprocess_chat_data() | ||
check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.message_col) | ||
|
||
if(need_sentence): | ||
self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman') | ||
|
@@ -487,7 +542,12 @@ def featurize(self) -> None: | |
Path(self.output_file_path_user_level).parent.mkdir(parents=True, exist_ok=True) | ||
Path(self.output_file_path_chat_level).parent.mkdir(parents=True, exist_ok=True) | ||
Path(self.output_file_path_conv_level).parent.mkdir(parents=True, exist_ok=True) | ||
|
||
|
||
# Store column names of what we generated, so that the user can easily access them | ||
self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) | ||
self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) | ||
self.conv_features_all = [col for col in self.conv_data if col not in self.orig_data and col != 'conversation_num'] | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note --- check this; we likely want the last line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. UPDATE: fixed |
||
# Step 3a. Create user level features. | ||
print("Generating User Level Features ...") | ||
self.user_level_features() | ||
|
@@ -497,14 +557,9 @@ def featurize(self) -> None: | |
self.conv_level_features() | ||
self.merge_conv_data_with_original() | ||
|
||
# Step 4. Write the feartures into the files defined in the output paths. | ||
# Step 4. Write the features into the files defined in the output paths. | ||
print("All Done!") | ||
|
||
# Store column names of what we generated, so that the user can easily access them | ||
self.chat_features = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Chat"])) | ||
self.conv_features_base = list(itertools.chain(*[feature_dict[feature]["columns"] for feature in self.feature_names if feature_dict[feature]["level"] == "Conversation"])) | ||
self.conv_features_all = [col for col in self.conv_data if col not in self.orig_data and col != 'conversation_num'] | ||
|
||
self.save_features() | ||
|
||
def preprocess_chat_data(self) -> None: | ||
|
@@ -607,7 +662,11 @@ def user_level_features(self) -> None: | |
vect_data= self.vect_data, | ||
conversation_id_col = self.conversation_id_col, | ||
speaker_id_col = self.speaker_id_col, | ||
input_columns = self.input_columns | ||
input_columns = self.input_columns, | ||
user_aggregation = self.user_aggregation, | ||
user_methods = self.user_methods, | ||
user_columns = self.user_columns, | ||
chat_features = self.chat_features | ||
) | ||
self.user_data = user_feature_builder.calculate_user_level_features() | ||
# Remove special characters in column names | ||
|
@@ -633,7 +692,14 @@ def conv_level_features(self) -> None: | |
speaker_id_col = self.speaker_id_col, | ||
message_col = self.message_col, | ||
timestamp_col = self.timestamp_col, | ||
input_columns = self.input_columns | ||
input_columns = self.input_columns, | ||
convo_aggregation = self.convo_aggregation, | ||
convo_methods = self.convo_methods, | ||
convo_columns = self.convo_columns, | ||
user_aggregation = self.user_aggregation, | ||
user_methods = self.user_methods, | ||
user_columns = self.user_columns, | ||
chat_features = self.chat_features, | ||
) | ||
# Calling the driver inside this class to create the features. | ||
self.conv_data = conv_feature_builder.calculate_conversation_level_features(self.feature_methods_conv) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@amytangzheng I think this is a reference to a local path and it needs to be updated!