diff --git a/column2Vec/research/column2Vec_re.py b/column2Vec/research/column2Vec_re.py
index 1ba4b02..88224a4 100644
--- a/column2Vec/research/column2Vec_re.py
+++ b/column2Vec/research/column2Vec_re.py
@@ -48,7 +48,8 @@
 ]
 MODEL = "paraphrase-multilingual-mpnet-base-v2"  # 'bert-base-nli-mean-tokens'
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-model = SentenceTransformer(MODEL)
+model = SentenceTransformer(MODEL, tokenizer_kwargs={
+    'clean_up_tokenization_spaces': True})
 
 
 def count_embedding(column1: pd.Series, function, key: str) -> pd.Series:
diff --git a/constants.py b/constants.py
index 1530cff..0bb6719 100644
--- a/constants.py
+++ b/constants.py
@@ -51,7 +51,9 @@ class TrainedModel:
     """
 
     configure()
-    __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
+    __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", tokenizer_kwargs={
+        'clean_up_tokenization_spaces': True,
+    })
 
     def set_module(self, model: SentenceTransformer):
         """
diff --git a/similarity/DataFrameMetadataCreator.py b/similarity/DataFrameMetadataCreator.py
index d798b07..504fc6e 100644
--- a/similarity/DataFrameMetadataCreator.py
+++ b/similarity/DataFrameMetadataCreator.py
@@ -63,7 +63,8 @@ def __init__(self, dataframe: pd.DataFrame):
         """
         self.dataframe = dataframe
         self.metadata = DataFrameMetadata()
-        self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens")
+        self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
+            'clean_up_tokenization_spaces': True})
         self.metadata.size = dataframe.shape[0]
         self.metadata.column_names = list(dataframe.columns)
         self.metadata.column_names_clean = {i: re.sub("[^(0-9 |a-z).]", " ", i.lower()) for i in self.metadata.column_names}
@@ -152,7 +153,8 @@ def __get_model(self) -> SentenceTransformer:
         :return: embedding model if exists or creates new one
         """
         if not self.model:
-            self.model = SentenceTransformer("bert-base-nli-mean-tokens")
+            self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
+                'clean_up_tokenization_spaces': True})
         return self.model
 
     # Setting Creator
diff --git a/test/test_column2Vec.py b/test/test_column2Vec.py
index 7e09015..afb9713 100644
--- a/test/test_column2Vec.py
+++ b/test/test_column2Vec.py
@@ -33,7 +33,8 @@ def get_vectors(function, data):
     count = 1
     for key in data:
         # print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
-        result[key] = function(data[key], SentenceTransformer(MODEL), key)
+        result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
+            'clean_up_tokenization_spaces': True}), key)
         count += 1
     end = time.time()
     print(f"ELAPSED TIME :{end - start}")