Skip to content

Commit fe6c08c

Browse files
Fix the clean_up_tokenization_spaces warning by passing the option explicitly via tokenizer_kwargs
1 parent 5ea367d commit fe6c08c

File tree

4 files changed

+11
-5
lines changed

4 files changed

+11
-5
lines changed

column2Vec/research/column2Vec_re.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@
4848
]
4949
MODEL = "paraphrase-multilingual-mpnet-base-v2" # 'bert-base-nli-mean-tokens'
5050
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
51-
model = SentenceTransformer(MODEL)
51+
model = SentenceTransformer(MODEL, tokenizer_kwargs={
52+
'clean_up_tokenization_spaces': True})
5253

5354

5455
def count_embedding(column1: pd.Series, function, key: str) -> pd.Series:

constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ class TrainedModel:
5151
"""
5252

5353
configure()
54-
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
54+
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", tokenizer_kwargs={
55+
'clean_up_tokenization_spaces': True,
56+
})
5557

5658
def set_module(self, model: SentenceTransformer):
5759
"""

similarity/DataFrameMetadataCreator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ def __init__(self, dataframe: pd.DataFrame):
6363
"""
6464
self.dataframe = dataframe
6565
self.metadata = DataFrameMetadata()
66-
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens")
66+
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
67+
'clean_up_tokenization_spaces': True})
6768
self.metadata.size = dataframe.shape[0]
6869
self.metadata.column_names = list(dataframe.columns)
6970
self.metadata.column_names_clean = {i: re.sub("[^(0-9 |a-z).]", " ", i.lower()) for i in self.metadata.column_names}
@@ -152,7 +153,8 @@ def __get_model(self) -> SentenceTransformer:
152153
:return: embedding model if exists or creates new one
153154
"""
154155
if not self.model:
155-
self.model = SentenceTransformer("bert-base-nli-mean-tokens")
156+
self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
157+
'clean_up_tokenization_spaces': True})
156158
return self.model
157159

158160
# Setting Creator

test/test_column2Vec.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ def get_vectors(function, data):
3333
count = 1
3434
for key in data:
3535
# print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
36-
result[key] = function(data[key], SentenceTransformer(MODEL), key)
36+
result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
37+
'clean_up_tokenization_spaces': True}), key)
3738
count += 1
3839
end = time.time()
3940
print(f"ELAPSED TIME :{end - start}")

0 commit comments

Comments
 (0)