Skip to content

Commit

Permalink
Fix `clean_up_tokenization_spaces` tokenization warning
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Oct 1, 2024
1 parent 5ea367d commit fe6c08c
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 5 deletions.
3 changes: 2 additions & 1 deletion column2Vec/research/column2Vec_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
]
MODEL = "paraphrase-multilingual-mpnet-base-v2" # 'bert-base-nli-mean-tokens'
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
model = SentenceTransformer(MODEL)
model = SentenceTransformer(MODEL, tokenizer_kwargs={
'clean_up_tokenization_spaces': True})


def count_embedding(column1: pd.Series, function, key: str) -> pd.Series:
Expand Down
4 changes: 3 additions & 1 deletion constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ class TrainedModel:
"""

configure()
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", tokenizer_kwargs={
'clean_up_tokenization_spaces': True,
})

def set_module(self, model: SentenceTransformer):
"""
Expand Down
6 changes: 4 additions & 2 deletions similarity/DataFrameMetadataCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def __init__(self, dataframe: pd.DataFrame):
"""
self.dataframe = dataframe
self.metadata = DataFrameMetadata()
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens")
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
'clean_up_tokenization_spaces': True})
self.metadata.size = dataframe.shape[0]
self.metadata.column_names = list(dataframe.columns)
self.metadata.column_names_clean = {i: re.sub("[^(0-9 |a-z).]", " ", i.lower()) for i in self.metadata.column_names}
Expand Down Expand Up @@ -152,7 +153,8 @@ def __get_model(self) -> SentenceTransformer:
:return: embedding model if exists or creates new one
"""
if not self.model:
self.model = SentenceTransformer("bert-base-nli-mean-tokens")
self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
'clean_up_tokenization_spaces': True})
return self.model

# Setting Creator
Expand Down
3 changes: 2 additions & 1 deletion test/test_column2Vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def get_vectors(function, data):
count = 1
for key in data:
# print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
result[key] = function(data[key], SentenceTransformer(MODEL), key)
result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
'clean_up_tokenization_spaces': True}), key)
count += 1
end = time.time()
print(f"ELAPSED TIME :{end - start}")
Expand Down

0 comments on commit fe6c08c

Please sign in to comment.