Skip to content

Commit

Permalink
Fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Dec 9, 2024
1 parent 7f67d8b commit 03ce876
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 10 deletions.
14 changes: 5 additions & 9 deletions similarity_framework/src/impl/comparator/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def __init__(self, compare_kind=None, weight=1):
else:
self.kind_weight = weight

def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # todo add type
def compute_embeddings_distance(self, embeddings1, embeddings2) -> float:
"""
Creates a table of distances between the embeddings for each row, computes the mean
of the row minimums and of the column minimums, then picks the maximum.
Expand Down Expand Up @@ -463,7 +463,6 @@ def __create_dist_matrix(self, embeddings1: list[Tensor], embeddings2: list[Tens
for embed1 in embeddings1:
siml_line = []
for embed2 in embeddings2:
# todo rounding for 3 digits ? ok -> two because of minus 0
siml_line.append(
round(
1
Expand Down Expand Up @@ -504,10 +503,9 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data
ratio = get_ratio(categorical1.count_categories, categorical1.count_categories)
result.loc[id1, id2] = dist * ratio
name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2])
# todo p value or correlation
return concat(result, name_distance)

class CategoricalHandlerSimilar(CategoricalHandler):
class CategoricalHandlerSimilar(CategoricalHandler):# pragma: no cover
"""
Handler for column category
"""
Expand Down Expand Up @@ -550,14 +548,13 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data
for id2, (column2, categorical2) in enumerate(metadata2.categorical_metadata.items()):
simil_matrix = self.__create_sim_matrix(categorical1.category_embedding, categorical2.category_embedding)
_, score = self.__compute_similarity_score(simil_matrix)
ratio = get_ratio(categorical1.count_categories, categorical1.count_categories) # todo 1-ratio???
ratio = get_ratio(categorical1.count_categories, categorical1.count_categories)
result.loc[id1, id2] = 1 - (score * ratio)
name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2])
# todo p value or correlation
return concat(result, name_distance)


class KindHandlerOldByType(HandlerType):
class KindHandlerOldByType(HandlerType):# pragma: no cover
"""
Handler for column kind
"""
Expand Down Expand Up @@ -659,7 +656,7 @@ def compare_constants(self, metadata1: Metadata, metadata2: Metadata) -> pd.Data
value_re.loc[column1, column2] = int(meta1.value != meta2.value)
else:
value_re.loc[column1, column2] = 1 - cosine_sim(
meta1.value_embeddings[0], # todo 0 nebo 1
meta1.value_embeddings[0],
meta2.value_embeddings[0],
)

Expand Down Expand Up @@ -767,7 +764,6 @@ def compare_categorical(self, metadata1: Metadata, metadata2: Metadata) -> pd.Da
count1 = metadata1.categorical_metadata[column1].count_categories
count2 = metadata2.categorical_metadata[column2].count_categories
count_re.loc[column1, column2] = count1 / count2 if count1 < count2 else count2 / count1
# todo compare categories_with_count for metadata1 and metadata2
# firstly normalize dictionary categories_with_count then
# compare the difference between the two dictionaries
return concat(value_re, count_re)
Expand Down
24 changes: 23 additions & 1 deletion tests/similarity_framework/test_similarity_comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import unittest

import pandas as pd
from pyarrow import Tensor
from sentence_transformers import SentenceTransformer

from similarity_framework.src.impl.comparator.comparator_by_type import ComparatorByType
from similarity_framework.src.impl.comparator.handlers import HausdorffDistanceMin, SizeHandler, get_ratio, \
Expand All @@ -14,7 +16,7 @@
)
from similarity_framework.src.impl.comparator.distance_functions import AverageDist
from similarity_framework.src.impl.comparator.utils import concat, cosine_sim, fill_result, are_columns_null, create_string_from_columns
from similarity_framework.src.models.metadata import MetadataCreatorInput
from similarity_framework.src.models.metadata import MetadataCreatorInput, Metadata, CategoricalMetadata
from similarity_framework.src.models.similarity import Settings
from similarity_framework.src.impl.metadata.type_metadata_creator import TypeMetadataCreator
from similarity_framework.src.models.types_ import DataKind
Expand Down Expand Up @@ -145,6 +147,8 @@ def setUp(self):
self.compartor.types_compare = False
self.compartor.kinds_compare = False

self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def test_size_compare(self):
self.compartor.add_comparator_type(SizeHandler())

Expand Down Expand Up @@ -211,6 +215,24 @@ def test_kind_ID_compare(self):
self.compartor.compare(self.metadata1, self.metadata1).distance, 0)
self.assertEqual(self.compartor.compare(self.metadata1, self.metadata_diff_column_names).distance, 0)

def test_kind_CATEGORICAL_compare(self):
    """
    Compare hand-built categorical metadata with itself and expect distance 0.

    Two columns ('column_0', 'column_1') are registered under
    DataKind.CATEGORICAL. Both use the labels "One", "Two", "Three" but
    different per-label counts. Only a ColumnKindHandler restricted to
    DataKind.CATEGORICAL is added to the comparator.
    """
    labels = ["One", "Two", "Three"]

    def build_categorical(counts):
        # Every column gets its own label list and freshly encoded
        # embeddings, so nothing is shared between the two fixtures.
        # The leading 3 is presumably the category count -- confirm against
        # the CategoricalMetadata constructor.
        return CategoricalMetadata(
            3,
            list(labels),
            pd.Series(dict(zip(labels, counts))),
            [self.model.encode(label) for label in labels],
        )

    self.compartor.set_types(False)

    metadata = Metadata()
    metadata.column_kind[DataKind.CATEGORICAL] = {'column_0', 'column_1'}
    metadata.categorical_metadata = {
        'column_0': build_categorical([10, 5, 8]),
        'column_1': build_categorical([15, 1, 7]),
    }

    self.compartor.add_comparator_type(ColumnKindHandler(compare_kind=[DataKind.CATEGORICAL]))

    # The same metadata object is passed on both sides of the comparison.
    outcome = self.compartor.compare(metadata, metadata)
    self.assertEqual(outcome.distance, 0)


def test_kind_CONSTANT_compare(self):
self.compartor.set_types(False)
Expand Down

0 comments on commit 03ce876

Please sign in to comment.