From e09c45d3bd706096eb96a03bd2c7d6fb64dcc679 Mon Sep 17 00:00:00 2001
From: OlivieFranklova
Date: Tue, 26 Nov 2024 17:53:36 +0100
Subject: [PATCH] add suggestions from PR

---
 column2vec/src/column2vec.py                       | 1 -
 similarity_framework/main.py                       | 8 ++++----
 .../src/impl/comparator/comparator_by_column.py    | 3 ++-
 .../src/impl/comparator/distance_functions.py      | 1 +
 similarity_framework/src/impl/comparator/utils.py  | 4 ++--
 similarity_framework/src/models/metadata.py        | 1 -
 similarity_framework/src/models/similarity.py      | 1 -
 tests/column2vec/test_column2vec.py                | 2 +-
 8 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/column2vec/src/column2vec.py b/column2vec/src/column2vec.py
index ed0d603..03fdf9b 100644
--- a/column2vec/src/column2vec.py
+++ b/column2vec/src/column2vec.py
@@ -60,7 +60,6 @@ def get_cache(self, key: str, function: str) -> list | None:
         tmp = self.__cache.loc[function, key]
         if (tmp != "nan" and tmp is not int) or (tmp is int and not math.isnan(tmp)):
             return json.loads(tmp) # json is faster than ast
-        # print(f"NO CACHE key: {key}, function: {function}")
         return None
 
     def save(
diff --git a/similarity_framework/main.py b/similarity_framework/main.py
index d431921..9bed26a 100644
--- a/similarity_framework/main.py
+++ b/similarity_framework/main.py
@@ -22,10 +22,6 @@
 
 
 def create_metadata(data):
-    """
-    This function creates metadata
-    :return created metadata
-    """
     return (TypeMetadataCreator(data).compute_advanced_structural_types().compute_column_kind().compute_column_names_embeddings()).get_metadata()
 
 
@@ -43,14 +39,18 @@ def compare_datasets(path1, path2):
     metadata2 = create_metadata(data2)
     comparator_by_column = (
         ComparatorByColumn()
+        ## different option
         # .add_comparator_type(SizeComparatorByColumn())
         .add_comparator_type(IncompleteColumnsComparatorByColumn()).add_comparator_type(ColumnNamesEmbeddingsComparatorByColumn())
+        ## different option
         # .add_comparator_type(ColumnKindHandler())
     )
     compartor = (
         ComparatorByType()
+        ## different option
         # .add_comparator_type(SizeHandler())
         .add_comparator_type(IncompleteColumnsHandler())
+        ## different option
         # .add_comparator_type(KindHandler())
         .add_comparator_type(ColumnNamesEmbeddingsHandler())
     )
diff --git a/similarity_framework/src/impl/comparator/comparator_by_column.py b/similarity_framework/src/impl/comparator/comparator_by_column.py
index e481448..605c37c 100644
--- a/similarity_framework/src/impl/comparator/comparator_by_column.py
+++ b/similarity_framework/src/impl/comparator/comparator_by_column.py
@@ -185,6 +185,7 @@ def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # tod
         :param embeddings2: values for column2
         :return: float from 0 to 1
         """
+        # alternative version
         # res = pd.DataFrame()
         # row_mins = []
         # for id1, embed1 in enumerate(embeddings1):
@@ -273,7 +274,7 @@ def compare_constants(
             value: float = 0 if metadata1.value == metadata2.value else 1
         else:
             value = 1 - cosine_sim(
-                metadata1.value_embeddings[0], # todo 0 nebo 1
+                metadata1.value_embeddings[0],
                 metadata2.value_embeddings[0],
             )
         # if nulls are equal and exist
diff --git a/similarity_framework/src/impl/comparator/distance_functions.py b/similarity_framework/src/impl/comparator/distance_functions.py
index 0772042..97ccdd3 100644
--- a/similarity_framework/src/impl/comparator/distance_functions.py
+++ b/similarity_framework/src/impl/comparator/distance_functions.py
@@ -32,6 +32,7 @@ def compute(self, distance_table: pd.DataFrame) -> float:
         if distance_table.size == 0:
             return np.nan
         row_avg = distance_table.min(axis=1)
+        # alternative add these lines
         # column_avg = distance_table.min(axis=0)
         # return min(row_avg.mean(), column_avg.mean())
         return row_avg.mean()
diff --git a/similarity_framework/src/impl/comparator/utils.py b/similarity_framework/src/impl/comparator/utils.py
index c168003..af9d74f 100644
--- a/similarity_framework/src/impl/comparator/utils.py
+++ b/similarity_framework/src/impl/comparator/utils.py
@@ -20,7 +20,7 @@ def concat(*data_frames: pd.DataFrame) -> pd.DataFrame:
     return res.map(lambda x: x / len(data_frames))
 
 
-def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to functions.py?
+def cosine_sim(u: list | Tensor, v: list | Tensor) -> float:
     """
     Compute cosine similarity (range 0 to 1) 1 teh same 0 completely different
     :param u: embeddings 1
@@ -30,7 +30,7 @@ def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to fun
     return round(
         np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)),
         3,
-    ) # todo change rounding to 4, 5 6 ...etc
+    )
 
 
 def get_ratio(count1: int, count2: int) -> float:
diff --git a/similarity_framework/src/models/metadata.py b/similarity_framework/src/models/metadata.py
index 69f7841..7e53840 100644
--- a/similarity_framework/src/models/metadata.py
+++ b/similarity_framework/src/models/metadata.py
@@ -114,7 +114,6 @@ def __init__(
         self.nulls = null_values
         self.value = value
         self.distribution = distribution
-        # model.encode(list(value)).view(-1, 1)
         self.value_embeddings = None if type(value[0]) is not str else model.encode(list(value))
 
     def __str__(self):
diff --git a/similarity_framework/src/models/similarity.py b/similarity_framework/src/models/similarity.py
index 60de285..9d9d035 100644
--- a/similarity_framework/src/models/similarity.py
+++ b/similarity_framework/src/models/similarity.py
@@ -13,7 +13,6 @@ class SimilarityConfiguration:
 @dataclass
 class SimilarityOutput:
     distance: float
-    # TODO: Thesis add other proper fields
 
 
 @dataclass
diff --git a/tests/column2vec/test_column2vec.py b/tests/column2vec/test_column2vec.py
index ac5ea78..e55b59f 100644
--- a/tests/column2vec/test_column2vec.py
+++ b/tests/column2vec/test_column2vec.py
@@ -15,6 +15,7 @@
 SKIP_CLUSTERS = True
 SKIP_SIMILAR = False
 
+# alternative model
 # MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens
 MODEL = 'bert-base-nli-mean-tokens'
 # THIS_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -32,7 +33,6 @@ def get_vectors(function, data):
     result = {}
    count = 1
     for key in data:
-        # print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
         result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
             'clean_up_tokenization_spaces': True}), key)
         count += 1
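
The distance_functions.py hunk above adds a comment pointing at a symmetric alternative to the row-only aggregation in compute(). The sketch below is illustrative only and not part of the patch: the function names and the toy distance table are hypothetical, and it only assumes the distance table is a pandas DataFrame of pairwise column distances (rows from the first dataset, columns from the second, values in [0, 1]).

# Illustrative sketch only -- not part of the patch. Function names and data
# are hypothetical; assumes a pandas DataFrame of pairwise column distances.
import numpy as np
import pandas as pd


def row_min_mean(distance_table: pd.DataFrame) -> float:
    """Current behaviour of compute(): average of each row's best match."""
    if distance_table.size == 0:
        return np.nan
    return distance_table.min(axis=1).mean()


def symmetric_min_mean(distance_table: pd.DataFrame) -> float:
    """Suggested alternative: the smaller of the two directional averages."""
    if distance_table.size == 0:
        return np.nan
    row_avg = distance_table.min(axis=1)
    column_avg = distance_table.min(axis=0)
    return min(row_avg.mean(), column_avg.mean())


if __name__ == "__main__":
    # Dataset 1 has three columns, dataset 2 only two, so one row has no close match.
    table = pd.DataFrame([[0.1, 0.9],
                          [0.8, 0.2],
                          [0.7, 0.6]])
    print(round(row_min_mean(table), 3))        # (0.1 + 0.2 + 0.6) / 3 = 0.3
    print(round(symmetric_min_mean(table), 3))  # min(0.3, (0.1 + 0.2) / 2) = 0.15

Taking the minimum of the two directional averages keeps the score low whenever either dataset's columns are all well matched, whereas the row-only average also penalises extra columns in the first dataset that have no counterpart.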