Skip to content

Commit

Permalink
Format with black
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Dec 9, 2024
1 parent 03ce876 commit 537b22d
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 67 deletions.
6 changes: 2 additions & 4 deletions similarity_framework/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@


def create_metadata(data):
return (TypeMetadataCreator().
compute_advanced_structural_types()
.compute_column_kind().compute_column_names_embeddings()).get_metadata(data)
return (TypeMetadataCreator().compute_advanced_structural_types().compute_column_kind().compute_column_names_embeddings()).get_metadata(data)


def compare_datasets(path1:str, path2):
def compare_datasets(path1: str, path2):
"""
This function compares two tables
It will read datasets, create metadata and comparator, compare them
Expand Down
13 changes: 10 additions & 3 deletions similarity_framework/src/impl/comparator/comparator_by_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,22 @@


from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin, AverageDist
from similarity_framework.src.impl.comparator.handlers import SizeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, ColumnNamesEmbeddingsHandler, \
ColumnEmbeddingsHandler, ColumnKindHandler, ColumnTypeHandler, TableHandler
from similarity_framework.src.impl.comparator.handlers import (
SizeHandler,
IncompleteColumnsHandler,
ColumnExactNamesHandler,
ColumnNamesEmbeddingsHandler,
ColumnEmbeddingsHandler,
ColumnKindHandler,
ColumnTypeHandler,
TableHandler,
)
from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator
from similarity_framework.src.models.metadata import Metadata
from similarity_framework.src.models.similarity import SimilarityOutput
from similarity_framework.src.models.settings import AnalysisSettings



class ComparatorByColumn(Comparator):
"""
Comparator for comparing two tables
Expand Down
83 changes: 46 additions & 37 deletions similarity_framework/src/impl/comparator/comparator_by_type.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
from __future__ import annotations

import math
from statistics import mean

import numpy as np
import pandas as pd
from torch import Tensor

from logging_ import logger
from similarity_framework.src.impl.comparator.comparator_by_column import ColumnTypeHandler, IncompleteColumnsHandler, ColumnExactNamesHandler, \
ColumnNamesEmbeddingsHandler, ColumnEmbeddingsHandler, SizeHandler, ColumnKindHandler
from similarity_framework.src.impl.comparator.utils import cosine_sim, get_ratio, concat, fill_result
from similarity_framework.src.interfaces.common import DistanceFunction
from similarity_framework.src.impl.comparator.comparator_by_column import (
ColumnTypeHandler,
IncompleteColumnsHandler,
ColumnExactNamesHandler,
ColumnNamesEmbeddingsHandler,
ColumnEmbeddingsHandler,
SizeHandler,
ColumnKindHandler,
)
from similarity_framework.src.impl.comparator.utils import get_ratio, concat
from similarity_framework.src.impl.comparator.distance_functions import HausdorffDistanceMin, AverageDist
from similarity_framework.src.interfaces.comparator.comparator import HandlerType, Comparator
from similarity_framework.src.models.metadata import Metadata
from similarity_framework.src.models.similarity import SimilarityOutput, Settings
from similarity_framework.src.models.types_ import DataKind, Type
from similarity_framework.src.models.settings import AnalysisSettings


Expand Down Expand Up @@ -59,6 +62,7 @@ def __init__(self):
self.types_compare = True
self.kind_weight = 1
self.type_weight = 1

def set_kinds(self, value: bool) -> "ComparatorByType":
"""
Set if kinds should be compared
Expand All @@ -80,9 +84,9 @@ def add_comparator_type(self, comparator: HandlerType) -> "ComparatorByType":
self.comparator_type.append(comparator)
return self

def __compare_all_columns(self, metadata1: Metadata, metadata2: Metadata,
column_names1: set[str], column_names2: set[str],
comparators: list[HandlerType]) -> pd.DataFrame:
def __compare_all_columns(
self, metadata1: Metadata, metadata2: Metadata, column_names1: set[str], column_names2: set[str], comparators: list[HandlerType]
) -> pd.DataFrame:
all_compares = []
for comparator in comparators:
col_to_col = pd.DataFrame()
Expand All @@ -91,25 +95,22 @@ def __compare_all_columns(self, metadata1: Metadata, metadata2: Metadata,
result = comparator.compare(metadata1, metadata2, index1=name1, index2=name2)
if result is not np.nan:
col_to_col.loc[idx1, idx2] = result
if not col_to_col.empty: all_compares.append(col_to_col) # todo add , comparator.weight
if not col_to_col.empty:
all_compares.append(col_to_col) # todo add , comparator.weight
return pd.DataFrame if all_compares == [] else concat(*all_compares)

def __compare_types(self, type_, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame:
comparators = self.comparator_type.copy()
if self.types_compare: comparators.append(ColumnTypeHandler())
all_compares = self.__compare_all_columns(metadata1, metadata2,
metadata1.column_type[type_],
metadata2.column_type[type_],
comparators)
if self.types_compare:
comparators.append(ColumnTypeHandler())
all_compares = self.__compare_all_columns(metadata1, metadata2, metadata1.column_type[type_], metadata2.column_type[type_], comparators)
return all_compares

def __compare_kinds(self, kind, metadata1: Metadata, metadata2: Metadata) -> pd.DataFrame:
comparators = self.comparator_type.copy()
if self.kinds_compare: comparators.append(ColumnKindHandler())
all_compares = self.__compare_all_columns(metadata1, metadata2,
metadata1.column_kind[kind],
metadata2.column_kind[kind],
comparators)
if self.kinds_compare:
comparators.append(ColumnKindHandler())
all_compares = self.__compare_all_columns(metadata1, metadata2, metadata1.column_kind[kind], metadata2.column_kind[kind], comparators)
return all_compares

def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput:
Expand All @@ -123,35 +124,43 @@ def _compare(self, metadata1: Metadata, metadata2: Metadata) -> SimilarityOutput
continue
dist_table = self.__compare_types(type_, metadata1, metadata2)
if not dist_table.empty:
distances.append((self.distance_function.compute(dist_table),
get_ratio(
dist_table.shape[0],
dist_table.shape[1],
),
self.type_weight))
distances.append(
(
self.distance_function.compute(dist_table),
get_ratio(
dist_table.shape[0],
dist_table.shape[1],
),
self.type_weight,
)
)
if self.kinds:
for kind in metadata1.column_kind.keys():
if metadata1.column_kind[kind] != () and metadata2.column_kind[kind] != ():
if metadata1.column_kind[kind] != () and metadata2.column_kind[kind] != ():
dist_table = self.__compare_kinds(kind, metadata1, metadata2)
if not dist_table.empty:
distances.append((self.distance_function.compute(dist_table),
get_ratio(
dist_table.shape[0],
dist_table.shape[1],
),
self.kind_weight))
distances.append(
(
self.distance_function.compute(dist_table),
get_ratio(
dist_table.shape[0],
dist_table.shape[1],
),
self.kind_weight,
)
)

result = 0
nan = 0
sum_weight = sum([weight for _,_, weight in distances if not np.isnan(weight)])
sum_weight = sum([weight for _, _, weight in distances if not np.isnan(weight)])
for dist, ratio, weight in distances:
if math.isnan(dist):
nan += 1
continue
if Settings.NO_RATIO in self.settings:
result += dist * dist * weight/sum_weight
result += dist * dist * weight / sum_weight
else:
result += dist * dist * ratio * weight/sum_weight
result += dist * dist * ratio * weight / sum_weight
if nan == len(distances):
return SimilarityOutput(distance=1)
return SimilarityOutput(distance=np.sqrt(result))
30 changes: 16 additions & 14 deletions similarity_framework/src/impl/comparator/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,16 @@ def compare_bools(
:return: float number in range <0, 1>
"""
nulls = 0 if metadata1.nulls == metadata2.nulls else 1
dist1 = metadata1.distribution[0] / metadata1.distribution[1] if metadata1.distribution[1] > metadata1.distribution[0] else metadata1.distribution[1] / metadata1.distribution[0]
dist2 = metadata2.distribution[0] / metadata2.distribution[1] if metadata2.distribution[1] > metadata2.distribution[0] else metadata2.distribution[1] / metadata2.distribution[0]
dist1 = (
metadata1.distribution[0] / metadata1.distribution[1]
if metadata1.distribution[1] > metadata1.distribution[0]
else metadata1.distribution[1] / metadata1.distribution[0]
)
dist2 = (
metadata2.distribution[0] / metadata2.distribution[1]
if metadata2.distribution[1] > metadata2.distribution[0]
else metadata2.distribution[1] / metadata2.distribution[0]
)
distr = abs(dist1 - dist2)
if metadata1.value_embeddings is None or metadata2.value_embeddings is None:
return (nulls + distr) / 2
Expand Down Expand Up @@ -374,9 +382,7 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str,

class ColumnTypeHandler(SpecificColumnHandler):

def __numerical_compare1(
self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int
) -> float:
def __numerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int) -> float:
num_met1 = metadata1.numerical_metadata[index1]
num_met2 = metadata2.numerical_metadata[index2]
if num_met1.same_value_length == num_met2.same_value_length:
Expand All @@ -393,9 +399,7 @@ def __numerical_compare1(
score += 2
return 1 - score / 9

def __nonnumerical_compare1(
self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int
) -> float:
def __nonnumerical_compare1(self, metadata1: Metadata, metadata2: Metadata, index1: str, index2: str, score: int) -> float:
num_met1 = metadata1.nonnumerical_metadata[index1]
num_met2 = metadata2.nonnumerical_metadata[index2]
if num_met1.longest == num_met2.longest or num_met1.longest is num_met2.longest:
Expand Down Expand Up @@ -431,9 +435,6 @@ def _inner_compare(self, metadata1: Metadata, metadata2: Metadata, index1: str,
return 1





class CategoricalHandler(HandlerType):
"""
Categorical Handler class
Expand Down Expand Up @@ -505,7 +506,8 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data
name_distance.loc[id1, id2] = 1 - cosine_sim(metadata1.column_name_embeddings[column1], metadata2.column_name_embeddings[column2])
return concat(result, name_distance)

class CategoricalHandlerSimilar(CategoricalHandler):# pragma: no cover

class CategoricalHandlerSimilar(CategoricalHandler): # pragma: no cover
"""
Handler for column category
"""
Expand Down Expand Up @@ -554,7 +556,7 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data
return concat(result, name_distance)


class KindHandlerOldByType(HandlerType):# pragma: no cover
class KindHandlerOldByType(HandlerType): # pragma: no cover
"""
Handler for column kind
"""
Expand Down Expand Up @@ -808,4 +810,4 @@ def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.Data
self.settings,
self.kind_weight[DataKind.CATEGORICAL],
)
return pd.DataFrame([result])
return pd.DataFrame([result])
1 change: 0 additions & 1 deletion similarity_framework/src/impl/comparator/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
import os

import numpy as np
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ def __init__(self):
True for incomplete data and False otherwise
"""
super().__init__()
self.model: Optional[SentenceTransformer] = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True})
self.model: Optional[SentenceTransformer] = SentenceTransformer(
"sentence-transformers/all-mpnet-base-v2", tokenizer_kwargs={"clean_up_tokenization_spaces": True}
)

def __normalize(self, num1: int, num2: int) -> tuple[int, int]:
"""
Expand Down Expand Up @@ -165,7 +167,7 @@ def get_model(self) -> SentenceTransformer:
:return: the embedding model, creating a new one if none exists
"""
if not self.model:
self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', tokenizer_kwargs={"clean_up_tokenization_spaces": True})
self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", tokenizer_kwargs={"clean_up_tokenization_spaces": True})
return self.model

# Setting Creator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from similarity_framework.src.models.similarity import Settings, SimilarityOutput
from similarity_framework.src.models.settings import AnalysisSettings


class Comparator(ABC):
"""
Abstract Comparator class
Expand Down
1 change: 0 additions & 1 deletion similarity_framework/src/interfaces/comparator/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,3 @@ def __init__(self, weight: int = 1, analysis_settings: AnalysisSettings = None):
@abstractmethod
def compare(self, metadata1: Metadata, metadata2: Metadata, **kwargs) -> pd.DataFrame | float:
"""This method should compare two tables and return a distance table"""

5 changes: 0 additions & 5 deletions similarity_runner/src/interfaces/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,3 @@ def run(self):
result[(first.name, second.name)] = comparator.compare(first, second, analysis_settings)
# TODO: based on analysis settings get specified metadata objects
self.show(result, analysis_settings)





0 comments on commit 537b22d

Please sign in to comment.