Skip to content

Commit

Permalink
add suggestions from PR
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Nov 26, 2024
1 parent bdcf6e2 commit e09c45d
Show file tree
Hide file tree
Showing 8 changed files with 10 additions and 11 deletions.
1 change: 0 additions & 1 deletion column2vec/src/column2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def get_cache(self, key: str, function: str) -> list | None:
tmp = self.__cache.loc[function, key]
if (tmp != "nan" and tmp is not int) or (tmp is int and not math.isnan(tmp)):
return json.loads(tmp) # json is faster than ast
# print(f"NO CACHE key: {key}, function: {function}")
return None

def save(
Expand Down
8 changes: 4 additions & 4 deletions similarity_framework/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@


def create_metadata(data):
    """
    Build metadata for the given data using TypeMetadataCreator.

    The creator is asked, in this order, to compute advanced structural
    types, column kinds and column-name embeddings before the metadata
    is retrieved.
    :param data: input handed unchanged to TypeMetadataCreator
    :return: whatever the creator's get_metadata() returns
    """
    creator = TypeMetadataCreator(data)
    # Each step's return value is used for the next call, exactly as in a chain.
    creator = creator.compute_advanced_structural_types()
    creator = creator.compute_column_kind()
    creator = creator.compute_column_names_embeddings()
    return creator.get_metadata()


Expand All @@ -43,14 +39,18 @@ def compare_datasets(path1, path2):
metadata2 = create_metadata(data2)
comparator_by_column = (
ComparatorByColumn()
## different option
# .add_comparator_type(SizeComparatorByColumn())
.add_comparator_type(IncompleteColumnsComparatorByColumn()).add_comparator_type(ColumnNamesEmbeddingsComparatorByColumn())
## different option
# .add_comparator_type(ColumnKindHandler())
)
compartor = (
ComparatorByType()
## different option
# .add_comparator_type(SizeHandler())
.add_comparator_type(IncompleteColumnsHandler())
## different option
# .add_comparator_type(KindHandler())
.add_comparator_type(ColumnNamesEmbeddingsHandler())
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # tod
:param embeddings2: values for column2
:return: float from 0 to 1
"""
# alternative version
# res = pd.DataFrame()
# row_mins = []
# for id1, embed1 in enumerate(embeddings1):
Expand Down Expand Up @@ -273,7 +274,7 @@ def compare_constants(
value: float = 0 if metadata1.value == metadata2.value else 1
else:
value = 1 - cosine_sim(
metadata1.value_embeddings[0], # todo 0 nebo 1
metadata1.value_embeddings[0],
metadata2.value_embeddings[0],
)
# if nulls are equal and exist
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def compute(self, distance_table: pd.DataFrame) -> float:
if distance_table.size == 0:
return np.nan
row_avg = distance_table.min(axis=1)
# alternative add these lines
# column_avg = distance_table.min(axis=0)
# return min(row_avg.mean(), column_avg.mean())
return row_avg.mean()
4 changes: 2 additions & 2 deletions similarity_framework/src/impl/comparator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def concat(*data_frames: pd.DataFrame) -> pd.DataFrame:
return res.map(lambda x: x / len(data_frames))


def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to functions.py?
def cosine_sim(u: list | Tensor, v: list | Tensor) -> float:
"""
Compute cosine similarity (range 0 to 1): 1 means the same, 0 means completely different
:param u: embeddings 1
Expand All @@ -30,7 +30,7 @@ def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to fun
return round(
np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)),
3,
) # todo change rounding to 4, 5 6 ...etc
)


def get_ratio(count1: int, count2: int) -> float:
Expand Down
1 change: 0 additions & 1 deletion similarity_framework/src/models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def __init__(
self.nulls = null_values
self.value = value
self.distribution = distribution
# model.encode(list(value)).view(-1, 1)
self.value_embeddings = None if type(value[0]) is not str else model.encode(list(value))

def __str__(self):
Expand Down
1 change: 0 additions & 1 deletion similarity_framework/src/models/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ class SimilarityConfiguration:
@dataclass
class SimilarityOutput:
distance: float
# TODO: Thesis add other proper fields


@dataclass
Expand Down
2 changes: 1 addition & 1 deletion tests/column2vec/test_column2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

SKIP_CLUSTERS = True
SKIP_SIMILAR = False
# alternative model
# MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens
MODEL = 'bert-base-nli-mean-tokens' #
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -32,7 +33,6 @@ def get_vectors(function, data):
result = {}
count = 1
for key in data:
# print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
'clean_up_tokenization_spaces': True}), key)
count += 1
Expand Down

0 comments on commit e09c45d

Please sign in to comment.