Skip to content

Commit e09c45d

Browse files
add suggestions from PR
1 parent bdcf6e2 commit e09c45d

File tree

8 files changed

+10
-11
lines changed

8 files changed

+10
-11
lines changed

column2vec/src/column2vec.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,6 @@ def get_cache(self, key: str, function: str) -> list | None:
6060
tmp = self.__cache.loc[function, key]
6161
if (tmp != "nan" and tmp is not int) or (tmp is int and not math.isnan(tmp)):
6262
return json.loads(tmp) # json is faster than ast
63-
# print(f"NO CACHE key: {key}, function: {function}")
6463
return None
6564

6665
def save(

similarity_framework/main.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,6 @@
2222

2323

2424
def create_metadata(data):
25-
"""
26-
This function creates metadata
27-
:return created metadata
28-
"""
2925
return (TypeMetadataCreator(data).compute_advanced_structural_types().compute_column_kind().compute_column_names_embeddings()).get_metadata()
3026

3127

@@ -43,14 +39,18 @@ def compare_datasets(path1, path2):
4339
metadata2 = create_metadata(data2)
4440
comparator_by_column = (
4541
ComparatorByColumn()
42+
## different option
4643
# .add_comparator_type(SizeComparatorByColumn())
4744
.add_comparator_type(IncompleteColumnsComparatorByColumn()).add_comparator_type(ColumnNamesEmbeddingsComparatorByColumn())
45+
## different option
4846
# .add_comparator_type(ColumnKindHandler())
4947
)
5048
compartor = (
5149
ComparatorByType()
50+
## different option
5251
# .add_comparator_type(SizeHandler())
5352
.add_comparator_type(IncompleteColumnsHandler())
53+
## different option
5454
# .add_comparator_type(KindHandler())
5555
.add_comparator_type(ColumnNamesEmbeddingsHandler())
5656
)

similarity_framework/src/impl/comparator/comparator_by_column.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -185,6 +185,7 @@ def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # tod
185185
:param embeddings2: values for column2
186186
:return: float from 0 to 1
187187
"""
188+
# alternative version
188189
# res = pd.DataFrame()
189190
# row_mins = []
190191
# for id1, embed1 in enumerate(embeddings1):
@@ -273,7 +274,7 @@ def compare_constants(
273274
value: float = 0 if metadata1.value == metadata2.value else 1
274275
else:
275276
value = 1 - cosine_sim(
276-
metadata1.value_embeddings[0], # todo 0 nebo 1
277+
metadata1.value_embeddings[0],
277278
metadata2.value_embeddings[0],
278279
)
279280
# if nulls are equal and exist

similarity_framework/src/impl/comparator/distance_functions.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ def compute(self, distance_table: pd.DataFrame) -> float:
3232
if distance_table.size == 0:
3333
return np.nan
3434
row_avg = distance_table.min(axis=1)
35+
# alternative add these lines
3536
# column_avg = distance_table.min(axis=0)
3637
# return min(row_avg.mean(), column_avg.mean())
3738
return row_avg.mean()

similarity_framework/src/impl/comparator/utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ def concat(*data_frames: pd.DataFrame) -> pd.DataFrame:
2020
return res.map(lambda x: x / len(data_frames))
2121

2222

23-
def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to functions.py?
23+
def cosine_sim(u: list | Tensor, v: list | Tensor) -> float:
2424
"""
2525
Compute cosine similarity (range 0 to 1): 1 means the same, 0 means completely different
2626
:param u: embeddings 1
@@ -30,7 +30,7 @@ def cosine_sim(u: list | Tensor, v: list | Tensor) -> float: # todo move to fun
3030
return round(
3131
np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)),
3232
3,
33-
) # todo change rounding to 4, 5 6 ...etc
33+
)
3434

3535

3636
def get_ratio(count1: int, count2: int) -> float:

similarity_framework/src/models/metadata.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,6 @@ def __init__(
114114
self.nulls = null_values
115115
self.value = value
116116
self.distribution = distribution
117-
# model.encode(list(value)).view(-1, 1)
118117
self.value_embeddings = None if type(value[0]) is not str else model.encode(list(value))
119118

120119
def __str__(self):

similarity_framework/src/models/similarity.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@ class SimilarityConfiguration:
1313
@dataclass
1414
class SimilarityOutput:
1515
distance: float
16-
# TODO: Thesis add other proper fields
1716

1817

1918
@dataclass

tests/column2vec/test_column2vec.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515

1616
SKIP_CLUSTERS = True
1717
SKIP_SIMILAR = False
18+
# alternative model
1819
# MODEL = 'all-mpnet-base-v2' # bert-base-nli-mean-tokens
1920
MODEL = 'bert-base-nli-mean-tokens' #
2021
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -32,7 +33,6 @@ def get_vectors(function, data):
3233
result = {}
3334
count = 1
3435
for key in data:
35-
# print("Processing column: " + key + " " + str(round((count / len(data)) * 100, 2)) + "%")
3636
result[key] = function(data[key], SentenceTransformer(MODEL, tokenizer_kwargs={
3737
'clean_up_tokenization_spaces': True}), key)
3838
count += 1

0 commit comments

Comments
 (0)