Skip to content

Commit 8526550

Browse files
Add runner
1 parent 6241574 commit 8526550

File tree

5 files changed

+64
-30
lines changed

5 files changed

+64
-30
lines changed

similarity/ComparatorByColumn.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,10 @@ def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # tod
226226
"""
227227
res = pd.DataFrame()
228228
row_mins = []
229-
for id1, embed1 in enumerate(embeddings1.items()):
230-
for id2, embed2 in enumerate(embeddings2.items()):
229+
for id1, embed1 in enumerate(embeddings1):
230+
for id2, embed2 in enumerate(embeddings2):
231231
res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2)
232-
row_mins.append(min(res[id1]))
232+
row_mins.append(res.loc[id1].min())
233233
column_mins = []
234234
for _, column in res.items():
235235
column_mins.append(min(column))

similarityRunner/UI/run_similarity.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
11
import sys
22

3-
from models.connector_models import ConnectorSettings
4-
from models.user_models import SimilaritySettings, MetadataSettings
3+
4+
from models.connector_models import FSConnectorSettings, FileType as ft
5+
from models.user_models import SimilaritySettings, MetadataSettings, ComparatorType as ct, RunType
56
import runner as r
67

7-
if __name__ == "__main__":
8+
def get_arg(index, message):
89
try:
9-
directory = sys.argv[1]
10-
run_type = sys.argv[2] # all, metadata, similarity
11-
settings = SimilaritySettings()
12-
settings.connector = ConnectorSettings(file_type=("csv", "parquet"), files_paths=[], directory_paths=directory)
13-
settings.metadata = MetadataSettings(all=True, kinds=True, types=True, embeddings=True)
14-
settings.run_type = run_type
15-
r.run(settings)
10+
return sys.argv[index]
1611
except IndexError:
17-
print("Add path to directory")
12+
print(message)
13+
sys.exit(1)
14+
15+
if __name__ == "__main__":
16+
directory = get_arg(1,"Add path to directory")
17+
run_type = get_arg(2,"Add run type, all metadata, similarity") # all, metadata, similarity
18+
comparator_type = get_arg(3,"Add comparator type: by_column, by_type ") # by_column, by_type
19+
settings = SimilaritySettings(connector=FSConnectorSettings(file_type=(ft.CSV, ft.PARQUET), files_paths=[], directory_paths=[directory]),
20+
metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True),
21+
run_type=RunType(run_type),
22+
comparator_type=ct.BY_COLUMN if comparator_type == "by_column" else ct.BY_TYPE
23+
)
24+
result = r.run(settings)
25+
print(result)

similarityRunner/models/connector_models.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class ConnectorSettings(BaseModel):
2020
ConnectorSettings class is a base class for connector settings.
2121
"""
2222

23-
file_type: tuple[FileType] # csv, parquet, etc., tuple for immutability
23+
file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability
2424
class Config:
2525
# arbitrary_types_allowed is set to True to allow tuple FileType
2626
arbitrary_types_allowed = True
@@ -44,3 +44,9 @@ class FSConnectorSettings(ConnectorSettings):
4444
"""
4545
files_paths: list[str]
4646
directory_paths: list[str]
47+
48+
class S3ConnectorSettings(ConnectorSettings):
49+
"""
50+
S3ConnectorSettings class is a derived class for S3 connector settings.
51+
"""
52+
pass

similarityRunner/models/user_models.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
This module contains the user models
33
"""
4-
from enum import EnumType
4+
from enum import Enum
55

66
from pydantic import BaseModel
77

@@ -12,13 +12,17 @@
1212

1313
class SimilarityOutput(BaseModel):
1414
"""
15-
SimilarityOutput class isclass containing similarity output.
15+
SimilarityOutput class is a class containing similarity output.
1616
"""
1717

1818
# here will be common fields for all similarity models
1919
table_names: list[str]
2020
distances: dict[(str, str), float]
2121

22+
class Config:
23+
# arbitrary_types_allowed is set to True to allow list and dictionary
24+
arbitrary_types_allowed = True
25+
2226
class MetadataSettings(BaseModel):
2327
"""
2428
MetadataSettings class is a base class for metadata settings.
@@ -28,14 +32,15 @@ class MetadataSettings(BaseModel):
2832
types: bool
2933
embeddings: bool
3034

31-
class RunType(EnumType):
35+
class RunType(str, Enum):
3236
ALL = "all"
3337
METADATA = "metadata"
3438
SIMILARITY = "similarity"
3539

36-
class ComparatorType(EnumType):
40+
class ComparatorType(Enum):
3741
BY_COLUMN = ComparatorByColumn()
3842
BY_TYPE = Comparator()
43+
3944
class SimilaritySettings(BaseModel):
4045
"""
4146
SimilaritySettings class is a base class for similarity settings.
@@ -44,3 +49,7 @@ class SimilaritySettings(BaseModel):
4449
metadata: MetadataSettings
4550
run_type: RunType
4651
comparator_type: ComparatorType
52+
53+
class Config:
54+
# arbitrary_types_allowed is set to True to allow Enum Types
55+
arbitrary_types_allowed = True

similarityRunner/runner.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
"""
22
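This module contains the runner, which creates metadata for the loaded data and computes similarity between tables.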
33
"""
4+
import time
45

5-
from Comparator import Comparator
6+
from Comparator import Comparator, KindComparator, ColumnExactNamesComparator as ExactNames
67
from ComparatorByColumn import ComparatorByColumn, ColumnKindComparator, ColumnExactNamesComparator
78
from DataFrameMetadata import DataFrameMetadata
89
from DataFrameMetadataCreator import DataFrameMetadataCreator
910
from connectors.filesystem_connector import FilesystemConnector
10-
from interfaces.OutputFormaterInterface import OutputFormaterInterface
11+
from formators.jason_formater import JsonFormater
12+
from main import BY_COLUMN
1113
from models.connector_models import Output
12-
from models.user_models import SimilaritySettings
14+
from models.user_models import SimilaritySettings, ComparatorType
15+
1316

1417
def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, DataFrameMetadata]:
1518
"""
@@ -35,21 +38,24 @@ def __get_comparator(settings: SimilaritySettings):
3538
"""
3639
Get comparator based on settings
3740
"""
38-
if settings.comparator_type == "BY_COLUMN":
41+
if settings.comparator_type == ComparatorType.BY_COLUMN:
3942
comp = ComparatorByColumn()
4043
return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator())
4144
# todo add by settings #35
42-
return Comparator() # todo #35
45+
else:
46+
comp = Comparator() # todo add by settings #35
47+
return comp.add_comparator_type(KindComparator()).add_comparator_type(ExactNames())
4348

4449
def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]):
4550
"""
4651
Compute similarity between tables
4752
"""
4853
comparator = __get_comparator(settings)
49-
similarity = {}
50-
for name, met in data.items():
51-
for name2, met2 in data.items():
52-
similarity[(name, name2)] = comparator.compare(met, met2)
54+
names = list(data.keys())
55+
similarity = {
56+
name: {name2: comparator.compare(data[name], data[name2]) for name2 in names}
57+
for name in names
58+
}
5359
return similarity
5460

5561
def run(settings: SimilaritySettings):
@@ -58,12 +64,17 @@ def run(settings: SimilaritySettings):
5864
"""
5965
data = FilesystemConnector().get_data(settings.connector)
6066
if settings.run_type == "all":
67+
start = time.time()
6168
print("Creating metadata ...")
6269
met = create_metadata(settings, data)
63-
print("Metadata created")
70+
end = time.time()
71+
print("Metadata created in", end - start, "s")
6472
print("Computing similarity ...")
73+
start = time.time()
6574
res = compute_similarity(settings, met)
66-
return OutputFormaterInterface().format_output(res)
75+
end = time.time()
76+
print("Similarity computed in", end - start, "s")
77+
return JsonFormater().format(res)
6778
elif settings.run_type == "metadata":
6879
create_metadata(settings, data)
6980
elif settings.run_type == "similarity":

0 commit comments

Comments
 (0)