Skip to content

Commit 5ea367d

Browse files
#26 add basic runner
1 parent 832080a commit 5ea367d

File tree

5 files changed

+132
-0
lines changed

5 files changed

+132
-0
lines changed

similarityRunner/UI/__init__.py

Whitespace-only changes.

similarityRunner/UI/run_similarity.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import sys
2+
3+
from models.connector_models import ConnectorSettings
4+
from models.user_models import SimilaritySettings, MetadataSettings
5+
import runner as r
6+
7+
if __name__ == "__main__":
    # Command line entry point: run_similarity.py <directory> <run_type>
    #
    # Only the argument parsing is guarded, so an IndexError raised somewhere
    # inside the pipeline is never misreported as a missing argument.
    try:
        directory = sys.argv[1]
        run_type = sys.argv[2]  # all, metadata, similarity
    except IndexError:
        print("Add path to directory and run type (all, metadata, similarity)")
        sys.exit(1)

    # Needed only by this entry point, so it is imported locally.
    from models.user_models import ComparatorType

    # Every field of SimilaritySettings is required, so the model has to be
    # created with all of them at once; SimilaritySettings() without arguments
    # fails pydantic validation.
    settings = SimilaritySettings(
        connector=ConnectorSettings(file_type=("csv", "parquet"), files_paths=[], directory_paths=directory),
        metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True),
        run_type=run_type,
        # NOTE(review): the original never set a comparator type; BY_COLUMN is
        # chosen because it is the only fully configured comparator in runner.py
        # - confirm the intended default.
        comparator_type=ComparatorType.BY_COLUMN,
    )
    r.run(settings)
similarityRunner/interfaces/OutputFormaterInterface.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""
This module contains the abstract interface for output formatters
"""
4+
import abc
5+
6+
class OutputFormaterInterface(abc.ABC):
    """
    Abstract base for output formatters.

    The class cannot be instantiated directly; a concrete formater has to
    subclass it and implement every abstract method declared here.
    """

    @abc.abstractmethod
    def format(self, data: dict):
        """
        Turn the given dictionary into the formater's output representation.

        The base implementation does nothing and returns None.
        """

similarityRunner/models/user_models.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
"""
22
This module contains the user models
33
"""
4+
from enum import Enum, EnumType

from pydantic import BaseModel

from Comparator import Comparator
from ComparatorByColumn import ComparatorByColumn
from models.connector_models import ConnectorSettings
11+
712

813
class SimilarityOutput(BaseModel):
914
"""
@@ -13,3 +18,29 @@ class SimilarityOutput(BaseModel):
1318
# here will be common fields for all similarity models
1419
table_names: list[str]
1520
distances: dict[(str, str), float]
21+
22+
class MetadataSettings(BaseModel):
    """
    Boolean switches describing which metadata should be created.

    All four fields are required (none has a default value).
    """
    # read by runner.create_metadata: when true the complete metadata
    # (embeddings, structural types, column kinds) is created
    all: bool
    # presumably toggles the column kinds; not read by the visible code - TODO confirm
    kinds: bool
    # presumably toggles the structural types; not read by the visible code - TODO confirm
    types: bool
    # presumably toggles the column embeddings; not read by the visible code - TODO confirm
    embeddings: bool
30+
31+
class RunType(str, Enum):
    """
    RunType enumerates the parts of the pipeline that can be run.

    The enum derives from Enum (EnumType is only the metaclass of enums and
    subclassing it creates no members). The str mixin keeps every member equal
    to its plain string value, so comparisons such as run_type == "all" hold.
    """
    ALL = "all"
    METADATA = "metadata"
    SIMILARITY = "similarity"
35+
36+
class ComparatorType(Enum):
    """
    ComparatorType enumerates the available comparators.

    The enum derives from Enum (EnumType is only the metaclass of enums and
    subclassing it creates no members). The comparator instance of a member
    is available through its value attribute.
    """
    BY_COLUMN = ComparatorByColumn()
    BY_TYPE = Comparator()
39+
class SimilaritySettings(BaseModel):
    """
    Complete configuration of one similarity run.

    All four fields are required (none has a default value).
    """
    # handed to FilesystemConnector.get_data by runner.run
    connector: ConnectorSettings
    # selects which metadata runner.create_metadata creates
    metadata: MetadataSettings
    # selects the branch of runner.run: all, metadata or similarity
    run_type: RunType
    # selects the comparator used when computing similarity
    comparator_type: ComparatorType

similarityRunner/runner.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
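This module runs the similarity pipeline: it loads the data, creates metadata and computes similarity between tables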
3+
"""
4+
5+
from Comparator import Comparator
6+
from ComparatorByColumn import ComparatorByColumn, ColumnKindComparator, ColumnExactNamesComparator
7+
from DataFrameMetadata import DataFrameMetadata
8+
from DataFrameMetadataCreator import DataFrameMetadataCreator
9+
from connectors.filesystem_connector import FilesystemConnector
10+
from interfaces.OutputFormaterInterface import OutputFormaterInterface
11+
from models.connector_models import Output
12+
from models.user_models import SimilaritySettings
13+
14+
def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, DataFrameMetadata]:
    """
    Build the metadata of every table in the data.

    The result maps a table name to its metadata. It is empty unless
    settings.metadata.all is set.
    """
    tables, table_names = data

    if not settings.metadata.all:
        # todo after #35
        # todo save metadata after #35
        return {}

    def describe(table):
        # embeddings -> structural types -> column kinds, then read the result
        creator = DataFrameMetadataCreator(table)
        creator = creator.create_column_embeddings()
        creator = creator.compute_advanced_structural_types()
        creator = creator.compute_column_kind()
        return creator.get_metadata()

    # todo save metadata after #35
    return {table_name: describe(table) for table, table_name in zip(tables, table_names)}
32+
33+
34+
def __get_comparator(settings: SimilaritySettings):
    """
    Get comparator based on settings

    settings.comparator_type may be an enum member named BY_COLUMN or the
    plain string "BY_COLUMN"; both select the column comparator. Anything
    else falls back to the default Comparator.
    """
    requested = settings.comparator_type
    # An enum member never equals a plain string, so the member is reduced to
    # its name first; a plain string has no name attribute and is used as is.
    requested_name = getattr(requested, "name", requested)
    if requested_name == "BY_COLUMN":
        comp = ComparatorByColumn()
        return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator())
    # todo add by settings #35
    return Comparator()  # todo #35
43+
44+
def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]):
    """
    Compare every table with every table.

    Returns a dictionary keyed by (name, name2) for each ordered pair of
    tables, a table paired with itself included. The value is the result of
    the comparator selected by the settings.
    """
    comparator = __get_comparator(settings)
    return {
        (left_name, right_name): comparator.compare(left_meta, right_meta)
        for left_name, left_meta in data.items()
        for right_name, right_meta in data.items()
    }
54+
55+
def run(settings: SimilaritySettings, formater: OutputFormaterInterface | None = None):
    """
    Run the similarity pipeline

    The data is loaded with FilesystemConnector and the branch is chosen by
    settings.run_type:
      - "all": create metadata, compute similarity and return the result
      - "metadata": only create metadata, nothing is returned
      - "similarity": not implemented yet (todo after #35)

    formater is an optional concrete OutputFormaterInterface. When given, the
    similarity result is returned as formater.format(result); otherwise the
    raw similarity dictionary is returned.
    """
    data = FilesystemConnector().get_data(settings.connector)
    if settings.run_type == "all":
        print("Creating metadata ...")
        met = create_metadata(settings, data)
        print("Metadata created")
        print("Computing similarity ...")
        res = compute_similarity(settings, met)
        # OutputFormaterInterface is abstract and declares format(), so it can
        # be neither instantiated here nor asked for format_output().
        if formater is None:
            return res
        return formater.format(res)
    elif settings.run_type == "metadata":
        create_metadata(settings, data)
    elif settings.run_type == "similarity":
        print("Similarity")  # todo after #35

0 commit comments

Comments
 (0)