Skip to content

Commit

Permalink
Add runner
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Oct 2, 2024
1 parent 6241574 commit 8526550
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 30 deletions.
6 changes: 3 additions & 3 deletions similarity/ComparatorByColumn.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,10 +226,10 @@ def compute_embeddings_distance(self, embeddings1, embeddings2) -> float: # tod
"""
res = pd.DataFrame()
row_mins = []
for id1, embed1 in enumerate(embeddings1.items()):
for id2, embed2 in enumerate(embeddings2.items()):
for id1, embed1 in enumerate(embeddings1):
for id2, embed2 in enumerate(embeddings2):
res.loc[id1, id2] = 1 - cosine_sim(embed1, embed2)
row_mins.append(min(res[id1]))
row_mins.append(res.loc[id1].min())
column_mins = []
for _, column in res.items():
column_mins.append(min(column))
Expand Down
30 changes: 19 additions & 11 deletions similarityRunner/UI/run_similarity.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
import sys

from models.connector_models import ConnectorSettings
from models.user_models import SimilaritySettings, MetadataSettings

from models.connector_models import FSConnectorSettings, FileType as ft
from models.user_models import SimilaritySettings, MetadataSettings, ComparatorType as ct, RunType
import runner as r

if __name__ == "__main__":
def get_arg(index, message):
try:
directory = sys.argv[1]
run_type = sys.argv[2] # all, metadata, similarity
settings = SimilaritySettings()
settings.connector = ConnectorSettings(file_type=("csv", "parquet"), files_paths=[], directory_paths=directory)
settings.metadata = MetadataSettings(all=True, kinds=True, types=True, embeddings=True)
settings.run_type = run_type
r.run(settings)
return sys.argv[index]
except IndexError:
print("Add path to directory")
print(message)
sys.exit(1)

if __name__ == "__main__":
    # Positional CLI arguments: <directory> <run_type> <comparator_type>.
    # get_arg prints the given message and exits with status 1 when the
    # argument is missing.
    directory = get_arg(1, "Add path to directory")
    run_type = get_arg(2, "Add run type, all metadata, similarity")  # all, metadata, similarity
    comparator_type = get_arg(3, "Add comparator type: by_column, by_type ")  # by_column, by_type

    # Reject unknown values up front: previously any comparator other than
    # "by_column" (including typos) silently fell back to BY_TYPE, and an
    # unknown run type surfaced as a raw ValueError traceback.
    try:
        parsed_run_type = RunType(run_type)
    except ValueError:
        print("Add run type, all metadata, similarity")
        sys.exit(1)

    comparators = {"by_column": ct.BY_COLUMN, "by_type": ct.BY_TYPE}
    if comparator_type not in comparators:
        print("Add comparator type: by_column, by_type ")
        sys.exit(1)

    settings = SimilaritySettings(
        connector=FSConnectorSettings(
            file_type=(ft.CSV, ft.PARQUET),
            files_paths=[],
            directory_paths=[directory],
        ),
        metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True),
        run_type=parsed_run_type,
        comparator_type=comparators[comparator_type],
    )
    result = r.run(settings)
    print(result)
8 changes: 7 additions & 1 deletion similarityRunner/models/connector_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class ConnectorSettings(BaseModel):
ConnectorSettings class is a base class for connector settings.
"""

file_type: tuple[FileType] # csv, parquet, etc., tuple for immutability
file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability
class Config:
# arbitrary_types_allowed is set to True to allow tuple FileType
arbitrary_types_allowed = True
Expand All @@ -44,3 +44,9 @@ class FSConnectorSettings(ConnectorSettings):
"""
files_paths: list[str]
directory_paths: list[str]

class S3ConnectorSettings(ConnectorSettings):
    """
    Connector settings for an S3 source.

    Currently declares no fields of its own beyond those of ConnectorSettings.
    """
17 changes: 13 additions & 4 deletions similarityRunner/models/user_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
This module contains the user models
"""
from enum import EnumType
from enum import Enum

from pydantic import BaseModel

Expand All @@ -12,13 +12,17 @@

class SimilarityOutput(BaseModel):
"""
SimilarityOutput class isclass containing similarity output.
SimilarityOutput class is class containing similarity output.
"""

# here will be common fields for all similarity models
table_names: list[str]
distances: dict[(str, str), float]

class Config:
# arbitrary_types_allowed is set to True to allow list and dictionary
arbitrary_types_allowed = True

class MetadataSettings(BaseModel):
"""
MetadataSettings class is a base class for metadata settings.
Expand All @@ -28,14 +32,15 @@ class MetadataSettings(BaseModel):
types: bool
embeddings: bool

class RunType(EnumType):
class RunType(str, Enum):
ALL = "all"
METADATA = "metadata"
SIMILARITY = "similarity"

class ComparatorType(EnumType):
class ComparatorType(Enum):
BY_COLUMN = ComparatorByColumn()
BY_TYPE = Comparator()

class SimilaritySettings(BaseModel):
"""
SimilaritySettings class is a base class for similarity settings.
Expand All @@ -44,3 +49,7 @@ class SimilaritySettings(BaseModel):
metadata: MetadataSettings
run_type: RunType
comparator_type: ComparatorType

class Config:
# arbitrary_types_allowed is set to True to allow Enum Types
arbitrary_types_allowed = True
33 changes: 22 additions & 11 deletions similarityRunner/runner.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""
This module contains the runner that creates metadata and computes similarity between tables
"""
import time

from Comparator import Comparator
from Comparator import Comparator, KindComparator, ColumnExactNamesComparator as ExactNames
from ComparatorByColumn import ComparatorByColumn, ColumnKindComparator, ColumnExactNamesComparator
from DataFrameMetadata import DataFrameMetadata
from DataFrameMetadataCreator import DataFrameMetadataCreator
from connectors.filesystem_connector import FilesystemConnector
from interfaces.OutputFormaterInterface import OutputFormaterInterface
from formators.jason_formater import JsonFormater
from main import BY_COLUMN
from models.connector_models import Output
from models.user_models import SimilaritySettings
from models.user_models import SimilaritySettings, ComparatorType


def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, DataFrameMetadata]:
"""
Expand All @@ -35,21 +38,24 @@ def __get_comparator(settings: SimilaritySettings):
"""
Get comparator based on settings
"""
if settings.comparator_type == "BY_COLUMN":
if settings.comparator_type == ComparatorType.BY_COLUMN:
comp = ComparatorByColumn()
return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator())
# todo add by settings #35
return Comparator() # todo #35
else:
comp = Comparator() # todo add by settings #35
return comp.add_comparator_type(KindComparator()).add_comparator_type(ExactNames())

def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]):
"""
Compute similarity between tables
"""
comparator = __get_comparator(settings)
similarity = {}
for name, met in data.items():
for name2, met2 in data.items():
similarity[(name, name2)] = comparator.compare(met, met2)
names = list(data.keys())
similarity = {
name: {name2: comparator.compare(data[name], data[name2]) for name2 in names}
for name in names
}
return similarity

def run(settings: SimilaritySettings):
Expand All @@ -58,12 +64,17 @@ def run(settings: SimilaritySettings):
"""
data = FilesystemConnector().get_data(settings.connector)
if settings.run_type == "all":
start = time.time()
print("Creating metadata ...")
met = create_metadata(settings, data)
print("Metadata created")
end = time.time()
print("Metadata created in", end - start, "s")
print("Computing similarity ...")
start = time.time()
res = compute_similarity(settings, met)
return OutputFormaterInterface().format_output(res)
end = time.time()
print("Similarity computed in", end - start, "s")
return JsonFormater().format(res)
elif settings.run_type == "metadata":
create_metadata(settings, data)
elif settings.run_type == "similarity":
Expand Down

0 comments on commit 8526550

Please sign in to comment.