Skip to content

Commit 832080a

Browse files
#23 Add tests and improve filesystem connector
1 parent: b271786 · commit: 832080a

File tree

5 files changed: 86 additions (+86) and 16 deletions (-16).

similarityRunner/connectors/filesystem_connector.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -5,24 +5,21 @@
55

66
from functionsRunner import load_files_from_list
77
from interfaces.ConnectorInterface import ConnectorInterface
8-
from models.connector_models import ConnectorSettings, Output, ConnectorOutput, FSConnectorSettings
8+
from models.connector_models import Output, ConnectorOutput, FSConnectorSettings
99

1010

1111
class FilesystemConnector(ConnectorInterface):
12-
def __init__(self, config):
13-
self.config = config
1412

1513
def _connect_and_load_data_source(self, settings: FSConnectorSettings) -> ConnectorOutput:
1614
file_list = settings.files_paths
1715
for folder in settings.directory_paths:
18-
file_list = file_list + [folder + "/" + s for s in os.listdir(folder)]
16+
file_list = file_list + [os.path.join(folder, s) for s in os.listdir(folder)]
1917

20-
names, tables = load_files_from_list(os.listdir(file_list), settings.file_type)
18+
tables, names = load_files_from_list(file_list, settings.file_type)
2119
return ConnectorOutput(names=names, tables=tables)
2220

2321
def _format_data(self, data: ConnectorOutput) -> Output:
24-
pass
22+
return data.tables, data.names
2523

2624
def close(self):
2725
pass
28-

similarityRunner/functionsRunner.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,9 @@
33
from models.connector_models import FileType
44

55

6-
def load_files_from_list(folder: list[str], file_type: tuple[FileType] = FileType.CSV) -> tuple[list[pd.DataFrame], list[str]]:
6+
def load_files_from_list(folder: list[str], file_type: tuple = (FileType.CSV, )) -> tuple[list[pd.DataFrame], list[str]]:
77
"""
8-
it loads cvs files from folder and returns list of loaded dataframe and list of names
9-
:param folder: from which we load the files
10-
:param file_type: type of file, csv, parquet, etc.
11-
:return: two lists
8+
129
"""
1310
data = []
1411
names = []
@@ -20,3 +17,8 @@ def load_files_from_list(folder: list[str], file_type: tuple[FileType] = FileTyp
2017
data.append(pd.read_parquet(file))
2118
names.append(file.replace(".parquet", ""))
2219
return data, names
20+
21+
def csv_to_parquet(file: str):
22+
df = pd.read_csv(file)
23+
df.to_parquet(file.replace(".csv", ".parquet"))
24+
return file.replace(".csv", ".parquet")

similarityRunner/models/connector_models.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99
from pydantic import BaseModel
1010

11-
Output = pd.DataFrame
11+
Output = tuple[list[pd.DataFrame], list[str]]
1212

1313
class FileType(Enum):
1414
CSV = "csv"
@@ -20,17 +20,23 @@ class ConnectorSettings(BaseModel):
2020
ConnectorSettings class is a base class for connector settings.
2121
"""
2222

23-
# here will be common fields for all connectors
2423
file_type: tuple[FileType] # csv, parquet, etc., tuple for immutability
24+
class Config:
25+
# arbitrary_types_allowed is set to True to allow tuple FileType
26+
arbitrary_types_allowed = True
2527

2628

2729
class ConnectorOutput(BaseModel):
2830
"""
2931
ConnectorOutput class is a base class for connector output.
3032
"""
3133
names: list[str]
32-
tables: tuple[list[pd.DataFrame]]
33-
# here will be common fields for all connectors
34+
tables: list[pd.DataFrame]
35+
36+
class Config:
37+
# arbitrary_types_allowed is set to True to allow list of pandas DataFrames
38+
arbitrary_types_allowed = True
39+
3440

3541
class FSConnectorSettings(ConnectorSettings):
3642
"""

test/test_connectors.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
import unittest
2+
3+
from connectors.filesystem_connector import FilesystemConnector
4+
from models.connector_models import FSConnectorSettings
5+
6+
7+
class TestFileSystemConnector(unittest.TestCase):
8+
def test_get_data_files(self):
9+
connector = FilesystemConnector()
10+
settings = FSConnectorSettings(files_paths=["../data/netflix_titles.csv",
11+
"../data/disney_movies.csv"],
12+
directory_paths=[],
13+
file_type=("csv",))
14+
data, names = connector.get_data(settings)
15+
self.assertEqual(len(data), 2)
16+
self.assertEqual(names[0], "../data/netflix_titles")
17+
self.assertEqual(names[1], "../data/disney_movies")
18+
19+
def test_get_data_folder(self):
20+
connector = FilesystemConnector()
21+
settings = FSConnectorSettings(files_paths=[],
22+
directory_paths=["../data"],
23+
file_type=("csv",))
24+
data, _ = connector.get_data(settings)
25+
self.assertEqual(len(data), 11)

test/test_functions_runner.py

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,40 @@
1+
import unittest
2+
3+
from functionsRunner import load_files_from_list, csv_to_parquet
4+
from models.connector_models import FileType
5+
6+
7+
class TestLoadFilesFromList(unittest.TestCase):
8+
def test_load_csv_file(self):
9+
data, names = load_files_from_list(["../data/netflix_titles.csv"], (FileType.CSV, ))
10+
self.assertEqual(len(data), 1)
11+
self.assertEqual(names[0], "../data/netflix_titles")
12+
13+
def test_load_csv_files(self):
14+
data, names = load_files_from_list(["../data/netflix_titles.csv", "../data/disney_movies.csv"], (FileType.CSV, ))
15+
self.assertEqual(len(data), 2)
16+
self.assertEqual(names[0], "../data/netflix_titles")
17+
self.assertEqual(names[1], "../data/disney_movies")
18+
19+
20+
def test_load_parquet_file(self):
21+
csv_to_parquet("../data/netflix_titles.csv")
22+
data, names = load_files_from_list(["../data/netflix_titles.parquet"], (FileType.PARQUET, ))
23+
self.assertEqual(len(data), 1)
24+
self.assertEqual(names[0], "../data/netflix_titles")
25+
26+
def test_load_parquet_files(self):
27+
csv_to_parquet("../data/netflix_titles.csv")
28+
csv_to_parquet("../data/disney_movies.csv")
29+
data, names = load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.parquet"], (FileType.PARQUET, ))
30+
self.assertEqual(len(data), 2)
31+
self.assertEqual(names[0], "../data/netflix_titles")
32+
self.assertEqual(names[1], "../data/disney_movies")
33+
34+
35+
def test_load_csv_and_parquet_files(self):
36+
csv_to_parquet("../data/netflix_titles.csv")
37+
data, names = load_files_from_list(["../data/netflix_titles.parquet", "../data/disney_movies.csv"], (FileType.PARQUET, FileType.CSV))
38+
self.assertEqual(len(data), 2)
39+
self.assertEqual(names[0], "../data/netflix_titles")
40+
self.assertEqual(names[1], "../data/disney_movies")

0 commit comments

Comments
 (0)