Merge pull request #30 from mathysgrapotte/csvhandlerpolar

suzannejin · web-flow · commit c182af8c9ca5 · 2024-03-13T11:54:48.000+01:00
Csvhandlerpolar
diff --git a/bin/requirements.txt b/bin/requirements.txt
@@ -3,4 +3,4 @@
 numpy==1.26.0
 pytorch-lightning==2.0.1
 scikit-learn==1.3.0
-pandas==2.0.3
+polars==0.20.15
diff --git a/bin/src/data/csv.py b/bin/src/data/csv.py
@@ -11,11 +11,20 @@
 The parser is a class that takes as input a CSV file and a experiment class that defines data types to be used, noising procedures, splitting etc. 
 """
 
-import pandas as pd
-from typing import Any, Tuple
+import polars as pl
+from typing import Any, Tuple, Union
+from functools import partial
 
+class CsvHandler:
+    """
+    Class for handling CSV files. #TODO add extensive description
+    """
 
-class CSVParser: # change to CsvHandler
+    def __init__(self, experiment: Any, csv_path: str) -> None:
+        self.experiment = experiment
+        self.csv_path = csv_path
+    
+class CsvLoader(CsvHandler): # change to CsvHandler
     """
     Class for parsing CSV files.
     
@@ -24,46 +33,32 @@ class CSVParser: # change to CsvHandler
     Then, one can get one or many items from the data, encoded.
     """
     
-    def __init__(self, experiment: Any, csv_path: str) -> None:
-        self.experiment = experiment
-        self.csv_path = csv_path
-        self.input, self.label, self.meta = self.parse_csv_to_input_label_meta(self.csv_path)
-        self.padding_value = self.find_padding_value(self.input)
-        
-    def parse_csv_to_input_label_meta(self, csv_path: str) -> Tuple[dict, dict, dict]:
+    def __init__(self, experiment: Any, csv_path: str, split: Union[int, None] = None) -> None:
+        super().__init__(experiment, csv_path)
+        if split is not None:
+            # if a split is given, the preferred load method is load_csv_per_split with the split argument pre-bound via partial
+            prefered_load_method = partial(self.load_csv_per_split, split=split)
+        else:
+            prefered_load_method = self.load_all_csv
+        self.input, self.label, self.meta = self.parse_csv_to_input_label_meta(self.csv_path, prefered_load_method)
+    
+    def load_all_csv(self, csv_path: str) -> pl.DataFrame:
         """
-        This function reads the csv file into a dictionary, 
-        and then parses each key with the form name:category:type 
-        into three dictionaries, one for each category [input, label, meta].
-        The keys of each new dictionary are in this form name:type.
+        Loads the csv file into a polars dataframe.
         """
-        # read csv file into a dictionary of lists
-        # the keys of the dictionary are the column names and the values are the column values
-        data = pd.read_csv(csv_path, dtype=str).to_dict(orient="list")
-        
-        # parse the dictionary into three dictionaries, one for each category [input, label, meta]
-        input_data, label_data, meta_data = {}, {}, {}
-        for key in data:
-            name, category, data_type = key.split(":")
-            if category.lower() == "input":
-                input_data[f"{name}:{data_type}"] = data[key]
-            elif category.lower() == "label":
-                label_data[f"{name}:{data_type}"] = data[key]
-            elif category.lower() == "meta":
-                meta_data[f"{name}:{data_type}"] = data[key]
-            else:
-                raise ValueError(f"Unknown category {category}, category (the second element of the csv column, seperated by ':') should be input, label or meta. The specified csv column is {key}.")
-        return input_data, label_data, meta_data
-
-    def find_padding_value(self, data: dict) -> int:
+        return pl.read_csv(csv_path)
+    
+    def load_csv_per_split(self, csv_path: str, split: int) -> pl.DataFrame:
         """
-        Find an integer that is not present in any of the lists of the data dictionary
+        Loads only the rows belonging to the given split: 0 is train, 1 is validation, 2 is test.
+        The split of each row is read from the column named "split:meta:int".
         """
-        i = 0
-        while True:
-            if i not in [item for sublist in data.values() for item in sublist]:
-                return i
-            i += 1
+        data = pl.read_csv(csv_path)
+        # check that the selected split value is present in the column split:meta:int
+        if split not in data["split:meta:int"].unique().to_list():
+            raise ValueError(f"The split value {split} is not present in the column split:meta:int. The available values are {data['split:meta:int'].unique().to_list()}")
+        
+        return data.filter(data["split:meta:int"] == split)
     
     def get_and_encode(self, dictionary: dict, idx: Any) -> dict:
         """
@@ -79,7 +74,7 @@ def get_and_encode(self, dictionary: dict, idx: Any) -> dict:
         """
         output = {}
         for key in dictionary: # processing each column
-            
+
             # get the name and data_type
             name = key.split(":")[0]
             data_type = key.split(":")[1]
@@ -97,31 +92,66 @@ def get_and_encode(self, dictionary: dict, idx: Any) -> dict:
             
             # encode the data at given index
             # For that, it first retrieves the data object and then calls the encode_all method to encode the data
-
-            
             output[name] = self.experiment.get_encoding_all(data_type)(dictionary[key][idx])
 
-    
         return output
     
-    def get_encoded_item(self, idx: Any) -> Tuple[dict, dict, dict]:
+    def __len__(self) -> int:
+        """
+        returns the length of the first list in input, assumes that all are the same length
+        """
+        return len(list(self.input.values())[0])
+    
+    def parse_csv_to_input_label_meta(self, csv_path: str, load_method: Any) -> Tuple[dict, dict, dict]:
+        """
+        This function reads the csv file into a dictionary, 
+        and then parses each key with the form name:category:type 
+        into three dictionaries, one for each category [input, label, meta].
+        The keys of each new dictionary are in this form name:type.
+        """
+        # read csv file into a dictionary of lists
+        # the keys of the dictionary are the column names and the values are the column values
+        data = load_method(csv_path).to_dict(as_series=False)
+        
+        # parse the dictionary into three dictionaries, one for each category [input, label, meta]
+        input_data, label_data, meta_data = {}, {}, {}
+        for key in data:
+            name, category, data_type = key.split(":")
+            if category.lower() == "input":
+                input_data[f"{name}:{data_type}"] = data[key]
+            elif category.lower() == "label":
+                label_data[f"{name}:{data_type}"] = data[key]
+            elif category.lower() == "meta":
+                meta_data[f"{name}:{data_type}"] = data[key]
+            else:
+                raise ValueError(f"Unknown category {category}, category (the second element of the csv column, seperated by ':') should be input, label or meta. The specified csv column is {key}.")
+        return input_data, label_data, meta_data
+    
+    def __getitem__(self, idx: Any) -> dict:
         """
         It gets the data at a given index, and encodes the input and label, leaving meta as it is.
         """
         x = self.get_and_encode(self.input, idx)
         y = self.get_and_encode(self.label, idx)
         return x, y, self.meta
     
-    def __len__(self) -> int:
+class CsvParser(CsvHandler):
+    """
+    Class for processing CSV data (saving, adding noise). The methods are placeholders and not implemented yet.
+    """
+
+    def __init__(self, experiment: Any, csv_path: str) -> None:
+        super().__init__(experiment, csv_path)  
+
+    def save(self, path: str) -> None:
         """
-        returns the length of the first list in input, assumes that all are the same length
+        Saves the data to a csv file.
         """
-        return len(list(self.input.values())[0])
-    
-    def __getitem__(self, idx: Any) -> dict:
+        pass
+
+    def noise(self, data):
         """
-        get a dictionary with all the keys for the data at a given index
+        Adds noise to the data.
         """
-        data = {**self.input, **self.label, **self.meta}
-        return { key: data[key][idx] for key in data }
+        pass
     
diff --git a/bin/src/data/encoding/encoders.py b/bin/src/data/encoding/encoders.py
@@ -77,8 +77,8 @@ def encode_all(self, data: Union[list, str]) -> np.array:
         Encodes the data, if the list is length one, call encode instead.
         It resturns a list with all the encoded data entries.
         """
-        # check if the data is a str, in that case it should use the encode sequence method
-        if isinstance(data, str):
+        # check if the data is not a list, in this case it should use the encode method
+        if not isinstance(data, list):
             return [self.encode(data)]
         else:
             return self.encode_multiprocess(data)
@@ -106,8 +106,8 @@ def encode_all(self, data: list) -> list:
         This method takes as input a list of data points, should be mappable to a single output. 
         """
 
-        # check if data is a string, in that case it should use the encode sequence method
-        if isinstance(data, str):
+        # check if data is not a list, in that case it should use the encode method
+        if not isinstance(data, list):
             return [self.encode(data)]
         else:
             return [float(d) for d in data]
diff --git a/bin/src/data/handlertorch.py b/bin/src/data/handlertorch.py
@@ -6,7 +6,7 @@
 import numpy as np
 from torch.utils.data import Dataset, DataLoader
 from torch.nn.utils.rnn import pad_sequence
-from .csv_parser import CSVParser
+from .csv import CsvLoader
 from typing import Any, Tuple
 
 class TorchDataset(Dataset):
@@ -15,7 +15,7 @@ class TorchDataset(Dataset):
     """
     def __init__(self, csvpath : str, experiment : Any) -> None:
         self.csvpath = csvpath
-        self.parser = CSVParser(experiment, csvpath)
+        self.parser = CsvLoader(experiment, csvpath)
 
     def convert_list_of_numpy_arrays_to_tensor(self, data: list) -> Tuple[torch.Tensor, torch.Tensor]:
         """
@@ -38,13 +38,13 @@ def convert_list_of_numpy_arrays_to_tensor(self, data: list) -> Tuple[torch.Tens
                 data = [torch.from_numpy(d) for d in data] # convert the np arrays to tensors
 
                 # pad sequences
-                padded_data = pad_sequence(data, batch_first=True, padding_value=self.parser.padding_value)
+                padded_data = pad_sequence(data, batch_first=True, padding_value=42)
 
                 # create a mask of the same shape as the padded data
                 mask = torch.zeros_like(padded_data)
 
                 # mask should have ones everywhere the data is not padded (so values are not 42)
-                mask[padded_data != self.parser.padding_value] = 1
+                mask[padded_data != 42] = 1
 
                 return padded_data, mask
 
@@ -65,7 +65,7 @@ def __len__(self) -> int:
         return len(self.parser)
 
     def __getitem__(self, idx: int) -> Tuple[dict, dict, dict]:
-        x, y, meta = self.parser.get_encoded_item(idx)
+        x, y, meta = self.parser[idx]
         # convert the content in the x and y directories to torch tensors
         x, x_mask = self.convert_dict_to_tensor(x)
         y, y_mask = self.convert_dict_to_tensor(y)
diff --git a/bin/tests/test_csv.py b/bin/tests/test_csv.py
@@ -1,22 +1,21 @@
-import numpy as np
-import numpy.testing as npt
 import unittest
 import os
-from bin.src.data.csv_parser import CSVParser
+from bin.src.data.csv import CsvLoader
 from bin.src.data.experiments import DnaToFloatExperiment
 
-class TestDnaToFloatCsvParser(unittest.TestCase):
+class TestDnaToFloatCsvLoader(unittest.TestCase):
 
     def setUp(self):
-        self.csv_parser = CSVParser(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test.csv"))
+        self.csv_loader = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test.csv"))
+        self.csv_loader_split = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test_with_split.csv"), split=0)
 
     def test_get_encoded_item_unique(self):
         """ 
-        It tests that the csv_parser.get_encoded_item works well when getting one item.
+        It tests that csv_loader.__getitem__ works well when getting one item.
         The following test is performed on the item at idx=0.
         """
         # get the encoded item from the csv file at idx 0
-        encoded_item = self.csv_parser.get_encoded_item(0)
+        encoded_item = self.csv_loader[0]
         
         # test that the encoded item is a tuple of three dictionaries [input, label, meta]
         self.assertEqual(len(encoded_item), 3)
@@ -41,12 +40,12 @@ def test_get_encoded_item_unique(self):
 
     def test_get_encoded_item_multiple(self):
         """
-        It tests that the csv_parser.get_encoded_item works well when getting multiple items using slice.
+        It tests that csv_loader.__getitem__ works well when getting multiple items using slice.
         The following test is performed on the item at idx=0 and idx=1.
         """
         
         # get the encoded items from the csv file at idx 0 and 1
-        encoded_item = self.csv_parser.get_encoded_item(slice(0, 2))
+        encoded_item = self.csv_loader[slice(0, 2)]
         
         # test that the encoded item is a tuple of three dictionaries [input, label, meta]
         self.assertEqual(len(encoded_item), 3)
@@ -70,5 +69,20 @@ def test_get_encoded_item_multiple(self):
             self.assertEqual(len(encoded_item[1][key]), 2)
 
     def test_len(self):
-        self.assertEqual(len(self.csv_parser), 2)
+        self.assertEqual(len(self.csv_loader), 2)
+
+    def test_load_with_split(self):
+        # try loading with different split values, should run with 0,1 and 2 and raise an error for other values
+        self.csv_loader_split = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test_with_split.csv"), split=0)
+        # self.csv_loader_split.input['hello:dna'] should have only one value
+        self.assertEqual(len(self.csv_loader_split.input['hello:dna']), 1)
+        # check that self.csv_loader_split.meta has only one value in the ['split:int'] column which is 0
+        self.assertEqual(len(self.csv_loader_split.meta['split:int']), 1)
+        self.assertEqual(self.csv_loader_split.meta['split:int'][0], 0)
+        self.csv_loader_split = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test_with_split.csv"), split=1)
+        self.csv_loader_split = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test_with_split.csv"), split=2)
+        with self.assertRaises(ValueError): # should raise an error
+            self.csv_loader_split = CsvLoader(DnaToFloatExperiment(), os.path.abspath("bin/tests/test_data/test_with_split.csv"), split=3)
+
+        
         
diff --git a/bin/tests/test_data/test_with_split.csv b/bin/tests/test_data/test_with_split.csv
@@ -0,0 +1,4 @@
+hello:input:dna,hola:label:float,split:meta:int
+ACTGACTGATCGATGC,12,0
+ACTGACTGATCGATGC,12,1
+ACTGACTGATCGATGC,12,2
diff --git a/bin/tests/test_handlertorch.py b/bin/tests/test_handlertorch.py
@@ -1,11 +1,9 @@
 import numpy as np
-import numpy.testing as npt
 import unittest
 import os
 import torch
 from bin.src.data.handlertorch import TorchDataset
 from bin.src.data.experiments import DnaToFloatExperiment
-from bin.src.data.csv_parser import CSVParser
 
 # initialize unittest class
 class TestDnaToFloatTorchDataset(unittest.TestCase):
@@ -32,7 +30,7 @@ def test_convert_dict_to_tensor_same_lengths(self):
         self.assertIsNone(mask_dict["hola"])
 
 
-        input_data = self.torchdataset_same_length.parser.get_encoded_item(slice(0, 2))
+        input_data = self.torchdataset_same_length.parser[slice(0, 2)]
         output_dict, mask_dict = self.torchdataset_same_length.convert_dict_to_tensor(input_data[0])
 
     def test_get_item_same_lenghts(self):