Merge pull request #25 from mathysgrapotte/add-protein-type

alessiovignoli · web-flow · commit fcc320e0f02e · 2024-03-08T18:23:50.000+01:00
Add protein type
diff --git a/bin/src/data/csv_parser.py b/bin/src/data/csv_parser.py
@@ -84,14 +84,20 @@ def get_and_encode(self, dictionary: dict, idx: Any) -> dict:
             name = key.split(":")[0]
             data_type = key.split(":")[1]
 
+            # get the data at the given index
+            # if the data is not a list, it is converted to a list
+            # otherwise it breaks Float().encode_all(data) because it expects a list
+            data = dictionary[key][idx]
+            if not isinstance(data, list):
+                data = [data]
+
             # check if 'data_type' is in the experiment class attributes
             if not hasattr(self.experiment, data_type.lower()):
                 raise ValueError(f"The data type {data_type} is not in the experiment class attributes. the column name is {key}, the available attributes are {self.experiment.__dict__}")
             
             # encode the data at given index
             # For that, it first retrieves the data object and then calls the encode_all method to encode the data
-            # BUG when there is only one element in the list, then we don't get one list anymore, but only the element. And this creates error at Float.encode_all() since here [np.array(float(d)) for d in data] data is only a string and not a list of strings.
-            output[name] = self.experiment.__getattribute__(data_type.lower()).encode_all(dictionary[key][idx])
+            output[name] = self.experiment.__getattribute__(data_type.lower()).encode_all(data)
     
         return output
     
diff --git a/bin/src/data/data_types/data_types.py b/bin/src/data/data_types/data_types.py
@@ -55,36 +55,83 @@ def encode(self, data: str, encoder: Literal['one_hot'] = 'one_hot') -> Any: #TO
         else:
             raise ValueError(f"Unknown encoder {encoder}")
 
-
     def encode_all(self, data: list, encoder: Literal['one_hot'] = 'one_hot') -> list[np.array]:
         if encoder == 'one_hot':
             return self.one_hot_encode_all(data)
         else:
             raise ValueError(f"Unknown encoder {encoder}")
 
+    def add_noise_uniform_text_masker(self, data: str, seed: float = None, **noise_params) -> str:
+        """
+        Adds noise to the data of a single input.
+        """
+        # get the probability param from noise_params, default value is set to 0.1
+        probability = noise_params.get("probability", 0.1)
+        return self.uniform_text_masker.add_noise(data, probability=probability, mask='N', seed=seed)
+    
+    def add_noise_uniform_text_masker_all_inputs(self, data: list, seed: float = None, **noise_params) -> list:
+        """
+        Adds noise to the data of multiple inputs.
+        """
+        # get the probability param from noise_params, default value is set to 0.1 
+        probability = noise_params.get("probability", 0.1)
+        return self.uniform_text_masker.add_noise_multiprocess(data, probability=probability, mask='N', seed=seed)
     
+
+class Prot(AbstractType):
+    """
+    class for dealing with protein data
+    """
+
+    def __init__(self, **parameters) -> None:
+        self.one_hot_encoder = encoders.TextOneHotEncoder(alphabet=parameters.get("one_hot_encoder_alphabet", "acdefghiklmnpqrstvwy"))
+        self.uniform_text_masker = noise_generators.UniformTextMasker()
+        
+    def one_hot_encode(self, data: str) -> np.array:
+        """
+        Encodes the data of a single input.
+        """
+        return self.one_hot_encoder.encode(data)
+
+    def one_hot_encode_all(self, data: list) -> list:
+        """
+        Encodes the data of multiple inputs.
+        """
+        return self.one_hot_encoder.encode_all(data)
+    
+    def encode(self, data: str, encoder: Literal['one_hot'] = 'one_hot') -> Any: #TODO call from get attribute instead of using if else
+        if encoder == 'one_hot':
+            return self.one_hot_encode(data)
+        else:
+            raise ValueError(f"Unknown encoder {encoder}")
+
+    def encode_all(self, data: list, encoder: Literal['one_hot'] = 'one_hot') -> list[np.array]:
+        if encoder == 'one_hot':
+            return self.one_hot_encode_all(data)
+        else:
+            raise ValueError(f"Unknown encoder {encoder}")
+
     def add_noise_uniform_text_masker(self, data: str, seed: float = None, **noise_params) -> str:
         """
         Adds noise to the data of a single input.
         """
         # get the probability param from noise_params, default value is set to 0.1
         probability = noise_params.get("probability", 0.1)
-        return self.uniform_text_masker.add_noise(data, probability=probability, seed=seed)
+        return self.uniform_text_masker.add_noise(data, probability=probability, mask='X', seed=seed)
     
     def add_noise_uniform_text_masker_all_inputs(self, data: list, seed: float = None, **noise_params) -> list:
         """
         Adds noise to the data of multiple inputs.
         """
         # get the probability param from noise_params, default value is set to 0.1 
         probability = noise_params.get("probability", 0.1)
-        return self.uniform_text_masker.add_noise_multiprocess(data, probability=probability, seed=seed)
+        return self.uniform_text_masker.add_noise_multiprocess(data, probability=probability, mask='X', seed=seed)
     
 
 class Float():
     """
     class for dealing with float data
     """
-    
     def __init__(self) -> None:
         self.gaussian_noise = noise_generators.GaussianNoise()
 
@@ -105,4 +152,4 @@ def encode(self, data: Any) -> float:
     
     def encode_all(self, data: list) -> list[np.array]:
         return [np.array(float(d)) for d in data]
-    
+    
diff --git a/bin/src/data/data_types/encoding/encoders.py b/bin/src/data/data_types/encoding/encoders.py
@@ -48,6 +48,9 @@ def encode_multiprocess(self, data: list) -> list:
 class TextOneHotEncoder(AbstractEncoder):
     """
     One hot encoder for text data.
+
+    NOTE that it will one-hot encode based on the alphabet.
+    If there is any character not included in the alphabet, that character will be represented by a vector of zeros.
     """
 
     def __init__(self, alphabet: str = "acgt") -> None:
@@ -57,6 +60,7 @@ def __init__(self, alphabet: str = "acgt") -> None:
     def _sequence_to_array(self, sequence: str) -> np.array:
         """
         This function transforms the given sequence to an array.
+        eg. 'abcd' -> array(['a'],['b'],['c'],['d'])
         """
         sequence_lower_case = sequence.lower()
         sequence_array = np.array(list(sequence_lower_case))
@@ -71,6 +75,7 @@ def encode(self, data: str) -> np.array:
     def encode_all(self, data: Union[list, str]) -> np.array:
         """
         Encodes the data, if the list is length one, call encode instead.
+        It returns a list with all the encoded data entries.
         """
         # check if the data is a str, in that case it should use the encode sequence method
         if isinstance(data, str):
diff --git a/bin/src/data/data_types/noise/noise_generators.py b/bin/src/data/data_types/noise/noise_generators.py
@@ -37,38 +37,33 @@ def add_noise_multiprocess(self, data: list, seed: float = None) -> list:
 
 class UniformTextMasker(AbstractNoiseGenerator):
     """
-    This noise generators replace characters with 'N' with a given probability.
+    This noise generator replaces characters with a masking character with a given probability.
     """
 
-
-    def add_noise(self, data: str, probability: float = 0.1, seed: float = None) -> str:
+    def add_noise(self, data: str, probability: float = 0.1, mask='N', seed: float = None) -> str:
         """
         Adds noise to the data.
         """
-
         np.random.seed(seed)
-        return ''.join([c if np.random.rand() > probability else 'N' for c in data])
+        return ''.join([c if np.random.rand() > probability else mask for c in data])
 
-    def add_noise_multiprocess(self, data: list, probability: float = 0.1, seed: float = None) -> list:
+    def add_noise_multiprocess(self, data: list, probability: float = 0.1, mask='N', seed: float = None) -> list:
         """
         Adds noise to the data using multiprocessing.
         """
-
         with mp.Pool(mp.cpu_count()) as pool:
-            function_specific_input = [(item, probability, seed) for item in data]
+            function_specific_input = [(item, probability, mask, seed) for item in data]
             return pool.starmap(self.add_noise, function_specific_input)
 
 class GaussianNoise(AbstractNoiseGenerator):
     """
     This noise generator adds gaussian noise to float values
     """
 
-
     def add_noise(self, data: float, mean: float = 0, std: float= 0, seed: float = None) -> float:
         """
         Adds noise to a single point of data.
         """
-
         np.random.seed(seed)
         return data + np.random.normal(mean, std)
     
@@ -77,6 +72,5 @@ def add_noise_multiprocess(self, data: list, mean: float = 0, std: float = 0, se
         Adds noise to the data using np arrays
         # TODO return a np array to gain performance.
         """
-
         np.random.seed(seed)
         return list(np.array(data) + np.random.normal(mean, std, len(data)))
diff --git a/bin/tests/test_data/test.csv b/bin/tests/test_data/test.csv
@@ -1,3 +1,3 @@
 hello:input:dna,hola:label:float
-ACTGACTGATCGATGC,5
-ACTGACTGATCGATGC,5
+ACTGACTGATCGATGC,12
+ACTGACTGATCGATGC,12
diff --git a/bin/tests/test_data_types.py b/bin/tests/test_data_types.py
@@ -1,25 +1,69 @@
 import numpy as np
 import numpy.testing as npt
 import unittest
-from bin.src.data.data_types.data_types import Dna
+from bin.src.data.data_types.data_types import Dna, Prot
 
 class TestDna(unittest.TestCase):
 
     def setUp(self):
         self.dna = Dna()
 
-    # test if the encode_all method runs with default arguments
     def test_encode_all(self):
-        # Test encoding a valid list of sequences
+        """
+        Test if the encode_all method runs with default arguments
+        """
+        # encode a list of sequences
         encoded_data_list = self.dna.encode_all(["ACGT", "AAA", "tt", "Bubba"])
+
+        # check that the encoding returns a list
         self.assertIsInstance(encoded_data_list, list)
-        # check if the length of the list is 4
-        self.assertEqual(len(encoded_data_list), 4)
+
+        # check if the arrays have the correct shape
+        self.assertEqual(encoded_data_list[0].shape, (4, 4))
+
+        # check we get the correct encoded arrays - first sequence
         correct_output = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
         npt.assert_array_equal(encoded_data_list[0], correct_output)
         
-    # test if the encode_all method returns an error when the specified encoder is not within the list of possible encoders
     def test_encode_all_error(self):
-        # Test encoding a valid list of sequences
+        """
+        Test if the encode_all method returns an error when the specified encoder is not within the list of possible encoders
+        """
+        with self.assertRaises(ValueError):
+            self.dna.encode_all(["ACGT", "AAA", "tt", "Bubba"], encoder="not_a_valid_encoder")
+
+
+class TestProt(unittest.TestCase):
+
+    def setUp(self):
+        self.prot = Prot()
+
+    def test_encode_all(self):
+        """
+        Test if the encode_all method runs with default arguments
+        acdefghiklmnpqrstvwy
+        """
+        # encode a list of sequences
+        encoded_data_list = self.prot.encode_all(["ACDE", "FFF", "gg", "uuu"])
+
+        # check that the encoding returns a list
+        self.assertIsInstance(encoded_data_list, list)
+
+        # check if the arrays have the correct shape
+        self.assertEqual(encoded_data_list[0].shape, (4, 20))
+
+        # check we get the correct encoded array - first sequence
+        correct_output = np.array([
+            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+            [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+            [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+            [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        ])
+        npt.assert_array_equal(encoded_data_list[0], correct_output)
+        
+    def test_encode_all_error(self):
+        """
+        Test if the encode_all method returns an error when the specified encoder is not within the list of possible encoders
+        """
         with self.assertRaises(ValueError):
-            self.dna.encode_all(["ACGT", "AAA", "tt", "Bubba"], encoder="not_a_valid_encoder")
+            self.prot.encode_all(["ACDE", "FFF", "gg", "uuu"], encoder="not_a_valid_encoder")