Add add_split method inside CsvProcessing and corresponding tests #37

Merged · 3 commits · Mar 18, 2024
5 changes: 5 additions & 0 deletions bin/launch_csv_handling.py
@@ -20,6 +20,11 @@ def get_args():


def main(data_csv, config_json):
"""
This launcher is the connection between the csv data and one json configuration.
It should also handle some sanity checks.
TODO: add a check that raises an error when split is defined neither in the json config nor in the csv data.
"""

print(data_csv, config_json)

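A possible sketch of that TODO check (the helper name and signature are illustrative, not part of this PR; it assumes the parsed json config is a dict and that csv columns follow the name:category:type convention used in this codebase):

def check_split_defined(config: dict, csv_columns: list) -> None:
    # Raise when neither the json config nor the csv data defines a split.
    csv_has_split = any(col.split(':')[1] == 'split' for col in csv_columns)
    if 'split' not in config and not csv_has_split:
        raise ValueError("No split found: define a 'split' entry in the json config or provide a split column in the csv data.")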
34 changes: 29 additions & 5 deletions bin/src/data/csv.py
@@ -11,6 +11,7 @@
The parser is a class that takes as input a CSV file and an experiment class that defines the data types to be used, noising procedures, splitting, etc.
"""

import numpy as np
import polars as pl
from typing import Any, Tuple, Union
from functools import partial
@@ -74,15 +75,38 @@ class CsvProcessing(CsvHandler):
"""
Class to load the input csv data and add noise accordingly.
"""

def __init__(self, experiment: Any, csv_path: str) -> None:
super().__init__(experiment, csv_path)
self.data = self.load_csv()


def add_split(self, split_method: str, split: list, seed: float = None, force: bool = False) -> None:
Contributor: Name doesn't feel too intuitive for me.

"""
Add a column specifying the train, validation, test splits of the data.
An error is raised if the split column is already present in the csv file. This behaviour can be overridden by setting force=True.

args:
split_method (str) : The method to split the data, should be one of the keys of the split dictionary in the experiment class.
split (list) : The proportions for [train, validation, test] splits.
seed (float) : The seed for reproducibility.
force (bool) : If True, the split column will be added even if it is already present in the csv file.
"""
if ('split' in self.categories) and (not force):
raise ValueError("The category split is already present in the csv file. If you still want to use this function, set force=True.")

Contributor: We discussed that a split column already present in the csv should raise a warning, not an error. I will merge as is, but this should be modified in the future.

# get the indices for train, validation and test using the specified split method
train, validation, test = self.experiment.get_function_split(split_method)(len(self.data), split, seed)

# add the split column to the data
split_column = np.full(len(self.data), np.nan)
split_column[train] = 0
split_column[validation] = 1
split_column[test] = 2
self.data = self.data.with_columns(pl.Series('split:split:int', split_column))

def add_noise(self, configs: list) -> None:
"""
Adds noise to the data.
Noise is added for each column with the configurations specified in the configs list.
Noise is added for each column with the specified configurations.
"""
# for each column configuration
for dictionary in configs:
@@ -96,11 +120,11 @@ def add_noise(self, configs: list) -> None:
# change the column with the new values
self.data = self.data.with_columns(pl.Series(key, new_column))

def save(self, path: str) -> None:
def save(self, data: pl.DataFrame, path: str) -> None:
"""
Saves the data to a csv file.
"""
self.data.write_csv(path)
data.write_csv(path)


class CsvLoader(CsvHandler):
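A minimal usage sketch of the new add_split method (the csv path and split proportions are illustrative; note that save now takes the dataframe to write explicitly):

from bin.src.data.csv import CsvProcessing
from bin.src.data.experiments import DnaToFloatExperiment

csv_processing = CsvProcessing(DnaToFloatExperiment(), 'path/to/data.csv')
csv_processing.add_split(split_method='RandomSplitter', split=[0.6, 0.2, 0.2], seed=0)
csv_processing.save(csv_processing.data, 'path/to/data_with_split.csv')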
14 changes: 7 additions & 7 deletions bin/src/data/experiments.py
@@ -23,13 +23,6 @@ class AbstractExperiment(ABC):
def __init__(self, seed: float = None) -> None:
# allow ability to add a seed for reproducibility
self.seed = seed


def get_split_indexes(self, data: list, split: tuple) -> list | list | list:
"""
Returns the indexes of the split data.
"""
raise NotImplementedError

def get_function_encode_all(self, data_type: str) -> Any:
"""
@@ -42,6 +35,12 @@ def get_function_noise_all(self, data_type: str, noise_generator: str) -> Any:
This method adds noise to all the entries.
"""
return getattr(self, data_type)['noise_generators'][noise_generator].add_noise_all

def get_function_split(self, split_method: str) -> Any:
"""
This method returns the function for splitting the data.
"""
return self.split[split_method].get_split_indexes


class DnaToFloatExperiment(AbstractExperiment):
@@ -52,6 +51,7 @@ def __init__(self) -> None:
super().__init__()
self.dna = {'encoder': encoders.TextOneHotEncoder(alphabet='acgt'), 'noise_generators': {'UniformTextMasker': noise_generators.UniformTextMasker(mask='N')}}
self.float = {'encoder': encoders.FloatEncoder(), 'noise_generators': {'GaussianNoise': noise_generators.GaussianNoise()}}
self.split = {'RandomSplitter': spliters.RandomSplitter()}


class ProtDnaToFloatExperiment(DnaToFloatExperiment):
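For context, the new get_function_split dispatch can be exercised roughly like this (a sketch under the assumptions of this diff, not code from the PR):

experiment = DnaToFloatExperiment()
split_fn = experiment.get_function_split('RandomSplitter')  # bound RandomSplitter().get_split_indexes
train, validation, test = split_fn(100, [0.6, 0.2, 0.2], 0)  # (length_of_data, split, seed)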
75 changes: 47 additions & 28 deletions bin/src/data/spliters/spliters.py
@@ -11,48 +11,67 @@ class AbstractSplitter(ABC):
"""
Abstract class for splitters.
"""
def __init__(self, seed: float = 0) -> None:
# allow ability to add a seed for reproducibility
if seed != 0:
np.random.seed(seed)

@abstractmethod
def split(self, data: list) -> list:
def get_split_indexes(self, length_of_data: int, split: list, seed: float = None) -> tuple[list, list, list]:
"""
Splits the data. Always returns lists of indices mapping into the original data.
"""
raise NotImplementedError

@abstractmethod
def distance(self, data_one: Any, data_two: Any) -> float:
"""
Calculates the distance between two elements of the data.
"""
raise NotImplementedError
# @abstractmethod
# def distance(self, data_one: Any, data_two: Any) -> float:
# """
# Calculates the distance between two elements of the data.
# """
# raise NotImplementedError


class RandomSplitter(AbstractSplitter):
"""
This splitter randomly splits the data.
"""

def __init__(self, seed: float = 0) -> None:
super().__init__(seed=seed)
def __init__(self) -> None:
super().__init__()

def split(self, length_of_data: int, split: tuple) -> list | list | list:
def get_split_indexes(self, length_of_data: int, split: list, seed: float = None) -> tuple[list, list, list]:
"""
Randomly splits the data in three lists according to the split tuple, the split tuple should contain two values between 0 and 1 in an ascending manner.
Instead of returning the original data, returns three lists of indexes mapping to the indexes in the original data.
Splits the data indices into train, validation, and test sets.
One can use these lists of indices to subset the data afterwards.

args:
length_of_data: int
The length of the data.
split: list
The proportions for [train, validation, test] splits.
seed: float
The seed for reproducibility.
returns:
train: list
The indices for the training set.
validation: list
The indices for the validation set.
test: list
The indices for the test set.
"""
if split[0] >= split[1]:
raise ValueError("The split tuple should contain two values between 0 and 1 in an ascending manner.")
train, test, validation = [], [], []
for i in range(length_of_data):
r = np.random.rand()
if r < split[0]:
train.append(i)
elif r < split[1]:
test.append(i)
else:
validation.append(i)
return train, test, validation
if len(split) != 3:
raise ValueError("The split argument should be a list with length 3 that contains the proportions for [train, validation, test] splits.")
if not np.isclose(sum(split), 1.0):  # tolerate float rounding, e.g. 0.7 + 0.2 + 0.1
raise ValueError("The sum of the split proportions should be 1.")

# Generate a list of indices and shuffle it
indices = np.arange(length_of_data)
np.random.seed(seed)
np.random.shuffle(indices)

# Calculate the sizes of the train, validation, and test sets
train_size = int(split[0] * length_of_data)
validation_size = int(split[1] * length_of_data)

# Split the shuffled indices according to the calculated sizes
train = indices[:train_size].tolist()
validation = indices[train_size:train_size+validation_size].tolist()
test = indices[train_size+validation_size:].tolist()

return train, validation, test
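A quick worked check of the size arithmetic above, with illustrative values: for length_of_data=10 and split=[0.6, 0.2, 0.2], train_size = int(0.6 * 10) = 6 and validation_size = int(0.2 * 10) = 2, so the test set gets the remaining 2 indices (rounding remainders always fall into the test set):

splitter = RandomSplitter()
train, validation, test = splitter.get_split_indexes(10, [0.6, 0.2, 0.2], seed=0)
assert len(train) == 6 and len(validation) == 2 and len(test) == 2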
48 changes: 33 additions & 15 deletions bin/tests/test_csv.py
@@ -1,6 +1,8 @@
import json
import os
import unittest
import sys
sys.path.append('./')
from bin.src.data.csv import CsvProcessing, CsvLoader
from bin.src.data.experiments import DnaToFloatExperiment,ProtDnaToFloatExperiment
from bin.src.data.experiments import DnaToFloatExperiment, ProtDnaToFloatExperiment
@@ -9,12 +11,27 @@ class AbstractTestCsvProcessing(unittest.TestCase):
"""
Abstract class for testing CsvProcessing class.
"""
def _test_data_shape(self):
def _test_len(self):
"""
It tests that the data is loaded correctly, with the correct length.
"""
self.assertEqual(self.csv_processing.data.shape[0], self.data_shape[0])
self.assertEqual(self.csv_processing.data.shape[1], self.data_shape[1])
self.assertEqual(len(self.csv_processing.data), self.data_length)

def _add_split(self):
config = self.configs['split']
self.csv_processing.add_split(
split_method=config['name'],
split=config['params']['split'],
seed=config['params']['seed']
)

def _test_random_splitter(self, expected_splits):
"""
It tests that the data is split correctly.
"""
for i in range(self.data_length):
self.assertEqual(self.csv_processing.data['split:split:int'][i], expected_splits[i])

def _add_noise(self):
self.csv_processing.add_noise(self.configs['noise'])
@@ -27,9 +44,6 @@ def _test_value_from_column(self, column_name, expected_value, position=0):
if isinstance(observed_value, float):
observed_value = round(observed_value, 2)
self.assertEqual(observed_value, expected_value)

def _test_split(self):
pass


class TestDnaToFloatCsvProcessing(AbstractTestCsvProcessing):
@@ -42,14 +56,16 @@ def setUp(self):
self.csv_processing = CsvProcessing(self.experiment, self.csv_path)
with open('bin/tests/test_data/dna_experiment/test_config.json', 'rb') as f:
self.configs = json.load(f)
self.data_length = 2

def test_data_shape(self):
self.data_shape = [2,3]
self._test_data_shape()

def test_add_noise(self):
def test_len(self):
self._test_len()

def test_split_and_noise(self):
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATGC')
self._test_value_from_column('hola:label:float', 12)
self._add_split()
self._test_random_splitter([1, 0])
self._add_noise()
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATNN')
self._test_value_from_column('hola:label:float', 12.68)
@@ -66,15 +82,17 @@ def setUp(self):
self.csv_processing = CsvProcessing(self.experiment, self.csv_path)
with open('bin/tests/test_data/prot_dna_experiment/test_config.json', 'rb') as f:
self.configs = json.load(f)
self.data_length = 2

def test_load_csv(self):
self.data_shape = [2,4]
self._test_data_shape()
def test_len(self):
self._test_len()

def test_add_noise(self):
def test_split_and_noise(self):
self._test_value_from_column('bonjour:input:prot', 'GPRTTIKAKQLETLK')
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATGC')
self._test_value_from_column('hola:label:float', 12)
self._add_split()
self._test_random_splitter([1, 0])
self._add_noise()
self._test_value_from_column('bonjour:input:prot', 'GPRTTIKAKQLETLX')
self._test_value_from_column('hello:input:dna', 'ACTGACTGATCGATNN')
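Given the sys.path.append('./') added above, these tests are presumably meant to be run from the repository root, e.g. with python -m unittest bin/tests/test_csv.py (an assumption about the intended workflow; the runner invocation is not part of this diff).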
8 changes: 3 additions & 5 deletions bin/tests/test_data/dna_experiment/test_config.json
@@ -12,10 +12,8 @@
"params": {"mean": 0.5, "std": 0.1, "seed": 0}
}
],
"split": [
{
"split": {
"name": "RandomSplitter",
"params": {"split": [0.6, 0.8], "seed": 0}
}
]
"params": {"split": [0.5, 0.5, 0.0], "seed": 0}
}
}
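Note the shape change here: the split entry moves from a one-element list to a plain object, and its params change from cumulative boundaries ([0.6, 0.8], as consumed by the old threshold-based splitter) to explicit [train, validation, test] proportions ([0.5, 0.5, 0.0]) that must sum to 1, matching the new RandomSplitter signature.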
8 changes: 3 additions & 5 deletions bin/tests/test_data/prot_dna_experiment/test_config.json
@@ -17,10 +17,8 @@
"params": {"mean": 0.5, "std": 0.1, "seed": 0}
}
],
"split": [
{
"split": {
"name": "RandomSplitter",
"params": {"split": [0.6, 0.8], "seed": 0}
}
]
"params": {"split": [0.5, 0.5, 0], "seed": 0}
}
}
10 changes: 8 additions & 2 deletions examples/pipeline_generated.json
@@ -2,12 +2,16 @@
"experiment": "DnaToFloatExperiment",
"noise": [
{
"column_name": "inphello:input1:dnaut1",

"column_name": "hello:input:dna",

"name": "UniformTextMasker",
"params": {"probability": 0.1}
},
{
"column_name": "hello:input2:prot",

"column_name": "bonjour:input:prot",

"name": "UniformTextMasker",
"params": {"probability": 0.4}
},
@@ -17,6 +21,7 @@
"params": {"mean": 0.5, "std": 0.1}
}
],

"split":
{
"name": "RandomSplitter",
@@ -26,4 +31,5 @@


}

}