nf-core · mathysgrapotte · Mar 18, 2024 · Mar 12, 2024 · Mar 13, 2024 · Mar 13, 2024
diff --git a/bin/json_schema.py b/bin/json_schema.py
@@ -1,6 +1,7 @@
 
 from abc import ABC, abstractmethod
-from typing import Literal 
+from typing import Literal
+from itertools import product
 
 class JsonSchema(ABC):
     """
@@ -83,7 +84,143 @@ def _check_params_schema(self) -> int:
             return num_params_list[0]
         else:
             raise ValueError(f"Expected the same number of values for all the params under noise value, but received a discordant ammount instead.")
+
+
+    def _transform_noise_dict(self):
+        """
+        Helper: reshape self.noise_arg into a nested dict {column_name: {noiser_name: params}}; a noiser name already present for that column gets a '-#<index>' suffix.
+        """
+        noise_dict = {}
+        for col_name_dictionary in self.noise_arg:
+            # The name: field of a noise: entry can be either a simple string or a list of strings, so wrap it in a list if it is a string, otherwise leave it unchanged
+            noiser_list = [col_name_dictionary['name']] if isinstance(col_name_dictionary['name'], str) else col_name_dictionary['name']
+            # Now get the parameters (or set of parameters) associated with each noiser and store them in the nested dict under the given column_name
+            for k, noiser_name in enumerate(noiser_list):
+                # handle the fact that params can have "default" as value and not a list; otherwise the k-th params entry belongs to the k-th noiser
+                if col_name_dictionary['params'] == "default":
+                    params_to_be_added = "default"
+                else:
+                    params_to_be_added =  col_name_dictionary['params'][k]
+                # handle the case of multiple noisers with the same name associated to the same column_name, solution -> create a scheme to modify the name
+                if noise_dict.get(col_name_dictionary["column_name"]) and noiser_name in noise_dict.get(col_name_dictionary["column_name"]) :
+                    # Rename the current (duplicate) noiser by appending '-#' plus its position in the name list, so it does not overwrite the existing entry
+                    noiser_name = noiser_name + '-#' + str(k)
+                # store the params under the (possibly renamed) noiser name, creating the inner dict for this column_name on first use
+                noise_dict.setdefault(col_name_dictionary["column_name"], {})[noiser_name] = params_to_be_added
+        return noise_dict
+
+
+    def _generate_cartesian_product_combinations(self, d: dict) -> list:
+        """
+        Helper function for creating cartesian product combinations out of a dictionary.
+        Assuming d maps each key to an inner dict, returns a list with one dict per combination, each of the form {key: {inner_key: d[key][inner_key]}} with exactly one inner_key chosen per key.
+        """
+        keys = d.keys()
+        value_lists = d.values()
+
+        # Generate the Cartesian product of the inner keys (iterating each value of d yields its keys)
+        combinations = product(*value_lists)
+
+        # Create one dictionary for each combination, re-attaching the value stored under each selected inner key
+        result = []
+        for combination in combinations:
+            combined_dict = {}
+            for key, value in zip(keys, combination):
+                nested_dict = {value : d[key][value]}
+                combined_dict.update({key: nested_dict})
+            result.append(combined_dict)
+
+        return result
+
+
+
+    def _handle_parameter_selection(self, d: dict, param_index: int) -> dict:
+        """
+        Helper: build {'name': ..., 'params': ...} from the first item of d, taking the value at position param_index from each parameter's list ('default' params are passed through unchanged).
+        """
 
+        for key, param_dict in d.items():
+            # remove the suffix used to handle repeated noiser names for the same column_name (added in the _transform_noise_dict function); this line does nothing if the suffix is not present
+            key = key.split('-#')[0]
+            # handle "default" as params value
+            if param_dict == 'default':
+                return {"name" : key, "params" : param_dict}
+            else:
+                tmp_param_dict = {}
+                # iterate through the (possibly multiple) parameters and keep only the value selected by param_index
+                for param_name, param_value in param_dict.items():
+                    tmp_param_dict[param_name] = param_value[param_index]
+                return {"name": key, "params": tmp_param_dict}  
+
+
+
+    def noise_column_wise_combination(self) -> list:
+        """
+        Works on the self.noise_arg dictionary to compute all column wise combinations for the parameters and noise functions specified.
+        The combinations of noisers is all against all, except there can not be two noisers for the same column_name.
+        Combinations of noisers will always include exactly one noiser per column_name.
+        example for noisers ->
+
+        column_name : 1                                  column_name : 2
+        name : [noiser1, noiser2]                        name: [othernoiser]
+
+        combinations ->
+            noiser1 - othernoiser
+            noiser2 - othernoiser
+
+        This is how noiser functions are selected, but each of the above combinations is repeated once per parameter value index (self.number_culumn_wise_val times).
+        Again an example shows it better ->
+
+        column_name : 1                                                 column_name : 2
+        name : [noiser1, noiser2]                                       name: [othernoiser]
+        parameters : [{p1 : [1 ,2 ,3]}, {p1 : [1.5, 2.5, 3.5 ]}]        parameters : [{p1 : [4 ,5 ,6], p2 : [7, 8, 9]}]
+
+        combinations ->
+            noiser1 (p1 = 1) - othernoiser (p1 = 4, p2 = 7)
+            noiser1 (p1 = 2) - othernoiser (p1 = 5, p2 = 8)
+            noiser1 (p1 = 3) - othernoiser (p1 = 6, p2 = 9)
+            noiser2 (p1 = 1.5) - othernoiser (p1 = 4, p2 = 7)
+            noiser2 (p1 = 2.5) - othernoiser (p1 = 5, p2 = 8)
+            noiser2 (p1 = 3.5) - othernoiser (p1 = 6, p2 = 9)
+        """
+
+        # transform the noise entry into a nested dictionary, with structure {col_name: {noiser_name: {p1: [1, 2, 3]}}} (or "default" in place of the params dict)
+        noise_as_dict = self._transform_noise_dict()
+
+        # Create cartesian product of noiser names based on the above dictionary
+        noiser_combination_list = self._generate_cartesian_product_combinations(noise_as_dict)
+
+        # for each noiser combination create the column wise selection of parameters associated
+        all_noise_combination = []
+        for noise_combo in noiser_combination_list:
+            # select the parameters by iterating through the total number of parameter values for each column
+            for params_index in range(self.number_culumn_wise_val):
+                noise_list = []
+                for col_name, noise_dict in noise_combo.items():
+                    single_param_dict = self._handle_parameter_selection(noise_dict, params_index)
+                    # add the column_name field to this dictionary
+                    single_param_dict["column_name"] = col_name
+                    # reorder the entries by key alphabetically for readability
+                    sorted_dict = {key: single_param_dict[key] for key in sorted(single_param_dict)}
+                    noise_list.append(sorted_dict)
+                all_noise_combination.append({'noise' : noise_list })
+        return all_noise_combination
+
+
+
+    def noise_all_combination(self) -> list:
+        """
+        Meant to work on the self.noise_arg dictionary to compute all possible combinations of parameters and noisers in an all against all fashion. Not implemented yet: always raises ValueError.
+        """
+
+        # TODO implement this function
+        raise ValueError("the function noise_all_combination for the flag interpret_parmas_mode : all_combinations is not implemented yet ")
 
 
 
+    def split_combination(self) -> list:
+        """
+        TODO add description. NOTE(review): not implemented yet - as shown here the body is only this docstring, so the method returns None instead of the annotated list.
+        """
+
+        # TODO: iterate through the split entry and return a list of split possibilities, where each splitter_name has one parameter / one set of parameters
diff --git a/bin/launch_interpret_json.py b/bin/launch_interpret_json.py
@@ -7,7 +7,8 @@
 
 def get_args():
 
-    "get the arguments when using from the commandline"
+    """get the arguments when using from the commandline
+    TODO write help function description"""
 
     parser = argparse.ArgumentParser(description="")
     parser.add_argument("-j", "--json", type=str, required=True, metavar="FILE", help='The json config file that hold all parameter info')
@@ -30,11 +31,18 @@ def interpret_json(input_json: dict) -> list:
     #print("\nnoise_configurations :\n", schema.noise_arg, "\n", type(schema.noise_arg))
     #print("\nsplit_configurations :\n", schema.split_arg, "\n", type(schema.split_arg))
     #print("\ncustom_configurations :\n", schema.custom_arg, "\n", type(schema.custom_arg))
-    print(schema.number_culumn_wise_val)
+    #print(schema.number_culumn_wise_val)
     #print(schema.experiment, schema.interpret_params_mode, schema.column_names)
-
 
-
+    # compute all noise combinations
+    # first select the right function based on schema.interpret_params_mode, done with a dispatch dictionary rather than a chain of if statements
+    function_call_dict = {"culumn_wise": schema.noise_column_wise_combination, "all_combinations": schema.noise_all_combination}
+    list_noise_combinations = function_call_dict[schema.interpret_params_mode]()
+    print(list_noise_combinations, len(list_noise_combinations))
+
+    # compute all split combinations, this will only be all vs all because there is no concept of column_name
+    list_split_combinations = schema.split_combination()
+    print(list_split_combinations, len(list_split_combinations))
 
 
 def main(config_json: str) -> str:
@@ -44,7 +52,7 @@ def main(config_json: str) -> str:
     with open(config_json, 'r') as in_json:
         config = json.load(in_json)
 
-    # initialize the json scheme class 
+    # interpret the json
     interpret_json(config)
 
 

diff --git a/examples/pipeline_generated.json b/examples/pipeline_generated.json
@@ -2,25 +2,25 @@
     "experiment": "DnaToFloatExperiment",
     "noise": [
         {
-            "column_name": "input1",
+            "column_name": "inphello:input1:dnaut1",
             "name": "UniformTextMasker",
-            "params": [{"probability": [0.1]}]
+            "params": {"probability": 0.1}
         },
         {
-            "column_name": "input2",
+            "column_name": "hello:input2:prot",
             "name": "UniformTextMasker",
-            "params": [{"probability": [0.4]}]
+            "params": {"probability": 0.4}
         },
         {
-            "column_name": "label",
+            "column_name": "hola:label:float",
             "name": "GaussianNoise",
-            "params": [{"mean": [0.5], "std": [0.1]}]
+            "params": {"mean": 0.5, "std": 0.1}
         }
     ],
     "split": [
         {
             "name": "RandomSplitter",
-            "params": [{"split": [[0.6, 0.8]]}]
+            "params": {"split": [0.6, 0.8]}
         }
     ]
 }
diff --git a/examples/user_given.json b/examples/user_given.json
@@ -1,19 +1,19 @@
 {
-    "experiment": "DnaToFloatExperiment",
+    "experiment": "MyCustomExperiment",
     "interpret_parmas_mode": "culumn_wise", 
     "noise": [
         {
-            "column_name": "input1",
-            "name": ["UniformTextMasker", "AnotherNoiser", "YetAnotherNoiser"],
+            "column_name": "hello:input1:dna",
+            "name": ["UniformTextMasker", "AnotherNoiser", "AnotherNoiser"],
             "params": [{"probability": [0.1, 0.2, 0.3]}, {"probability": [0.11, 0.21, 0.31]}, {"probability": [0.12, 0.22, 0.32]}]
         },
         {
-            "column_name": "input2",
+            "column_name": "hello:input2:prot",
             "name": ["UniformTextMasker", "AnotherNoiser"],
-            "params": [{"probability": [0.4, 0.5, 0.6]}, {"probability": [0.1, 0.2, 0.3]}]
+            "params": "default"
         },
         {
-            "column_name": "label",
+            "column_name": "hola:label:float",
             "name": "GaussianNoise",
             "params": [{"mean": [0.5, 0.6, 0.7], "std": [0.1, 0.2, 0.3]}]
         }
@@ -22,6 +22,10 @@
         {
             "name": "RandomSplitter",
             "params": [{"split": [[0.6, 0.8], [0.7, 0.85]]}]
+        },
+        {
+            "name": "SomeSplitter",
+            "params": "default"
         }
     ],
     "custom": [